From d218b191f6888db1d6d7068a1c359b6191cb22d9 Mon Sep 17 00:00:00 2001
From: Simon Cambier
Date: Mon, 22 Jan 2024 21:39:07 +0100
Subject: [PATCH] Tokenize Chinese for search

---
 src/search/tokenizer.ts | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index f254324..3bc4825 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -35,8 +35,6 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
 
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
@@ -59,12 +57,22 @@
  */
 export function tokenizeForSearch(text: string): QueryCombination {
   const tokens = tokenizeTokens(text)
+
+  const chsSegmenter = getChsSegmenter()
+  let chs: string[] = []
+  if (chsSegmenter) {
+    chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+  }
+
   const query = {
     combineWith: 'OR',
     queries: [
       { combineWith: 'AND', queries: tokens },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: chs },
       { combineWith: 'AND', queries: tokenizeWords(text) },
     ],
   }
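
Note: below is a minimal sketch (not part of the patch) of the segmentation
step that tokenizeForSearch now performs before building its query. It
assumes a segmenter exposing cut(word): string[] (the shape the patch
expects from getChsSegmenter()) and a CJK-matching chsRegex; the regex
range, the stub segmenter, and the sample tokens are illustrative
assumptions, not the repository's actual implementation.

    // Hypothetical stand-in for the segmenter returned by getChsSegmenter().
    // A real segmenter would do dictionary-based word cutting; this stub
    // simply splits a CJK token into single characters.
    const chsRegex = /[\u4e00-\u9fff]/ // assumed CJK range, for illustration
    const segmenter = {
      cut: (word: string): string[] => [...word],
    }

    const tokens = ['搜索笔记', 'obsidian-plugin']
    const chs = tokens.flatMap(word =>
      chsRegex.test(word) ? segmenter.cut(word) : [word]
    )
    // chs -> ['搜', '索', '笔', '记', 'obsidian-plugin']
    // These segments feed the new { combineWith: 'AND', queries: chs }
    // sub-query, which is ORed with the other token variants so that
    // Chinese text can match even though it is not space-delimited.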