Tokenize Chinese for search

Author: Simon Cambier
Date: 2024-01-22 21:39:07 +01:00
Parent: a391daf2d3
Commit: d218b191f6


@@ -35,8 +35,6 @@ export function tokenizeForIndexing(text: string): string[] {
  // Add whole words (aka "not tokens")
  tokens = [...tokens, ...words]
  // When enabled, we only use the chsSegmenter,
  // and not the other custom tokenizers
  const chsSegmenter = getChsSegmenter()
  if (chsSegmenter) {
    const chs = tokens.flatMap(word =>
@@ -59,12 +57,22 @@ export function tokenizeForIndexing(text: string): string[] {
 */
export function tokenizeForSearch(text: string): QueryCombination {
  const tokens = tokenizeTokens(text)
  const chsSegmenter = getChsSegmenter()
  let chs: string[] = []
  if (chsSegmenter) {
    chs = tokens.flatMap(word =>
      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
    )
  }
  const query = {
    combineWith: 'OR',
    queries: [
      { combineWith: 'AND', queries: tokens },
      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
      { combineWith: 'AND', queries: chs },
      { combineWith: 'AND', queries: tokenizeWords(text) },
    ],
  }
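
For context, the reshaped query is a MiniSearch QueryCombination: each sub-query is AND-combined, and the sub-queries themselves are OR-combined, so the Chinese word segments only need to match together within their own branch rather than alongside the raw tokens. Below is a minimal sketch of the same pattern outside the plugin. It is illustrative, not the plugin's implementation: the built-in Intl.Segmenter stands in for whatever getChsSegmenter() returns (the diff only relies on its cut() method), and cjkRegex, buildSearchQuery, and the MiniSearch setup are made-up names for the example.

import MiniSearch, { type QueryCombination } from 'minisearch'

// Stand-in for the plugin's external segmenter and its cut() method:
// Intl.Segmenter with 'word' granularity splits Chinese text into words.
const zhSegmenter = new Intl.Segmenter('zh', { granularity: 'word' })
const cutChinese = (word: string): string[] =>
  [...zhSegmenter.segment(word)].filter(s => s.isWordLike).map(s => s.segment)

// Rough CJK detection; the plugin's own chsRegex may differ.
const cjkRegex = /[\u4e00-\u9fff]/

function buildSearchQuery(tokens: string[]): QueryCombination {
  // Segment tokens that contain Chinese characters, keep the rest as-is.
  const chs = tokens.flatMap(word =>
    cjkRegex.test(word) ? cutChinese(word) : [word]
  )
  // Each branch must match all of its own terms (AND),
  // but matching any single branch is enough (OR).
  return {
    combineWith: 'OR',
    queries: [
      { combineWith: 'AND', queries: tokens },
      { combineWith: 'AND', queries: chs },
    ],
  }
}

// The combination object can be passed directly to MiniSearch's search().
const index = new MiniSearch({ fields: ['content'] })
index.add({ id: 1, content: '全文搜索很方便' })
console.log(index.search(buildSearchQuery(['全文搜索很方便'])))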