From d218b191f6888db1d6d7068a1c359b6191cb22d9 Mon Sep 17 00:00:00 2001
From: Simon Cambier
Date: Mon, 22 Jan 2024 21:39:07 +0100
Subject: [PATCH] Tokenize Chinese for search

---
 src/search/tokenizer.ts | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index f254324..3bc4825 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -35,8 +35,6 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
 
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
@@ -59,12 +57,22 @@
  */
 export function tokenizeForSearch(text: string): QueryCombination {
   const tokens = tokenizeTokens(text)
+
+  const chsSegmenter = getChsSegmenter()
+  let chs: string[] = []
+  if (chsSegmenter) {
+    chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+  }
+
   const query = {
     combineWith: 'OR',
     queries: [
       { combineWith: 'AND', queries: tokens },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: chs },
       { combineWith: 'AND', queries: tokenizeWords(text) },
     ],
   }
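
Note: below is a minimal sketch (not part of the patch) of the segmentation
step that tokenizeForSearch now performs before building its query. It
assumes a segmenter exposing cut(word): string[] (the shape the patch
expects from getChsSegmenter()) and a CJK-matching chsRegex; the regex
range, the stub segmenter, and the sample tokens are illustrative
assumptions, not the repository's actual implementation.

    // Hypothetical stand-in for the segmenter returned by getChsSegmenter().
    // A real segmenter would do dictionary-based word cutting; this stub
    // simply splits a CJK token into single characters.
    const chsRegex = /[\u4e00-\u9fff]/ // assumed CJK range, for illustration
    const segmenter = {
      cut: (word: string): string[] => [...word],
    }

    const tokens = ['搜索笔记', 'obsidian-plugin']
    const chs = tokens.flatMap(word =>
      chsRegex.test(word) ? segmenter.cut(word) : [word]
    )
    // chs -> ['搜', '索', '笔', '记', 'obsidian-plugin']
    // These segments feed the new { combineWith: 'AND', queries: chs }
    // sub-query, which is ORed with the other token variants so that
    // Chinese text can match even though it is not space-delimited.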