Change the order of tokenizing text. (#267)

A Chinese user may also have English notes, but the previous implementation could not handle hyphens and camelCase.

This commit fixes that by changing the order in which tokens are generated.
This commit is contained in:
YuNing Chen
2023-08-02 15:59:30 +08:00
committed by GitHub
parent 9eeb8db841
commit 095e5f841d

View File

@@ -25,6 +25,12 @@ import { sortBy } from 'lodash-es'
const tokenize = (text: string): string[] => {
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
@@ -32,12 +38,8 @@ const tokenize = (text: string): string[] => {
tokens = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else {
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
}
return tokens
}