From 095e5f841d2b5831cfdf7ab0e5154d5339ba6985 Mon Sep 17 00:00:00 2001 From: YuNing Chen Date: Wed, 2 Aug 2023 15:59:30 +0800 Subject: [PATCH] Change the order of tokenizing text. (#267) A Chinese user may also have English notes, but the previous implementation could not handle hyphens and camelCase in those notes. This commit fixes that by changing the order in which tokens are generated: the hyphen and camelCase splits now run before the Chinese segmenter instead of only when it is disabled. --- src/search/omnisearch.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/search/omnisearch.ts b/src/search/omnisearch.ts index a5de141..a91a684 100644 --- a/src/search/omnisearch.ts +++ b/src/search/omnisearch.ts @@ -25,6 +25,12 @@ import { sortBy } from 'lodash-es' const tokenize = (text: string): string[] => { let tokens = text.split(SPACE_OR_PUNCTUATION) + // Split hyphenated tokens + tokens = [...tokens, ...tokens.flatMap(splitHyphens)] + + // Split camelCase tokens into "camel" and "case + tokens = [...tokens, ...tokens.flatMap(splitCamelCase)] + // When enabled, we only use the chsSegmenter, // and not the other custom tokenizers const chsSegmenter = getChsSegmenter() @@ -32,12 +38,8 @@ const tokenize = (text: string): string[] => { tokens = tokens.flatMap(word => chsRegex.test(word) ? chsSegmenter.cut(word) : [word] ) - } else { - // Split camelCase tokens into "camel" and "case - tokens = [...tokens, ...tokens.flatMap(splitCamelCase)] - // Split hyphenated tokens - tokens = [...tokens, ...tokens.flatMap(splitHyphens)] } + return tokens }