This commit is contained in:
Simon Cambier
2023-08-22 07:53:07 +02:00
5 changed files with 20 additions and 17 deletions

View File

@@ -25,6 +25,12 @@ import { sortBy } from 'lodash-es'
const tokenize = (text: string): string[] => {
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
@@ -32,12 +38,8 @@ const tokenize = (text: string): string[] => {
tokens = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else {
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
}
return tokens
}