Change the order of tokenizing text. (#267)

A Chinese user may also have English notes, but the previous implementation could not handle hyphens and camelCase.

This commit fixes that by changing the order in which tokens are generated.
This commit is contained in:
YuNing Chen
2023-08-02 15:59:30 +08:00
committed by GitHub
parent 9eeb8db841
commit 095e5f841d

View File

@@ -25,6 +25,12 @@ import { sortBy } from 'lodash-es'
const tokenize = (text: string): string[] => {
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
@@ -32,12 +38,8 @@ const tokenize = (text: string): string[] => {
tokens = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else {
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
}
return tokens
}