Change the order of tokenizing text. (#267)
A Chinese user may also have English notes, but the previous implementation could not handle hyphens and camelCase when the Chinese segmenter was enabled. This commit fixes that by changing the order in which tokens are generated.
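The diff below references splitHyphens and splitCamelCase without showing them. As a rough sketch only, helpers along these lines would produce the extra pieces the flatMap calls expect; the names match the diff, but the bodies here are assumptions, not the repository's actual implementations.

// Hypothetical sketches of the helpers used in the diff; the real code
// lives elsewhere in the repository and may differ.
// Each returns [] when there is nothing to split, so that
// `tokens.flatMap(splitX)` only contributes the additional pieces.
const splitHyphens = (token: string): string[] =>
  token.includes('-') ? token.split('-').filter(Boolean) : []

const splitCamelCase = (token: string): string[] => {
  const parts = token.split(/(?<=[a-z0-9])(?=[A-Z])/)
  return parts.length > 1 ? parts.map(part => part.toLowerCase()) : []
}

// splitHyphens('follow-up')   -> ['follow', 'up']
// splitCamelCase('camelCase') -> ['camel', 'case']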
@@ -25,6 +25,12 @@ import { sortBy } from 'lodash-es'
 const tokenize = (text: string): string[] => {
   let tokens = text.split(SPACE_OR_PUNCTUATION)
 
+  // Split hyphenated tokens
+  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
+
+  // Split camelCase tokens into "camel" and "case"
+  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+
   // When enabled, we only use the chsSegmenter,
   // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
@@ -32,12 +38,8 @@ const tokenize = (text: string): string[] => {
     tokens = tokens.flatMap(word =>
       chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
     )
   } else {
-    // Split camelCase tokens into "camel" and "case"
-    tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-    // Split hyphenated tokens
-    tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
   }
 
   return tokens
 }
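Taken together, the reordered flow looks roughly like the sketch below. It assumes the helper sketches above; SPACE_OR_PUNCTUATION, chsRegex, and the per-character segmenter stub are illustrative stand-ins, not the project's real definitions, and only the ordering mirrors the diff.

// Minimal sketch of the new ordering, using the helpers sketched above.
// SPACE_OR_PUNCTUATION and chsRegex are stand-ins for the project's own
// constants; the real getChsSegmenter() returns the project's segmenter,
// which this stub only crudely imitates by cutting per character.
const SPACE_OR_PUNCTUATION = /[\s,.;:!?()\[\]{}"']+/
const chsRegex = /[\u4e00-\u9fff]/
const chsSegmenter = { cut: (text: string): string[] => Array.from(text) }

const tokenize = (text: string): string[] => {
  let tokens = text.split(SPACE_OR_PUNCTUATION).filter(Boolean)

  // Hyphen and camelCase splitting now run before segmentation,
  // so they apply even when a Chinese segmenter is enabled.
  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]

  // CJK tokens are segmented; everything else passes through unchanged.
  return tokens.flatMap(word =>
    chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
  )
}

// tokenize('follow-up camelCase 中文笔记') keeps the original tokens and adds
// 'follow', 'up', 'camel', 'case', plus the segmented CJK pieces.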