perf: combine token split into 1 pass (#427)

This commit is contained in:
YuNing Chen
2025-01-03 19:27:19 +08:00
committed by GitHub
parent c4fa419aa0
commit 0ebe903bb9

View File

@@ -27,15 +27,11 @@ export class Tokenizer {
 }
 let tokens = this.tokenizeTokens(text, { skipChs: true })
-// Split hyphenated tokens
-tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
-// Split camelCase tokens into "camel" and "case
-tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-// Add whole words (aka "not tokens")
-tokens = [...tokens, ...words]
+tokens = [...tokens.flatMap(token => [
+  token,
+  ...splitHyphens(token),
+  ...splitCamelCase(token),
+]), ...words]
 // Add urls
 if (urls.length) {