Tokenize Chinese for search

Simon Cambier
2024-01-22 21:39:07 +01:00
parent a391daf2d3
commit d218b191f6

@@ -35,8 +35,6 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
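
For context, this indexing path only segments tokens that actually contain Chinese characters; everything else passes through untouched. A minimal sketch of that flatMap pattern, assuming chsRegex matches CJK ideographs and the segmenter exposes a jieba-style cut() (the helper name segmentChinese is hypothetical, not from the plugin):

// Assumed definition: a token "is Chinese" if it contains any CJK ideograph.
const chsRegex = /[\u4e00-\u9fff]/

// Hypothetical helper mirroring the flatMap in the diff: segment tokens that
// contain Chinese characters, pass the rest through unchanged.
function segmentChinese(tokens: string[], cut: (s: string) => string[]): string[] {
  return tokens.flatMap(word => (chsRegex.test(word) ? cut(word) : [word]))
}

// segmentChinese(['hello', '你好世界'], jiebaCut)
// → ['hello', '你好', '世界']   (with a jieba-style segmenter)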
@@ -59,12 +57,22 @@ export function tokenizeForIndexing(text: string): string[] {
*/ */
export function tokenizeForSearch(text: string): QueryCombination { export function tokenizeForSearch(text: string): QueryCombination {
const tokens = tokenizeTokens(text) const tokens = tokenizeTokens(text)
const chsSegmenter = getChsSegmenter()
let chs: string[] = []
if (chsSegmenter) {
chs = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
}
const query = { const query = {
combineWith: 'OR', combineWith: 'OR',
queries: [ queries: [
{ combineWith: 'AND', queries: tokens }, { combineWith: 'AND', queries: tokens },
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) }, { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) }, { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
{ combineWith: 'AND', queries: chs },
{ combineWith: 'AND', queries: tokenizeWords(text) }, { combineWith: 'AND', queries: tokenizeWords(text) },
], ],
} }
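
With this change, tokenizeForSearch mirrors the indexing path: the segmented Chinese tokens form one more AND group under the top-level OR, so a query matches whenever any single tokenization of it matches in full. A runnable sketch of how MiniSearch consumes such an OR-of-ANDs QueryCombination (the documents and field name are made up for illustration, and the pre-segmented content stands in for the plugin's indexing tokenizer):

import MiniSearch from 'minisearch'

const ms = new MiniSearch({ fields: ['content'] })
ms.addAll([
  { id: 1, content: '你好 世界' },     // pre-segmented Chinese, for the example
  { id: 2, content: 'hello world' },
])

// The top-level OR means satisfying any one AND group is enough.
const results = ms.search({
  combineWith: 'OR',
  queries: [
    { combineWith: 'AND', queries: ['你好', '世界'] },   // segmented query terms
    { combineWith: 'AND', queries: ['hello', 'world'] },
  ],
})
// Both documents are returned: each satisfies one of the AND groups.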