Tokenize Chinese for search
@@ -35,8 +35,6 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
 
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
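For context, both hunks rely on a CJK-detection regex and an optional segmenter defined elsewhere in this file. The sketch below shows one plausible shape for those helpers; only the names chsRegex and getChsSegmenter and the cut() method are confirmed by the diff, everything else is an illustrative assumption:

// Rough CJK detection; the plugin's actual pattern may cover more
// ranges than the Unified Ideographs block used here (assumption).
const chsRegex = /[\u4e00-\u9fff]/

// The segmenter only needs a jieba-style cut() API, per the diff.
interface ChsSegmenter {
  cut(text: string): string[]
}

// Returns undefined when no Chinese segmenter is enabled, which is
// why both call sites guard with `if (chsSegmenter)`.
declare function getChsSegmenter(): ChsSegmenter | undefined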
@@ -59,12 +57,22 @@ export function tokenizeForIndexing(text: string): string[] {
  */
 export function tokenizeForSearch(text: string): QueryCombination {
   const tokens = tokenizeTokens(text)
 
+  const chsSegmenter = getChsSegmenter()
+  let chs: string[] = []
+  if (chsSegmenter) {
+    chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+  }
+
   const query = {
     combineWith: 'OR',
     queries: [
       { combineWith: 'AND', queries: tokens },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: chs },
       { combineWith: 'AND', queries: tokenizeWords(text) },
     ],
   }
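The QueryCombination returned here is the nested query shape accepted by MiniSearch: the top-level OR lets a note match through any single tokenization strategy, while each inner AND requires every token produced by that strategy. A minimal usage sketch, assuming a MiniSearch index (field names, documents, and the import path for tokenizeForSearch are illustrative):

import MiniSearch from 'minisearch'
// tokenizeForSearch is the function patched above; this path is assumed.
import { tokenizeForSearch } from './tokenizer'

// Illustrative single-field index; the real plugin indexes note content.
const index = new MiniSearch({ fields: ['content'] })
index.addAll([{ id: 1, content: '你好世界, plus some hello-world text' }])

// With a segmenter enabled, tokenizeForSearch('你好世界 hello-world') now
// includes an AND group of segmented CJK words alongside the existing
// hyphen- and camelCase-split groups, so Chinese queries can match
// segmented index tokens.
const results = index.search(tokenizeForSearch('你好世界 hello-world'))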