From c4211a0b0990dac4762424daa98dc58183410c2c Mon Sep 17 00:00:00 2001
From: Aiden Liu <31102694+aidenlx@users.noreply.github.com>
Date: Tue, 7 May 2024 23:58:01 +0800
Subject: [PATCH] feat: enable search optimized tokenizer for Chinese (#370)

* feat: enable search optimized tokenizer for Chinese

* docs: update readme on CJK support
---
 README.md               |  2 +-
 src/search/tokenizer.ts | 39 +++++++++++++++++----------------------
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index e58b150..25f7c00 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ You can check the [CHANGELOG](./CHANGELOG.md) for more information on the differ
 - Directly Insert a `[[link]]` from the search results
 - Supports Vim navigation keys
 
-**Note:** support of Chinese, Japanese, Korean, etc. depends
+**Note:** support of Chinese depends
 on [this additional plugin](https://github.com/aidenlx/cm-chs-patch).
 Please read its documentation for more information.
 
diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index 3eb8ecc..bd5fb50 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -9,12 +9,24 @@ import { settings } from 'src/settings'
 import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
 const markdownLinkExtractor = require('markdown-link-extractor')
 
-function tokenizeWords(text: string): string[] {
-  return text.split(BRACKETS_AND_SPACE)
+function tokenizeWords(text: string, { skipChs = false } = {}): string[] {
+  const tokens = text.split(BRACKETS_AND_SPACE)
+  if (skipChs) return tokens
+  return tokenizeChsWord(tokens)
 }
 
-function tokenizeTokens(text: string): string[] {
-  return text.split(SPACE_OR_PUNCTUATION)
+function tokenizeTokens(text: string, { skipChs = false } = {}): string[] {
+  const tokens = text.split(SPACE_OR_PUNCTUATION)
+  if (skipChs) return tokens
+  return tokenizeChsWord(tokens)
+}
+
+function tokenizeChsWord(tokens: string[]): string[] {
+  const segmenter = getChsSegmenter()
+  if (!segmenter) return tokens
+  return tokens.flatMap(word =>
+    chsRegex.test(word) ? segmenter.cut(word, { search: true }) : [word]
+  )
 }
 
 /**
@@ -34,7 +46,7 @@ export function tokenizeForIndexing(text: string): string[] {
     }
   }
 
-  let tokens = tokenizeTokens(text)
+  let tokens = tokenizeTokens(text, { skipChs: true })
 
   // Split hyphenated tokens
   tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
@@ -50,14 +62,6 @@
     tokens = [...tokens, ...urls]
   }
 
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    const chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-    tokens = [...tokens, ...chs]
-  }
-
   // Remove duplicates
   tokens = [...new Set(tokens)]
 
@@ -77,14 +81,6 @@ export function tokenizeForSearch(text: string): QueryCombination {
 
   const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean)
 
-  let chs: string[] = []
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-  }
-
   return {
     combineWith: 'OR',
     queries: [
@@ -92,7 +88,6 @@
       { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
-      { combineWith: 'AND', queries: chs },
     ],
   }
 }
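
For illustration, here is a minimal standalone sketch of the dispatch pattern this patch introduces: split on spaces/punctuation first, then route only the tokens that contain Chinese characters through an optional segmenter. All names below (`Segmenter`, `SPACE_OR_PUNCT`, `CHS_REGEX`, `tokenize`) are stand-ins invented for the example, not the plugin's real API; only the `cut(word, { search: true })` call shape mirrors what the patch actually does.

```typescript
// Sketch only — illustrative stand-ins, not omnisearch's real module.

type Segmenter = {
  // Mirrors `segmenter.cut(word, { search: true })` from the patch;
  // `search: true` requests finer-grained, search-friendly segments.
  cut(text: string, opts?: { search?: boolean }): string[]
}

// Stand-ins for SPACE_OR_PUNCTUATION and chsRegex in the real file.
const SPACE_OR_PUNCT = /[\s\p{P}]+/u
const CHS_REGEX = /[\u4e00-\u9fff]/ // rough "contains a CJK ideograph" test

// Stand-in for getChsSegmenter(): undefined when the companion
// cm-chs-patch plugin is not installed, so segmentation is skipped.
let segmenter: Segmenter | undefined

function tokenize(text: string, { skipChs = false } = {}): string[] {
  const tokens = text.split(SPACE_OR_PUNCT).filter(Boolean)
  if (skipChs || !segmenter) return tokens
  // Only words that actually contain Chinese go through the segmenter;
  // everything else passes through unchanged.
  return tokens.flatMap(word =>
    CHS_REGEX.test(word) ? segmenter!.cut(word, { search: true }) : [word]
  )
}
```

Search-mode segmentation (in the style of jieba's cut-for-search) typically emits extra, shorter overlapping segments, which is presumably why the patch prefers it here: a short query can then match any sub-word of a longer Chinese compound rather than requiring the full phrase.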