feat: enable search optimized tokenizer for Chinese (#370)
* feat: enable search optimized tokenizer for Chinese
* docs: update readme on CJK support
@@ -52,7 +52,7 @@ You can check the [CHANGELOG](./CHANGELOG.md) for more information on the differ
 - Directly Insert a `[[link]]` from the search results
 - Supports Vim navigation keys
 
-**Note:** support of Chinese, Japanese, Korean, etc. depends
+**Note:** support of Chinese depends
 on [this additional plugin](https://github.com/aidenlx/cm-chs-patch). Please read its documentation for more
 information.
 
@@ -9,12 +9,24 @@ import { settings } from 'src/settings'
 import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
 const markdownLinkExtractor = require('markdown-link-extractor')
 
-function tokenizeWords(text: string): string[] {
-  return text.split(BRACKETS_AND_SPACE)
+function tokenizeWords(text: string, { skipChs = false } = {}): string[] {
+  const tokens = text.split(BRACKETS_AND_SPACE)
+  if (skipChs) return tokens
+  return tokenizeChsWord(tokens)
 }
 
-function tokenizeTokens(text: string): string[] {
-  return text.split(SPACE_OR_PUNCTUATION)
+function tokenizeTokens(text: string, { skipChs = false } = {}): string[] {
+  const tokens = text.split(SPACE_OR_PUNCTUATION)
+  if (skipChs) return tokens
+  return tokenizeChsWord(tokens)
+}
+
+function tokenizeChsWord(tokens: string[]): string[] {
+  const segmenter = getChsSegmenter()
+  if (!segmenter) return tokens
+  return tokens.flatMap(word =>
+    chsRegex.test(word) ? segmenter.cut(word, { search: true }) : [word]
+  )
 }
 
 /**
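The new `tokenizeChsWord` helper centralizes what used to be two copies of the segmentation loop (removed further down), and its `cut` call now passes `{ search: true }`. In jieba-style segmenters this selects the search-engine mode, which emits overlapping sub-words of long phrases on top of the exact segmentation, so short queries can still match them. A stub illustration of the difference; the real segmenter is provided at runtime by cm-chs-patch, and the outputs below mimic jieba's documented behavior rather than the plugin's exact API:

```ts
// Stubbed jieba-style segmenter: exact mode vs. search mode on a classic
// example phrase. Real cutting is dictionary-driven; these arrays just mimic it.
const segmenter = {
  cut(text: string, opts?: { search?: boolean }): string[] {
    if (text !== '中华人民共和国') return [text]
    return opts?.search
      ? ['中华', '华人', '人民', '共和', '共和国', '中华人民共和国'] // sub-words + full phrase
      : ['中华人民共和国'] // exact mode keeps the known phrase whole
  },
}

console.log(segmenter.cut('中华人民共和国'))                   // 1 token
console.log(segmenter.cut('中华人民共和国', { search: true })) // 6 tokens: short queries now match
```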
@@ -34,7 +46,7 @@ export function tokenizeForIndexing(text: string): string[] {
     }
   }
 
-  let tokens = tokenizeTokens(text)
+  let tokens = tokenizeTokens(text, { skipChs: true })
 
   // Split hyphenated tokens
   tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
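The only call-site change needed here is the new options argument. The `{ skipChs = false } = {}` signature introduced above makes that argument itself optional with a defaulted flag, so every other caller compiles unchanged. A standalone demo of the idiom (not plugin code):

```ts
// Default-destructured options: the object defaults to {}, and the flag
// defaults to false inside it, so all three call styles are legal.
function tokenize(text: string, { skipChs = false } = {}): string {
  return `${text} (skipChs=${skipChs})`
}

console.log(tokenize('note'))                    // note (skipChs=false)
console.log(tokenize('note', {}))                // note (skipChs=false)
console.log(tokenize('note', { skipChs: true })) // note (skipChs=true)
```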
@@ -50,14 +62,6 @@ export function tokenizeForIndexing(text: string): string[] {
     tokens = [...tokens, ...urls]
   }
 
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    const chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-    tokens = [...tokens, ...chs]
-  }
-
   // Remove duplicates
   tokens = [...new Set(tokens)]
 
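This deleted block was the old bolted-on Chinese pass for indexing: it re-cut every token and appended the result, which duplicated all non-CJK tokens until the `Set` below cleaned them up, and its plain `cut(word)` only worked because the originals were kept alongside. The new in-place pass avoids the duplication and relies on search mode to keep the full phrase among its cuts (assuming jieba-style cut-for-search behavior). A toy comparison with an invented two-character cutter:

```ts
// Toy segmenter: cuts a CJK phrase into two halves (real cutting is dictionary-based).
const isChs = (w: string) => /[\u4e00-\u9fa5]/.test(w)
const cut = (w: string) => [w.slice(0, 2), w.slice(2)]
const cutForSearch = (w: string) => [...cut(w), w] // search mode keeps the full phrase

const tokens = ['hello', '中文分词']

// Old approach: append a second pass; non-CJK tokens come back a second time.
const chs = tokens.flatMap(w => (isChs(w) ? cut(w) : [w]))
console.log([...tokens, ...chs]) // ['hello', '中文分词', 'hello', '中文', '分词']

// New approach: one in-place pass, no duplicates, full phrase preserved.
console.log(tokens.flatMap(w => (isChs(w) ? cutForSearch(w) : [w])))
// -> ['hello', '中文', '分词', '中文分词']
```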
@@ -77,14 +81,6 @@ export function tokenizeForSearch(text: string): QueryCombination {
 
   const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean)
 
-  let chs: string[] = []
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-  }
-
   return {
     combineWith: 'OR',
     queries: [
@@ -92,7 +88,6 @@ export function tokenizeForSearch(text: string): QueryCombination {
       { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
-      { combineWith: 'AND', queries: chs },
     ],
   }
 }
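At search time the tokenizers run without `skipChs`, so when the cm-chs-patch segmenter is installed their output already carries the search-optimized cuts into the remaining `AND` groups; the dedicated `chs` group could only repeat those tokens inside the same `OR`. Roughly the shape `tokenizeForSearch` now returns for a short mixed query, with invented token values and assuming the segmenter is present:

```ts
// QueryCombination sketch for the query "notes 中华人民共和国" (illustrative tokens;
// real values come from tokenizeTokens/tokenizeWords plus the segmenter).
const combination = {
  combineWith: 'OR',
  queries: [
    { combineWith: 'AND', queries: ['notes', '中华', '人民', '共和国', '中华人民共和国'] },
    // ...plus the hyphen- and camelCase-split variants shown in the diff,
    // but no longer a separate { combineWith: 'AND', queries: chs } group.
  ],
}
console.log(JSON.stringify(combination, null, 2))
```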