#304 - Index more tokens, words split by brackets/parentheses
This commit is contained in:
@@ -125,3 +125,4 @@ const separators =
|
|||||||
.slice(1, -1)
|
.slice(1, -1)
|
||||||
export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u')
|
export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u')
|
||||||
export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u')
|
export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u')
|
||||||
|
export const BRACKETS_AND_SPACE = /[|\[\]\(\)<>\{\} \t\n\r]/u
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
|
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
|
||||||
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
|
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
|
||||||
import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
|
import {
|
||||||
|
BRACKETS_AND_SPACE,
|
||||||
|
chsRegex,
|
||||||
|
getChsSegmenter,
|
||||||
|
SPACE_OR_PUNCTUATION,
|
||||||
|
} from '../globals'
|
||||||
import { settings } from '../settings'
|
import { settings } from '../settings'
|
||||||
import {
|
import {
|
||||||
chunkArray,
|
chunkArray,
|
||||||
@@ -17,6 +22,8 @@ import { sortBy } from 'lodash-es'
|
|||||||
import { getMatches, stringsToRegex } from 'src/tools/text-processing'
|
import { getMatches, stringsToRegex } from 'src/tools/text-processing'
|
||||||
|
|
||||||
const tokenize = (text: string): string[] => {
|
const tokenize = (text: string): string[] => {
|
||||||
|
const words = text.split(BRACKETS_AND_SPACE)
|
||||||
|
|
||||||
let tokens = text.split(SPACE_OR_PUNCTUATION)
|
let tokens = text.split(SPACE_OR_PUNCTUATION)
|
||||||
|
|
||||||
// Split hyphenated tokens
|
// Split hyphenated tokens
|
||||||
@@ -25,15 +32,22 @@ const tokenize = (text: string): string[] => {
|
|||||||
// Split camelCase tokens into "camel" and "case"
|
// Split camelCase tokens into "camel" and "case"
|
||||||
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
|
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
|
||||||
|
|
||||||
|
// Add whole words (aka "not tokens")
|
||||||
|
tokens = [...tokens, ...words]
|
||||||
|
|
||||||
// When enabled, we only use the chsSegmenter,
|
// When enabled, we only use the chsSegmenter,
|
||||||
// and not the other custom tokenizers
|
// and not the other custom tokenizers
|
||||||
const chsSegmenter = getChsSegmenter()
|
const chsSegmenter = getChsSegmenter()
|
||||||
if (chsSegmenter) {
|
if (chsSegmenter) {
|
||||||
tokens = tokens.flatMap(word =>
|
const chs = tokens.flatMap(word =>
|
||||||
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
|
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
|
||||||
)
|
)
|
||||||
|
tokens = [...tokens, ...chs]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove duplicates
|
||||||
|
tokens = [...new Set(tokens)]
|
||||||
|
|
||||||
return tokens
|
return tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user