#304 - Index more tokens: words split by brackets/parentheses

This commit is contained in:
Simon Cambier
2023-11-01 13:22:06 +01:00
parent 285c4e9257
commit 8355489bcd
2 changed files with 17 additions and 2 deletions

View File

@@ -125,3 +125,4 @@ const separators =
.slice(1, -1)
export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u')
export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u')
export const BRACKETS_AND_SPACE = /[|\[\]\(\)<>\{\} \t\n\r]/u

View File

@@ -1,6 +1,11 @@
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
import {
BRACKETS_AND_SPACE,
chsRegex,
getChsSegmenter,
SPACE_OR_PUNCTUATION,
} from '../globals'
import { settings } from '../settings'
import {
chunkArray,
@@ -17,6 +22,8 @@ import { sortBy } from 'lodash-es'
import { getMatches, stringsToRegex } from 'src/tools/text-processing'
const tokenize = (text: string): string[] => {
const words = text.split(BRACKETS_AND_SPACE)
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
@@ -25,15 +32,22 @@ const tokenize = (text: string): string[] => {
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Add whole words (aka "not tokens")
tokens = [...tokens, ...words]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
if (chsSegmenter) {
tokens = tokens.flatMap(word =>
const chs = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
tokens = [...tokens, ...chs]
}
// Remove duplicates
tokens = [...new Set(tokens)]
return tokens
}