diff --git a/src/globals.ts b/src/globals.ts
index f12945f..986c7c0 100644
--- a/src/globals.ts
+++ b/src/globals.ts
@@ -125,3 +125,4 @@ const separators =
   .slice(1, -1)
 export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u')
 export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u')
+export const BRACKETS_AND_SPACE = /[|\[\]\(\)<>\{\} \t\n\r]/u
diff --git a/src/search/omnisearch.ts b/src/search/omnisearch.ts
index 6adf19b..4e3f141 100644
--- a/src/search/omnisearch.ts
+++ b/src/search/omnisearch.ts
@@ -1,6 +1,11 @@
 import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
-import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
+import {
+  BRACKETS_AND_SPACE,
+  chsRegex,
+  getChsSegmenter,
+  SPACE_OR_PUNCTUATION,
+} from '../globals'
 import { settings } from '../settings'
 import {
   chunkArray,
@@ -17,6 +22,8 @@ import { sortBy } from 'lodash-es'
 import { getMatches, stringsToRegex } from 'src/tools/text-processing'
 
 const tokenize = (text: string): string[] => {
+  const words = text.split(BRACKETS_AND_SPACE)
+
   let tokens = text.split(SPACE_OR_PUNCTUATION)
 
   // Split hyphenated tokens
@@ -25,15 +32,22 @@ const tokenize = (text: string): string[] => {
   // Split camelCase tokens into "camel" and "case"
   tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
 
+  // Add whole words (aka "not tokens")
+  tokens = [...tokens, ...words]
+
   // When enabled, we only use the chsSegmenter,
   // and not the other custom tokenizers
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
-    tokens = tokens.flatMap(word =>
+    const chs = tokens.flatMap(word =>
       chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
     )
+    tokens = [...tokens, ...chs]
   }
 
+  // Remove duplicates
+  tokens = [...new Set(tokens)]
+
   return tokens
 }
 
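
To illustrate why `SPACE_OR_PUNCTUATION` alone isn't enough here: it breaks words on every punctuation mark, while the new `BRACKETS_AND_SPACE` only strips link/markup delimiters. A minimal sketch (the `SPACE_OR_PUNCTUATION` literal below is a stand-in for the regex generated in `globals.ts`, not its exact value):

```ts
const BRACKETS_AND_SPACE = /[|\[\]\(\)<>\{\} \t\n\r]/u
// Assumption: approximates the generated SPACE_OR_PUNCTUATION regex
const SPACE_OR_PUNCTUATION = /[\p{Z}\p{P}]+/u

// Punctuation-based splitting loses the dotted/hyphenated name...
'[[my-note.md]]'.split(SPACE_OR_PUNCTUATION)
// -> ['', 'my', 'note', 'md', '']

// ...while bracket/space splitting keeps it intact as a "whole word"
'[[my-note.md]]'.split(BRACKETS_AND_SPACE).filter(w => w.length > 0)
// -> ['my-note.md']
```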
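And a condensed sketch of the resulting token flow (hyphen/camelCase splitting and the Chinese segmenter omitted for brevity; `tokenizeSketch` is a hypothetical name, same regex assumptions as above):

```ts
const tokenizeSketch = (text: string): string[] => {
  const words = text.split(BRACKETS_AND_SPACE)  // whole words
  let tokens = text.split(SPACE_OR_PUNCTUATION) // fine-grained tokens
  tokens = [...tokens, ...words]                // index both granularities
  return [...new Set(tokens)]                   // overlap collapses here
}

tokenizeSketch('link to [[my-note]]')
// -> ['link', 'to', 'my', 'note', '', 'my-note']
```

The empty strings left over from the splits are falsy, so they should be discarded at MiniSearch's `processTerm` step rather than indexed.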