#304 - Index more tokens: words split by brackets/parentheses

This commit is contained in:
Simon Cambier
2023-11-01 13:22:06 +01:00
parent 285c4e9257
commit 8355489bcd
2 changed files with 17 additions and 2 deletions

View File

@@ -125,3 +125,4 @@ const separators =
.slice(1, -1)
export const SPACE_OR_PUNCTUATION_UNIQUE = new RegExp(`${separators}`, 'u')
export const SPACE_OR_PUNCTUATION = new RegExp(`${separators}+`, 'u')
export const BRACKETS_AND_SPACE = /[|\[\]\(\)<>\{\} \t\n\r]/u

View File

@@ -1,6 +1,11 @@
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
import {
BRACKETS_AND_SPACE,
chsRegex,
getChsSegmenter,
SPACE_OR_PUNCTUATION,
} from '../globals'
import { settings } from '../settings'
import {
chunkArray,
@@ -17,6 +22,8 @@ import { sortBy } from 'lodash-es'
import { getMatches, stringsToRegex } from 'src/tools/text-processing'
const tokenize = (text: string): string[] => {
const words = text.split(BRACKETS_AND_SPACE)
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
@@ -25,15 +32,22 @@ const tokenize = (text: string): string[] => {
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Add whole words (aka "not tokens")
tokens = [...tokens, ...words]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
if (chsSegmenter) {
tokens = tokens.flatMap(word =>
const chs = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
tokens = [...tokens, ...chs]
}
// Remove duplicates
tokens = [...new Set(tokens)]
return tokens
}