#225 - hyphenated words should now be indexed whole _and_ split

2023-06-17 18:08:39 +02:00
parent f368704383
commit 887dc669f3
3 changed files with 35 additions and 8 deletions
--- a/src/search/omnisearch.ts
+++ b/src/search/omnisearch.ts
@@ -12,6 +12,7 @@ import {
  logDebug,
  removeDiacritics,
  splitCamelCase,
+  splitHyphens,
  stringsToRegex,
  stripMarkdownCharacters,
  warnDebug,
@@ -22,17 +23,22 @@ import { cacheManager } from '../cache-manager'
 import { sortBy } from 'lodash-es'

 const tokenize = (text: string): string[] => {
-  const tokens = text.split(SPACE_OR_PUNCTUATION)
+  let tokens = text.split(SPACE_OR_PUNCTUATION)
+
+  // When a Chinese segmenter is available, we only use it,
+  // and not the other custom tokenizers (camelCase / hyphen splitting)
  const chsSegmenter = getChsSegmenter()
  if (chsSegmenter) {
-    return tokens.flatMap(word =>
+    tokens = tokens.flatMap(word =>
      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
    )
  } else {
-    if (settings.splitCamelCase)
-      return [...tokens, ...tokens.flatMap(splitCamelCase)]
-    return tokens
+    // Split camelCase tokens into "camel" and "case"
+    tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+    // Split hyphenated tokens
+    tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
  }
+  return tokens
 }

 export class Omnisearch {