Highlighting rework, should help with #304
@@ -190,6 +190,8 @@ export class Omnisearch {
         headings3: settings.weightH3,
         unmarkedTags: settings.weightUnmarkedTags,
       },
+      // The query is already tokenized, don't tokenize again
+      tokenize: text => [text],
     })
 
     logDebug('Found', results.length, 'results')
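The key addition here is the search-time tokenizer: the query handed to MiniSearch has already been tokenized by tokenizeForSearch(), so tokenize: text => [text] passes each term through verbatim instead of splitting it a second time. A minimal sketch of that kind of call, assuming the MiniSearch API this code appears to target, with made-up field names and weights rather than the plugin's actual setup:

import MiniSearch from 'minisearch'

// Minimal sketch, not the plugin's actual index setup.
const minisearch = new MiniSearch({
  fields: ['basename', 'content', 'headings3', 'unmarkedTags'],
})

const results = minisearch.search(
  // The query is already a list of tokens produced upstream.
  { combineWith: 'OR', queries: ['foo', 'bar'] },
  {
    boost: { headings3: 1.1, unmarkedTags: 1.1 }, // assumed weights
    // Identity tokenizer: each pre-tokenized term passes through unchanged.
    tokenize: text => [text],
  }
)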
@@ -404,7 +406,7 @@ export class Omnisearch {
 
       // Tags, starting with #
       ...query.getTags(),
-    ].filter(w => w.length > 1 || /\p{Emoji}/u.test(w))
+    ]
     logDebug('Matching tokens:', foundWords)
 
     logDebug('Getting matches locations...')
@@ -6,6 +6,7 @@ import {
   getChsSegmenter,
 } from 'src/globals'
 import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+const markdownLinkExtractor = require('markdown-link-extractor')
 
 function tokenizeWords(text: string): string[] {
   return text.split(BRACKETS_AND_SPACE)
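The new markdown-link-extractor dependency is what feeds the URL handling below: it takes a note's raw markdown and returns the link targets it finds. A rough usage sketch, assuming the 2.x API that returns a plain string array (which is what the string[] typing in this diff suggests; later major versions wrap the result in an object):

const markdownLinkExtractor = require('markdown-link-extractor')

// Pull every link target out of a piece of markdown so full URLs can later
// be indexed and searched as single tokens.
const urls: string[] = markdownLinkExtractor(
  'See [the docs](https://example.com/docs) and [home](https://example.org/).'
)
// urls should contain 'https://example.com/docs' and 'https://example.org/'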
@@ -23,6 +24,7 @@ function tokenizeTokens(text: string): string[] {
  */
 export function tokenizeForIndexing(text: string): string[] {
   const words = tokenizeWords(text)
+  const urls: string[] = markdownLinkExtractor(text)
 
   let tokens = tokenizeTokens(text)
 
@@ -35,6 +37,11 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
 
+  // Add urls
+  if (urls.length) {
+    tokens = [...tokens, ...urls]
+  }
+
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
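Appending the extracted URLs after the word and token splits presumably keeps each link searchable as one intact term, since a punctuation-based token split would otherwise shred it into fragments. A small sketch of that idea, using a stand-in regex rather than the plugin's actual token pattern:

// Stand-in split, not the real tokenizeTokens(): punctuation shreds a URL...
const url = 'https://example.com/some-page'
const shredded = url.split(/[^\p{L}\p{N}]+/u).filter(Boolean)
// shredded => ['https', 'example', 'com', 'some', 'page']

// ...so the whole URL is also kept as a token of its own, as the hunk above does.
const tokens = [...shredded, url]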
@@ -56,7 +63,12 @@ export function tokenizeForSearch(text: string): QueryCombination {
  * @returns
  */
 export function tokenizeForSearch(text: string): QueryCombination {
-  const tokens = tokenizeTokens(text)
+
+  // Extract urls and remove them from the query
+  const urls: string[] = markdownLinkExtractor(text)
+  text = urls.reduce((acc, url) => acc.replace(url, ''), text)
+
+  const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean)
 
   let chs: string[] = []
   const chsSegmenter = getChsSegmenter()
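On the search side the same extractor runs over the query string: each URL is erased from the text before the remaining words are tokenized, and the URLs rejoin the token list whole, so a pasted link is matched as one term instead of a pile of fragments. A small sketch of that strip-and-keep step on an assumed query:

// Assumed query string and extractor output, for illustration only.
const query = 'release notes https://example.com/changelog'
const urls = ['https://example.com/changelog']

// Same reduce as above: erase each URL from the query text...
const stripped = urls.reduce((acc, url) => acc.replace(url, ''), query)
// stripped => 'release notes '

// ...then tokenize what is left and add the URLs back as whole tokens.
const tokens = [...stripped.split(/\s+/), ...urls].filter(Boolean)
// tokens => ['release', 'notes', 'https://example.com/changelog']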
@@ -70,7 +82,7 @@ export function tokenizeForSearch(text: string): QueryCombination {
     combineWith: 'OR',
     queries: [
       { combineWith: 'AND', queries: tokens },
-      { combineWith: 'AND', queries: tokenizeWords(text) },
+      { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
       { combineWith: 'AND', queries: chs },
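The final query is an OR over several AND groups, one per tokenization strategy: raw tokens (now including URLs), whole words, hyphen splits, camelCase splits, and the Chinese segmenter output; a note matches as soon as any single strategy matches all of its terms. The added .filter(Boolean) on tokenizeWords(text) is presumably needed because stripping URLs out of the query text can leave empty strings behind. A sketch of the shape this builds, with purely illustrative token lists:

// Illustrative QueryCombination for a query like 'some-token'; the actual
// lists come from the tokenizers above.
const combination = {
  combineWith: 'OR',
  queries: [
    { combineWith: 'AND', queries: ['some-token'] },    // raw tokens (and urls)
    { combineWith: 'AND', queries: ['some-token'] },    // whole words, '' filtered out
    { combineWith: 'AND', queries: ['some', 'token'] }, // hyphen split
    { combineWith: 'AND', queries: ['some-token'] },    // camelCase split (unchanged here)
    { combineWith: 'AND', queries: [] },                // Chinese segmenter output, if enabled
  ],
}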