From 484e961a7e082a9caa6f5ad606ec36402f37e6f9 Mon Sep 17 00:00:00 2001
From: Simon Cambier <simon.cambier@protonmail.com>
Date: Tue, 26 Mar 2024 21:32:03 +0100
Subject: [PATCH] Highlighting rework, should help with #304

---
 src/components/ResultItemInFile.svelte | 12 +++---------
 src/components/ResultItemVault.svelte  |  6 +-----
 src/search/omnisearch.ts               |  4 +++-
 src/search/tokenizer.ts                | 16 ++++++++++++++--
 src/tools/api.ts                       |  2 +-
 src/tools/text-processing.ts           | 23 ++++++++++-------------
 6 files changed, 32 insertions(+), 31 deletions(-)
diff --git a/src/components/ResultItemInFile.svelte b/src/components/ResultItemInFile.svelte
index db9bfa5..096d9db 100644
--- a/src/components/ResultItemInFile.svelte
+++ b/src/components/ResultItemInFile.svelte
@@ -1,8 +1,5 @@
 <script lang="ts">
-  import {
-    makeExcerpt,
-    highlightText,
-  } from 'src/tools/text-processing'
+  import { makeExcerpt, highlightText } from 'src/tools/text-processing'
   import type { ResultNote } from '../globals'
   import ResultItemContainer from './ResultItemContainer.svelte'
   import { cloneDeep } from 'lodash-es'
@@ -13,10 +10,7 @@
   export let selected = false
 
   $: cleanedContent = makeExcerpt(note?.content ?? '', offset)
-  $: matchesExcerpt = cloneDeep(note.matches).map(m => {
-    m.offset = m.offset - cleanedContent.offset
-    return m
-  })
+  $: matchesExcerpt = cloneDeep(note.matches)
 </script>
 
 <ResultItemContainer
@@ -26,6 +20,6 @@
   on:click
   on:auxclick>
   <div class="omnisearch-result__body">
-    {@html highlightText(cleanedContent.content, matchesExcerpt)}
+    {@html highlightText(cleanedContent, matchesExcerpt)}
   </div>
 </ResultItemContainer>
diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte
index 9357dae..b1f9970 100644
--- a/src/components/ResultItemVault.svelte
+++ b/src/components/ResultItemVault.svelte
@@ -40,10 +40,6 @@
   $: reg = stringsToRegex(note.foundWords)
   $: matchesTitle = getMatches(title, reg)
   $: matchesNotePath = getMatches(notePath, reg)
-  $: matchesExcerpt = cloneDeep(note.matches).map(m => {
-    m.offset = m.offset - cleanedContent.offset
-    return m
-  })
   $: cleanedContent = makeExcerpt(note.content, note.matches[0]?.offset ?? -1)
   $: glyph = false //cacheManager.getLiveDocument(note.path)?.doesNotExist
   $: {
@@ -102,7 +98,7 @@
     <div style="display: flex; flex-direction: row;">
       {#if $showExcerpt}
         <div class="omnisearch-result__body">
-          {@html highlightText(cleanedContent.content, matchesExcerpt)}
+          {@html highlightText(cleanedContent, note.matches)}
         </div>
       {/if}
 
diff --git a/src/search/omnisearch.ts b/src/search/omnisearch.ts
index 4411dc3..88e9cb3 100644
--- a/src/search/omnisearch.ts
+++ b/src/search/omnisearch.ts
@@ -190,6 +190,8 @@ export class Omnisearch {
         headings3: settings.weightH3,
         unmarkedTags: settings.weightUnmarkedTags,
       },
+      // The query is already tokenized; don't tokenize it again
+      tokenize: text => [text],
     })
 
     logDebug('Found', results.length, 'results')
@@ -404,7 +406,7 @@ export class Omnisearch {
 
         // Tags, starting with #
         ...query.getTags(),
-      ].filter(w => w.length > 1 || /\p{Emoji}/u.test(w))
+      ]
       logDebug('Matching tokens:', foundWords)
 
       logDebug('Getting matches locations...')
diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index 2fca57e..29816e2 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -6,6 +6,7 @@ import {
   getChsSegmenter,
 } from 'src/globals'
 import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+const markdownLinkExtractor = require('markdown-link-extractor')
 
 function tokenizeWords(text: string): string[] {
   return text.split(BRACKETS_AND_SPACE)
@@ -23,6 +24,7 @@ function tokenizeTokens(text: string): string[] {
  */
 export function tokenizeForIndexing(text: string): string[] {
   const words = tokenizeWords(text)
+  const urls: string[] = markdownLinkExtractor(text)
 
   let tokens = tokenizeTokens(text)
 
@@ -35,6 +37,11 @@ export function tokenizeForIndexing(text: string): string[] {
   // Add whole words (aka "not tokens")
   tokens = [...tokens, ...words]
 
+  // Add urls
+  if (urls.length) {
+    tokens = [...tokens, ...urls]
+  }
+
   const chsSegmenter = getChsSegmenter()
   if (chsSegmenter) {
     const chs = tokens.flatMap(word =>
@@ -56,7 +63,12 @@ export function tokenizeForIndexing(text: string): string[] {
  * @returns
  */
 export function tokenizeForSearch(text: string): QueryCombination {
-  const tokens = tokenizeTokens(text)
+
+  // Extract urls and remove them from the query
+  const urls: string[] = markdownLinkExtractor(text)
+  text = urls.reduce((acc, url) => acc.replace(url, ''), text)
+
+  const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean)
 
   let chs: string[] = []
   const chsSegmenter = getChsSegmenter()
@@ -70,7 +82,7 @@ export function tokenizeForSearch(text: string): QueryCombination {
     combineWith: 'OR',
     queries: [
       { combineWith: 'AND', queries: tokens },
-      { combineWith: 'AND', queries: tokenizeWords(text) },
+      { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) },
       { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
       { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
       { combineWith: 'AND', queries: chs },
diff --git a/src/tools/api.ts b/src/tools/api.ts
index 711253c..93abe7a 100644
--- a/src/tools/api.ts
+++ b/src/tools/api.ts
@@ -44,7 +44,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] {
           offset: match.offset,
         }
       }),
-      excerpt: excerpt.content,
+      excerpt: excerpt,
     }
 
     return res
diff --git a/src/tools/text-processing.ts b/src/tools/text-processing.ts
index f9d2047..481ea3d 100644
--- a/src/tools/text-processing.ts
+++ b/src/tools/text-processing.ts
@@ -6,15 +6,12 @@ import {
   regexStripQuotes,
   excerptAfter,
   excerptBefore,
-  SEPARATORS,
 } from 'src/globals'
 import { settings } from 'src/settings'
 import { removeDiacritics, warnDebug } from './utils'
 import type { Query } from 'src/search/query'
 import { Notice } from 'obsidian'
 import { escapeRegExp } from 'lodash-es'
-import { tokenizeForSearch } from 'src/search/tokenizer'
-import type { QueryCombination } from 'minisearch'
 
 /**
  * Wraps the matches in the text with a <span> element and a highlight class
@@ -115,14 +112,19 @@ export function stringsToRegex(strings: string[]): RegExp {
   return new RegExp(`${joined}`, 'gui')
 }
 
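+/**
+ * Returns an array of matches in the text, using the provided regex
+ * @param text The text to search; diacritics are removed first when `settings.ignoreDiacritics` is enabled
+ * @param reg The regular expression used to find the matches
+ * @param query Optional query; its best excerpt string may be added at the front of the returned matches
+ */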
 export function getMatches(
   text: string,
   reg: RegExp,
   query?: Query
 ): SearchMatch[] {
-  const separatorRegExp = new RegExp(SEPARATORS, 'gu')
   const originalText = text
-  text = text.toLowerCase().replace(separatorRegExp, ' ')
+  // text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
   if (settings.ignoreDiacritics) {
     text = removeDiacritics(text)
   }
@@ -153,21 +155,16 @@ export function getMatches(
   ) {
     const best = text.indexOf(query.getBestStringForExcerpt())
     if (best > -1 && matches.find(m => m.offset === best)) {
-      matches = matches.filter(m => m.offset !== best)
       matches.unshift({
         offset: best,
         match: query.getBestStringForExcerpt(),
       })
     }
   }
-
   return matches
 }
 
-export function makeExcerpt(
-  content: string,
-  offset: number
-): { content: string; offset: number } {
+export function makeExcerpt(content: string, offset: number): string {
   try {
     const pos = offset ?? -1
     const from = Math.max(0, pos - excerptBefore)
@@ -201,14 +198,14 @@ export function makeExcerpt(
       content = content.trim().replaceAll('\n', '<br>')
     }
 
-    return { content: content, offset: pos }
+    return content
   } catch (e) {
     new Notice(
       'Omnisearch - Error while creating excerpt, see developer console'
     )
     console.error(`Omnisearch - Error while creating excerpt`)
     console.error(e)
-    return { content: '', offset: -1 }
+    return ''
   }
 }