Refactored excerpts and highlighting

2023-08-31 19:23:44 +02:00
parent d82a12d8d6
commit 603116158e
8 changed files with 247 additions and 178 deletions
@@ -1,7 +1,7 @@
 import type { ResultNote } from '../globals'
 import { Query } from '../search/query'
 import { searchEngine } from '../search/omnisearch'
-import { makeExcerpt } from './utils'
+import { makeExcerpt } from './text-processing'
 import { refreshIndex } from '../notes-index'

 type ResultNoteApi = {
@@ -31,7 +31,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] {

    const excerpt = makeExcerpt(content, matches[0]?.offset ?? -1)

-    return {
+    const res: ResultNoteApi = {
      score,
      path,
      basename,
@@ -42,8 +42,10 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] {
          offset: match.offset,
        }
      }),
-      excerpt,
+      excerpt: excerpt.content,
    }
+
+    return res
  })
 }

@@ -1,6 +1,6 @@
 import { type CachedMetadata, MarkdownView, TFile } from 'obsidian'
-import { stringsToRegex } from './utils'
 import type { ResultNote } from '../globals'
+import { stringsToRegex } from './text-processing'

 export async function openNote(
  item: ResultNote,
@@ -0,0 +1,204 @@
+import {
+  highlightClass,
+  type SearchMatch,
+  regexLineSplit,
+  regexYaml,
+  getChsSegmenter,
+  SPACE_OR_PUNCTUATION_UNIQUE,
+  regexStripQuotes,
+  excerptAfter,
+  excerptBefore,
+} from 'src/globals'
+import { settings } from 'src/settings'
+import { escapeRegex, warnDebug } from './utils'
+import type { Query } from 'src/search/query'
+import { Notice } from 'obsidian'
+
+export function highlighterGroups(_substring: string, ...args: any[]) {
+  // args[0] is the single char preceding args[1], which is the word we want to highlight
+  if (!!args[1].trim())
+    return `<span>${args[0]}</span><span class="${highlightClass}">${args[1]}</span>`
+  return '&lt;no content&gt;'
+}
+
+export function highlightText(text: string, matches: SearchMatch[]): string {
+  matches.forEach(matchInfo => {
+    const matchRegex = new RegExp(`\\b${matchInfo.match}\\b`, 'giu')
+    const matchOffsets = []
+
+    let match
+    while ((match = matchRegex.exec(text)) !== null) {
+      matchOffsets.push({ index: match.index, text: match[0] })
+    }
+
+    if (!matchOffsets.length) {
+      return text
+    }
+
+    const closestMatch = matchOffsets.reduce((prev, curr) => {
+      return Math.abs(curr.index - matchInfo.offset) <
+        Math.abs(prev.index - matchInfo.offset)
+        ? curr
+        : prev
+    })
+
+    if (matchOffsets.includes(closestMatch)) {
+      const originalMatch = closestMatch.text
+      text =
+        text.substring(0, closestMatch.index) +
+        `<span class="${highlightClass}">` +
+        originalMatch +
+        '</span>' +
+        text.substring(closestMatch.index + originalMatch.length)
+    }
+  })
+
+  return text
+}
+
+export function escapeHTML(html: string): string {
+  return html
+    .replaceAll('&', '&amp;')
+    .replaceAll('<', '&lt;')
+    .replaceAll('>', '&gt;')
+    .replaceAll('"', '&quot;')
+    .replaceAll("'", '&#039;')
+}
+
+export function splitLines(text: string): string[] {
+  return text.split(regexLineSplit).filter(l => !!l && l.length > 2)
+}
+
+export function removeFrontMatter(text: string): string {
+  // Regex to recognize YAML Front Matter (at beginning of file, 3 hyphens, then any character, including newlines, then 3 hyphens).
+  return text.replace(regexYaml, '')
+}
+
+/**
+ * Used to find excerpts in a note body, or select which words to highlight
+ */
+export function stringsToRegex(strings: string[]): RegExp {
+  if (!strings.length) return /^$/g
+
+  // sort strings by decreasing length, so that longer strings are matched first
+  strings.sort((a, b) => b.length - a.length)
+
+  const joined =
+    '(' +
+    // Default word split is not applied if the user uses the cm-chs-patch plugin
+    (getChsSegmenter()
+      ? ''
+      : // Split on start of line, spaces, punctuation, or capital letters (for camelCase)
+      // We also add the hyphen to the list of characters that can split words
+      settings.splitCamelCase
+      ? `^|${SPACE_OR_PUNCTUATION_UNIQUE.source}|\-|[A-Z]`
+      : `^|${SPACE_OR_PUNCTUATION_UNIQUE.source}|\-`) +
+    ')' +
+    `(${strings.map(s => escapeRegex(s)).join('|')})`
+
+  const reg = new RegExp(`${joined}`, 'gu')
+  return reg
+}
+
+export function getMatches(
+  text: string,
+  reg: RegExp,
+  query?: Query
+): SearchMatch[] {
+  text = text.toLowerCase()
+  const startTime = new Date().getTime()
+  let match: RegExpExecArray | null = null
+  let matches: SearchMatch[] = []
+  let count = 0
+  while ((match = reg.exec(text)) !== null) {
+    // Avoid infinite loops, stop looking after 100 matches or if we're taking too much time
+    if (++count >= 100 || new Date().getTime() - startTime > 50) {
+      warnDebug('Stopped getMatches at', count, 'results')
+      break
+    }
+    const m = match[2]
+    if (m && match.index >= 0) {
+      matches.push({ match: m, offset: match.index + 1 })
+    }
+  }
+
+  // If the query is found "as is" in the text, at the offset of an existing match, put this match first
+  if (query) {
+    const best = text.indexOf(query.segmentsToStr())
+    if (best > -1 && matches.find(m => m.offset === best)) {
+      matches = matches.filter(m => m.offset !== best)
+      matches.unshift({
+        offset: best,
+        match: query.segmentsToStr(),
+      })
+    }
+  }
+
+  return matches
+}
+
+export function makeExcerpt(
+  content: string,
+  offset: number
+): { content: string; offset: number } {
+  try {
+    const pos = offset ?? -1
+    const from = Math.max(0, pos - excerptBefore)
+    const to = Math.min(content.length, pos + excerptAfter)
+    if (pos > -1) {
+      content =
+        (from > 0 ? '…' : '') +
+        content.slice(from, to).trim() +
+        (to < content.length - 1 ? '…' : '')
+    } else {
+      content = content.slice(0, excerptAfter)
+    }
+    if (settings.renderLineReturnInExcerpts) {
+      const lineReturn = new RegExp(/(?:\r\n|\r|\n)/g)
+      // Remove multiple line returns
+      content = content
+        .split(lineReturn)
+        .filter(l => l)
+        .join('\n')
+
+      const last = content.lastIndexOf('\n', pos - from)
+
+      if (last > 0) {
+        content = content.slice(last)
+      }
+    }
+
+    content = escapeHTML(content)
+
+    if (settings.renderLineReturnInExcerpts) {
+      content = content.trim().replaceAll('\n', '<br>')
+    }
+
+    return { content: content, offset: pos }
+  } catch (e) {
+    new Notice(
+      'Omnisearch - Error while creating excerpt, see developer console'
+    )
+    console.error(`Omnisearch - Error while creating excerpt`)
+    console.error(e)
+    return { content: '', offset: -1 }
+  }
+}
+
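+/**
+ * Extracts the "expressions in quotes" from a string; unquoted words are ignored
+ * @param str the text to scan for double-quoted expressions
+ * @returns the quoted expressions without their quotes; empty ones are dropped
+ */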
+export function splitQuotes(str: string): string[] {
+  return (
+    str
+      .match(/"(.*?)"/g)
+      ?.map(s => s.replace(/"/g, ''))
+      .filter(q => !!q) ?? []
+  )
+}
+
+export function stripSurroundingQuotes(str: string): string {
+  return str.replace(regexStripQuotes, '')
+}
@@ -1,52 +1,17 @@
 import {
  type CachedMetadata,
  getAllTags,
-  Notice,
  parseFrontMatterAliases,
  Platform,
 } from 'obsidian'
-import {
-  excerptAfter,
-  excerptBefore,
-  getChsSegmenter,
-  getTextExtractor,
-  highlightClass,
-  isSearchMatch,
-  regexLineSplit,
-  regexStripQuotes,
-  regexYaml,
-  SPACE_OR_PUNCTUATION,
-  type SearchMatch,
-} from '../globals'
+import { getTextExtractor, isSearchMatch, type SearchMatch } from '../globals'
 import { canIndexUnsupportedFiles, settings } from '../settings'
 import { type BinaryLike, createHash } from 'crypto'
 import { md5 } from 'pure-md5'

-export function highlighter(str: string): string {
-  return `<span class="${highlightClass}">${str}</span>`
-}
-
-export function highlighterGroups(substring: string, ...args: any[]): string {
-  return `<span class="${highlightClass}">${substring}</span>`
-}
-
-export function escapeHTML(html: string): string {
-  return html
-    .replaceAll('&', '&amp;')
-    .replaceAll('<', '&lt;')
-    .replaceAll('>', '&gt;')
-    .replaceAll('"', '&quot;')
-    .replaceAll("'", '&#039;')
-}
-
-export function splitLines(text: string): string[] {
-  return text.split(regexLineSplit).filter(l => !!l && l.length > 2)
-}
-
-export function removeFrontMatter(text: string): string {
-  // Regex to recognize YAML Front Matter (at beginning of file, 3 hyphens, than any charecter, including newlines, then 3 hyphens).
-  return text.replace(regexYaml, '')
-}
+// export function highlighter(str: string): string {
+//   return `<span class="${highlightClass}">${str}</span>`
+// }

 export function pathWithoutFilename(path: string): string {
  const split = path.split('/')
@@ -79,20 +44,6 @@ export function getAllIndices(text: string, regex: RegExp): SearchMatch[] {
    .filter(isSearchMatch)
 }

-/**
- * Used to find excerpts in a note body, or select which words to highlight
- */
-export function stringsToRegex(strings: string[]): RegExp {
-  if (!strings.length) return /^$/g
-
-  // sort strings by decreasing length, so that longer strings are matched first
-  strings.sort((a, b) => b.length - a.length)
-
-  const joined = `(${strings.map(s => escapeRegex(s)).join('|')})`
-
-  return new RegExp(`${joined}`, 'giu')
-}
-
 export function extractHeadingsFromCache(
  cache: CachedMetadata,
  level: number
@@ -106,69 +57,6 @@ export function loopIndex(index: number, nbItems: number): number {
  return (index + nbItems) % nbItems
 }

-export function makeExcerpt(content: string, offset: number): string {
-  try {
-    const pos = offset ?? -1
-    const from = Math.max(0, pos - excerptBefore)
-    const to = Math.min(content.length, pos + excerptAfter)
-    if (pos > -1) {
-      content =
-        (from > 0 ? '…' : '') +
-        content.slice(from, to).trim() +
-        (to < content.length - 1 ? '…' : '')
-    } else {
-      content = content.slice(0, excerptAfter)
-    }
-    if (settings.renderLineReturnInExcerpts) {
-      const lineReturn = new RegExp(/(?:\r\n|\r|\n)/g)
-      // Remove multiple line returns
-      content = content
-        .split(lineReturn)
-        .filter(l => l)
-        .join('\n')
-
-      const last = content.lastIndexOf('\n', pos - from)
-
-      if (last > 0) {
-        content = content.slice(last)
-      }
-    }
-
-    content = escapeHTML(content)
-
-    if (settings.renderLineReturnInExcerpts) {
-      content = content.trim().replaceAll('\n', '<br>')
-    }
-
-    return content
-  } catch (e) {
-    new Notice(
-      'Omnisearch - Error while creating excerpt, see developer console'
-    )
-    console.error(`Omnisearch - Error while creating excerpt`)
-    console.error(e)
-    return ''
-  }
-}
-
-/**
- * splits a string in words or "expressions in quotes"
- * @param str
- * @returns
- */
-export function splitQuotes(str: string): string[] {
-  return (
-    str
-      .match(/"(.*?)"/g)
-      ?.map(s => s.replace(/"/g, ''))
-      .filter(q => !!q) ?? []
-  )
-}
-
-export function stripSurroundingQuotes(str: string): string {
-  return str.replace(regexStripQuotes, '')
-}
-
 function mapAsync<T, U>(
  array: T[],
  callbackfn: (value: T, index: number, array: T[]) => Promise<U>
@@ -263,7 +151,7 @@ export function isContentIndexable(path: string): boolean {

 export function isFilenameIndexable(path: string): boolean {
  return (
-    (canIndexUnsupportedFiles()) ||
+    canIndexUnsupportedFiles() ||
    isFilePlaintext(path) ||
    isFileCanvas(path) ||
    isFileFromDataloomPlugin(path)
@@ -329,13 +217,13 @@ export function chunkArray<T>(arr: T[], len: number): T[][] {
 export function splitCamelCase(text: string): string[] {
  // if no camel case found, do nothing
  if (!/[a-z][A-Z]/.test(text)) {
-    return [];
+    return []
  }
  const splittedText = text
    .replace(/([a-z](?=[A-Z]))/g, '$1 ')
    .split(' ')
-    .filter(t => t);
-  return splittedText;
+    .filter(t => t)
+  return splittedText
 }

 /**