Squashed commit of the following:

commit ac82511ddd17d5472ae3cfea9bbad9754f5a4d62
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Oct 22 08:23:42 2022 +0200

    Screw that cache, seriously.

commit 8ba40d1be73daaaffea09e07bc56c339266db9b6
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:36:48 2022 +0200

    Stuff

commit 27b8fd7dc809be9714a109d3a458eb1276a47e2e
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:22:20 2022 +0200

    Moved files

commit fb1349c914907e586e103ca54fb04b9ddd45ef5d
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 22:25:29 2022 +0200

    Removed duplicate code

commit e7371138e60cbe4155cfd4fb44e3ee1d2e3ee088
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:50:09 2022 +0200

    Moved a bunch of files

commit 2ee1b2a0e799d4b41ab3a444d8cc44dfff5b5623
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:32:21 2022 +0200

    Removed useless code

commit 76c530dfb9adbad1bbe9079de2330fe43a044249
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 20:44:11 2022 +0200

    Split file reading and indexing

commit c2ecdd79ad (parent 1376cea282)
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Oct 22 08:25:34 2022 +0200

25 changed files with 338 additions and 403 deletions

src/search/search.ts (new file, 249 lines)

@@ -0,0 +1,249 @@
import MiniSearch, {
type AsPlainObject,
type Options,
type SearchResult,
} from 'minisearch'
import {
chsRegex,
type IndexedDocument,
type ResultNote,
type SearchMatch,
SPACE_OR_PUNCTUATION,
} from '../globals'
import {
removeDiacritics,
stringsToRegex,
stripMarkdownCharacters,
} from '../tools/utils'
import type { Query } from './query'
import { settings } from '../settings'
import { cacheManager } from '../cache-manager'
let minisearchInstance: MiniSearch<IndexedDocument>
const tokenize = (text: string): string[] => {
const tokens = text.split(SPACE_OR_PUNCTUATION)
const chsSegmenter = (app as any).plugins.plugins['cm-chs-patch']
if (chsSegmenter) {
return tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else return tokens
}
const minisearchOptions: Options<IndexedDocument> = {
tokenize,
processTerm: (term: string) =>
(settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
idField: 'path',
fields: [
'basename',
'aliases',
'content',
'headings1',
'headings2',
'headings3',
],
storeFields: ['tags'],
}
/**
* Initializes an empty MiniSearch instance with the default options.
* Documents are added separately with addAllToMinisearch()
*/
export async function initSearchEngine(): Promise<void> {
// Default instance
minisearchInstance = new MiniSearch(minisearchOptions)
}
export async function initSearchEngineFromData(json: string): Promise<void> {
try {
minisearchInstance = MiniSearch.loadJSON(json, minisearchOptions)
console.log('Omnisearch - MiniSearch index loaded from the file')
} catch (e) {
console.error('Omnisearch - Could not load MiniSearch index from json')
console.error(e)
}
}
/**
* Searches the index for the given query,
* and returns an array of raw results
* @param query The parsed search query
* @returns The raw MiniSearch results, filtered by exact terms and exclusions
*/
async function search(query: Query): Promise<SearchResult[]> {
if (!query.segmentsToStr()) return []
let results = minisearchInstance.search(query.segmentsToStr(), {
prefix: true,
fuzzy: term => (term.length > 4 ? 0.2 : false),
combineWith: 'AND',
boost: {
basename: settings.weightBasename,
aliases: settings.weightBasename,
headings1: settings.weightH1,
headings2: settings.weightH2,
headings3: settings.weightH3,
},
})
// Downrank files that are in Obsidian's excluded list
if (settings.respectExcluded) {
results.forEach(result => {
if (
app.metadataCache.isUserIgnored &&
app.metadataCache.isUserIgnored(result.id)
) {
result.score /= 3 // TODO: make this value configurable or toggleable?
}
})
}
// If the search query contains quoted expressions, only keep results that contain them
const exactTerms = query.getExactTerms()
if (exactTerms.length) {
results = results.filter(r => {
const title = cacheManager.getDocument(r.id)?.path.toLowerCase() ?? ''
const content = stripMarkdownCharacters(
cacheManager.getDocument(r.id)?.content ?? ''
).toLowerCase()
return exactTerms.every(q => content.includes(q) || title.includes(q))
})
}
// If the search query contains exclude terms, filter out results that have them
const exclusions = query.exclusions
if (exclusions.length) {
results = results.filter(r => {
const content = stripMarkdownCharacters(
cacheManager.getDocument(r.id)?.content ?? ''
).toLowerCase()
return exclusions.every(q => !content.includes(q.value))
})
}
return results
}
/**
* Runs a regex against a text, and returns the { match, offset } pairs
* @param text The text to scan
* @param reg The regex to execute (must have the global flag)
* @returns An array of SearchMatch objects
*/
export function getMatches(text: string, reg: RegExp): SearchMatch[] {
let match: RegExpExecArray | null = null
const matches: SearchMatch[] = []
let count = 0 // TODO: safety cap; exec() can loop forever if the regex isn't global or matches an empty string
while ((match = reg.exec(text)) !== null) {
if (++count > 100) break
const m = match[0]
if (m) matches.push({ match: m, offset: match.index })
}
return matches
}
/**
* Searches the index, and returns an array of ResultNote objects.
* If the singleFilePath option is set,
* the array contains at most one result, from that file
* @param query The parsed search query
* @param options An optional singleFilePath to restrict results to a single file
* @returns An array of ResultNote objects
*/
export async function getSuggestions(
query: Query,
options?: Partial<{ singleFilePath: string | null }>
): Promise<ResultNote[]> {
// Get the raw results
let results = await search(query)
if (!results.length) return []
// Extract tags from the query
const tags = query.segments
.filter(s => s.value.startsWith('#'))
.map(s => s.value)
// Either keep the first 50 results,
// or only the one corresponding to `singleFilePath`
if (options?.singleFilePath) {
const result = results.find(r => r.id === options.singleFilePath)
if (result) results = [result]
else results = []
} else {
results = results.slice(0, 50)
// Put the results with tags on top
for (const tag of tags) {
for (const result of results) {
if ((result.tags ?? []).includes(tag)) {
result.score *= 100
}
}
}
}
// Map the raw results to get usable suggestions
return results.map(result => {
const note = cacheManager.getDocument(result.id)
if (!note) {
throw new Error(`Note "${result.id}" not indexed`)
}
// Remove '#' from tags, for highlighting
query.segments.forEach(s => {
s.value = s.value.replace(/^#/, '')
})
// Clean search matches that match quoted expressions,
// and inject those expressions instead
const foundWords = [
// Terms that matched in this result;
// they don't necessarily appear in the query
...Object.keys(result.match),
// // Matching terms from the query,
// // but only if they stem from the result's matches
// ...Object.keys(result.match).filter(w =>
// query.segments.some(s => w.startsWith(s.value)),
// ),
// Quoted expressions
...query.segments.filter(s => s.exact).map(s => s.value),
// Tags, starting with #
...tags,
]
const matches = getMatches(note.content, stringsToRegex(foundWords))
const resultNote: ResultNote = {
score: result.score,
foundWords,
matches,
...note,
}
return resultNote
})
}
// #region Read/write minisearch index
export function getMinisearchIndexJSON(): AsPlainObject {
return minisearchInstance.toJSON()
}
export async function addAllToMinisearch(
documents: IndexedDocument[]
): Promise<void> {
await minisearchInstance.addAllAsync(documents)
}
export function addSingleToMinisearch(document: IndexedDocument): void {
minisearchInstance.add(document)
}
export function removeFromMinisearch(document: IndexedDocument): void {
minisearchInstance.remove(document)
}
// #endregion
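
For reference, a minimal sketch (not part of the diff) of how a caller could wire these exports together at startup: reuse the serialized index when a cache exists, otherwise build a fresh one, then query it. The helpers loadCachedIndexJSON(), saveCachedIndexJSON() and collectIndexedDocuments() are hypothetical stand-ins for the plugin's cache layer and vault reader, and the sketch assumes Query can be constructed from a raw query string and that this file sits next to src/main.ts; only the imports from './search/search' come from the code above.

import type { IndexedDocument } from './globals'
import { Query } from './search/query'
import {
  initSearchEngine,
  initSearchEngineFromData,
  addAllToMinisearch,
  getSuggestions,
  getMinisearchIndexJSON,
} from './search/search'

// Hypothetical helpers standing in for the plugin's cache layer and vault reader
declare function loadCachedIndexJSON(): Promise<string | null>
declare function saveCachedIndexJSON(json: string): Promise<void>
declare function collectIndexedDocuments(): Promise<IndexedDocument[]>

async function startOmnisearch(): Promise<void> {
  const cachedJson = await loadCachedIndexJSON()
  if (cachedJson) {
    // Rebuild the MiniSearch instance from the serialized index
    await initSearchEngineFromData(cachedJson)
  } else {
    // Start from an empty index and feed it the vault's documents
    await initSearchEngine()
    await addAllToMinisearch(await collectIndexedDocuments())
    // Persist the index so the next startup can skip re-indexing
    await saveCachedIndexJSON(JSON.stringify(getMinisearchIndexJSON()))
  }

  // Query the index; each ResultNote carries the score, matches, and the note's indexed fields
  const results = await getSuggestions(new Query('"exact phrase" #mytag keyword'))
  console.log(results.map(r => `${r.score} ${r.foundWords.join(', ')}`))
}

The separation sketched here is what the "Split file reading and indexing" commit describes: this module only owns the MiniSearch instance and its serialization, while file reading and cache persistence live in other modules (cacheManager is only consulted for already-indexed documents).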