#92 - Simpler cache implementation

2022-10-26 22:21:11 +02:00
parent 33e7f8fe25
commit cb8de1ad8d
9 changed files with 379 additions and 289 deletions
--- a/src/search/search-engine.ts
+++ b/src/search/search-engine.ts
@@ -0,0 +1,299 @@
+import MiniSearch, {
+  type AsPlainObject,
+  type Options,
+  type SearchResult,
+} from 'minisearch'
+import {
+  chsRegex,
+  type IndexedDocument,
+  type ResultNote,
+  type SearchMatch,
+  SPACE_OR_PUNCTUATION,
+} from '../globals'
+import {
+  removeDiacritics,
+  stringsToRegex,
+  stripMarkdownCharacters,
+} from '../tools/utils'
+import type { Query } from './query'
+import { settings } from '../settings'
+import { cacheManager } from '../cache-manager'
+
+// Splits on SPACE_OR_PUNCTUATION; words matching `chsRegex` go through the `cm-chs-patch` plugin's `cut`
+const tokenize = (text: string): string[] => {
+  const words = text.split(SPACE_OR_PUNCTUATION)
+  const segmenter = (app as any).plugins.plugins['cm-chs-patch']
+  if (!segmenter) return words
+  return words.flatMap(token => {
+    if (chsRegex.test(token)) return segmenter.cut(token)
+    return [token]
+  })
+}
+
+// MiniSearch options: tokenizer, term normalization, id field, indexed and stored fields
+export const minisearchOptions: Options<IndexedDocument> = {
+  tokenize,
+  // Lowercase each term, removing diacritics first when `settings.ignoreDiacritics` is on
+  processTerm: (term: string) => {
+    const base = settings.ignoreDiacritics ? removeDiacritics(term) : term
+    return base.toLowerCase()
+  },
+  idField: 'path',
+  fields: [
+    ...['basename', 'aliases', 'content'],
+    ...['headings1', 'headings2', 'headings3'],
+  ],
+  storeFields: ['tags'],
+}
+
+/** Wraps a MiniSearch index; instances only come from `getEngine()`/`getTmpEngine()`. */
+export class SearchEngine {
+  private static engine: SearchEngine
+  private static tmpEngine: SearchEngine
+  public static isIndexing = true
+
+  /**
+   * The main singleton SearchEngine instance.
+   * Should be used for all queries
+   */
+  public static getEngine(): SearchEngine {
+    if (!this.engine) {
+      this.engine = new SearchEngine()
+    }
+    return this.engine
+  }
+
+  /**
+   * The secondary instance. This one is indexed in the background,
+   * while the main instance is quickly filled with cache data
+   */
+  public static getTmpEngine(): SearchEngine {
+    if (!this.tmpEngine) {
+      this.tmpEngine = new SearchEngine()
+    }
+    return this.tmpEngine
+  }
+
+  /**
+   * Fills the main instance with cache data (if it exists); failures are only logged
+   */
+  public static async initFromCache(): Promise<void> {
+    try {
+      const cache = await cacheManager.getMinisearchCache()
+      if (cache) {
+        this.getEngine().minisearch = cache
+      }
+    } catch (e) {
+      console.error(e)
+    }
+  }
+
+  /**
+   * Must be called when the background indexing is done,
+   * to load the freshest data into the main instance
+   */
+  public static swapEngines(): void {
+    ;[this.engine, this.tmpEngine] = [this.tmpEngine, this.engine]
+    this.isIndexing = false
+  }
+
+  private minisearch: MiniSearch
+
+  private constructor() {
+    this.minisearch = new MiniSearch(minisearchOptions)
+  }
+
+  /**
+   * Searches the index for the given query,
+   * and returns an array of raw results
+   * @param options `fuzzy` is forwarded to MiniSearch's `fuzzy` search option
+   */
+  public async search(
+    query: Query,
+    options = { fuzzy: 0.1 }
+  ): Promise<SearchResult[]> {
+    const queryStr = query.segmentsToStr()
+    if (!queryStr) return []
+
+    let results = this.minisearch.search(queryStr, {
+      prefix: true,
+      // fuzzy: term => (term.length > 4 ? 0.2 : false),
+      fuzzy: options.fuzzy,
+      combineWith: 'AND',
+      boost: {
+        basename: settings.weightBasename,
+        aliases: settings.weightBasename,
+        headings1: settings.weightH1,
+        headings2: settings.weightH2,
+        headings3: settings.weightH3,
+      },
+    })
+
+    // Downrank files that are in Obsidian's excluded list
+    // (the optional call keeps the guard against a missing `isUserIgnored`)
+    if (settings.respectExcluded) {
+      for (const result of results) {
+        if (app.metadataCache.isUserIgnored?.(result.id)) {
+          result.score /= 10 // TODO: make this value configurable or toggleable?
+        }
+      }
+    }
+
+    // If the search query contains quotes, filter out results that don't have the exact match
+    const exactTerms = query.getExactTerms()
+    if (exactTerms.length) {
+      results = results.filter(r => {
+        const doc = cacheManager.getDocument(r.id)
+        const title = doc?.path.toLowerCase() ?? ''
+        const raw = doc?.content ?? ''
+        const content = stripMarkdownCharacters(raw).toLowerCase()
+        return exactTerms.every(q => content.includes(q) || title.includes(q))
+      })
+    }
+
+    // If the search query contains exclude terms, filter out results that have them
+    const exclusions = query.exclusions
+    if (exclusions.length) {
+      results = results.filter(r => {
+        const content = stripMarkdownCharacters(
+          cacheManager.getDocument(r.id)?.content ?? ''
+        ).toLowerCase()
+        return exclusions.every(q => !content.includes(q.value))
+      })
+    }
+    return results
+  }
+
+  /**
+   * Parses a text against a regex, and returns the { string, offset } matches
+   * @param reg presumably a global regex (TODO confirm `stringsToRegex` sets `g`)
+   * @param query if its string form occurs in the lowercased text, that hit comes first
+   */
+  public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] {
+    let match: RegExpExecArray | null = null
+    const matches: SearchMatch[] = []
+    let count = 0
+    while ((match = reg.exec(text)) !== null) {
+      if (++count >= 100) break // Avoid infinite loops, stop at the 100th match
+      const m = match[0]
+      if (m) matches.push({ match: m, offset: match.index })
+    }
+
+    // If the query can be found "as is" in the text, put this match first
+    const queryStr = query.segmentsToStr()
+    const best = text.toLowerCase().indexOf(queryStr)
+    if (best > -1) {
+      matches.unshift({ offset: best, match: queryStr })
+    }
+
+    return matches
+  }
+
+  /**
+   * Searches the index, and returns an array of ResultNote objects.
+   * If the singleFile option is set, the array contains a single result from that file
+   * @param query its segments lose their leading '#' (in place) whenever a non-empty array is returned
+   * @param options `singleFilePath`: when set, only the result with that id is kept
+   * @returns the suggestions, in the order given by `search` (at most 50 without `singleFilePath`)
+   */
+  public async getSuggestions(
+    query: Query,
+    options?: Partial<{ singleFilePath: string | null }>
+  ): Promise<ResultNote[]> {
+    // Get the raw results
+    let results = await this.search(query)
+    if (results.length === 0) {
+      results = await this.search(query, { fuzzy: 0.2 })
+    }
+    if (!results.length) return []
+
+    // Extract tags from the query
+    const tags = query.segments
+      .filter(s => s.value.startsWith('#'))
+      .map(s => s.value)
+
+    // Either keep the 50 first results,
+    // or the one corresponding to `singleFile`
+    if (options?.singleFilePath) {
+      const result = results.find(r => r.id === options.singleFilePath)
+      if (!result) return []
+      results = [result]
+    } else {
+      results = results.slice(0, 50)
+
+      // Put the results with tags on top
+      for (const tag of tags) {
+        for (const result of results) {
+          if ((result.tags ?? []).includes(tag)) {
+            result.score *= 100
+          }
+        }
+      }
+    }
+
+    // Remove '#' from tags, for highlighting.
+    // `results` is never empty here, so doing it once replaces the per-result pass
+    query.segments.forEach(s => {
+      s.value = s.value.replace(/^#/, '')
+    })
+    // Quoted expressions (identical for every result)
+    const exactWords = query.segments.filter(s => s.exact).map(s => s.value)
+
+    // Map the raw results to get usable suggestions
+    return results.map((result): ResultNote => {
+      let note = cacheManager.getDocument(result.id)
+      if (!note) {
+        // Not in the document cache: fall back to a stub built from the result id
+        note = {
+          content: '',
+          basename: result.id,
+          path: result.id,
+        } as IndexedDocument
+      }
+
+      // Clean search matches that match quoted expressions,
+      // and inject those expressions instead
+      const foundWords = [
+        // Matching terms from the result,
+        // do not necessarily match the query
+        ...Object.keys(result.match),
+        ...exactWords,
+        // Tags, starting with #
+        ...tags,
+      ].filter(w => w.length > 1)
+
+      const regex = stringsToRegex(foundWords)
+      const matches = this.getMatches(note.content, regex, query)
+      return {
+        score: result.score,
+        foundWords,
+        matches,
+        ...note,
+      }
+    })
+  }
+
+  // #region Read/write minisearch index
+
+  /** Adds all the given documents to the index, through MiniSearch's `addAllAsync` */
+  public async addAllToMinisearch(documents: IndexedDocument[]): Promise<void> {
+    await this.minisearch.addAllAsync(documents)
+  }
+
+  /** Adds a single document to the index */
+  public addSingleToMinisearch(document: IndexedDocument): void {
+    this.minisearch.add(document)
+  }
+
+  /** Removes a document from the index */
+  public removeFromMinisearch(document: IndexedDocument): void {
+    this.minisearch.remove(document)
+  }
+
+  // #endregion
+
+  /** Hands this instance's index to the cache manager's `writeMinisearchCache` */
+  public async writeToCache(): Promise<void> {
+    await cacheManager.writeMinisearchCache(this.minisearch)
+  }
+}
--- a/src/search/search.ts
+++ b/src/search/search.ts
@@ -1,260 +0,0 @@
-import MiniSearch, {
-  type AsPlainObject,
-  type Options,
-  type SearchResult,
-} from 'minisearch'
-import {
-  chsRegex,
-  type IndexedDocument,
-  type ResultNote,
-  type SearchMatch,
-  SPACE_OR_PUNCTUATION,
-} from '../globals'
-import {
-  removeDiacritics,
-  stringsToRegex,
-  stripMarkdownCharacters,
-} from '../tools/utils'
-import type { Query } from './query'
-import { settings } from '../settings'
-import { cacheManager } from '../cache-manager'
-
-let minisearchInstance: MiniSearch<IndexedDocument>
-
-const tokenize = (text: string): string[] => {
-  const tokens = text.split(SPACE_OR_PUNCTUATION)
-  const chsSegmenter = (app as any).plugins.plugins['cm-chs-patch']
-
-  if (chsSegmenter) {
-    return tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-  } else return tokens
-}
-
-const minisearchOptions: Options<IndexedDocument> = {
-  tokenize,
-  processTerm: (term: string) =>
-    (settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
-  idField: 'path',
-  fields: [
-    'basename',
-    'aliases',
-    'content',
-    'headings1',
-    'headings2',
-    'headings3',
-  ],
-  storeFields: ['tags'],
-}
-
-/**
- * Initializes the MiniSearch instance,
- * and adds all the notes to the index
- */
-export async function initSearchEngine(): Promise<void> {
-  // Default instance
-  minisearchInstance = new MiniSearch(minisearchOptions)
-}
-
-export async function initSearchEngineFromData(json: string): Promise<void> {
-  try {
-    minisearchInstance = MiniSearch.loadJSON(json, minisearchOptions)
-    console.log('Omnisearch - MiniSearch index loaded from the file')
-  } catch (e) {
-    console.error('Omnisearch - Could not load MiniSearch index from json')
-    console.error(e)
-  }
-}
-
-/**
- * Searches the index for the given query,
- * and returns an array of raw results
- */
-async function search(
-  query: Query,
-  options = { fuzzy: 0.1 }
-): Promise<SearchResult[]> {
-  if (!query.segmentsToStr()) return []
-
-  let results = minisearchInstance.search(query.segmentsToStr(), {
-    prefix: true,
-    // fuzzy: term => (term.length > 4 ? 0.2 : false),
-    fuzzy: options.fuzzy,
-    combineWith: 'AND',
-    boost: {
-      basename: settings.weightBasename,
-      aliases: settings.weightBasename,
-      headings1: settings.weightH1,
-      headings2: settings.weightH2,
-      headings3: settings.weightH3,
-    },
-  })
-
-  // Downrank files that are in Obsidian's excluded list
-  if (settings.respectExcluded) {
-    results.forEach(result => {
-      if (
-        app.metadataCache.isUserIgnored &&
-        app.metadataCache.isUserIgnored(result.id)
-      ) {
-        result.score /= 10 // TODO: make this value configurable or toggleable?
-      }
-    })
-  }
-
-  // If the search query contains quotes, filter out results that don't have the exact match
-  const exactTerms = query.getExactTerms()
-  if (exactTerms.length) {
-    results = results.filter(r => {
-      const title = cacheManager.getDocument(r.id)?.path.toLowerCase() ?? ''
-      const content = stripMarkdownCharacters(
-        cacheManager.getDocument(r.id)?.content ?? ''
-      ).toLowerCase()
-      return exactTerms.every(q => content.includes(q) || title.includes(q))
-    })
-  }
-
-  // If the search query contains exclude terms, filter out results that have them
-  const exclusions = query.exclusions
-  if (exclusions.length) {
-    results = results.filter(r => {
-      const content = stripMarkdownCharacters(
-        cacheManager.getDocument(r.id)?.content ?? ''
-      ).toLowerCase()
-      return exclusions.every(q => !content.includes(q.value))
-    })
-  }
-  return results
-}
-
-/**
- * Parses a text against a regex, and returns the { string, offset } matches
- */
-export function getMatches(
-  text: string,
-  reg: RegExp,
-  query: Query
-): SearchMatch[] {
-  let match: RegExpExecArray | null = null
-  const matches: SearchMatch[] = []
-  let count = 0
-  while ((match = reg.exec(text)) !== null) {
-    if (++count >= 100) break // Avoid infinite loops, stop looking after 100 matches
-    const m = match[0]
-    if (m) matches.push({ match: m, offset: match.index })
-  }
-
-  // If the query can be found "as is" in the text, put this match first
-  const best = text.toLowerCase().indexOf(query.segmentsToStr())
-  if (best > -1) {
-    matches.unshift({
-      offset: best,
-      match: query.segmentsToStr(),
-    })
-  }
-
-  return matches
-}
-
-/**
- * Searches the index, and returns an array of ResultNote objects.
- * If we have the singleFile option set,
- * the array contains a single result from that file
- * @param query
- * @param options
- * @returns
- */
-export async function getSuggestions(
-  query: Query,
-  options?: Partial<{ singleFilePath: string | null }>
-): Promise<ResultNote[]> {
-  // Get the raw results
-  let results = await search(query)
-  if (results.length == 0) {
-    results = await search(query, { fuzzy: 0.2 })
-  }
-  if (!results.length) return []
-
-  // Extract tags from the query
-  const tags = query.segments
-    .filter(s => s.value.startsWith('#'))
-    .map(s => s.value)
-
-  // Either keep the 50 first results,
-  // or the one corresponding to `singleFile`
-  if (options?.singleFilePath) {
-    const result = results.find(r => r.id === options.singleFilePath)
-    if (result) results = [result]
-    else results = []
-  } else {
-    results = results.slice(0, 50)
-
-    // Put the results with tags on top
-    for (const tag of tags) {
-      for (const result of results) {
-        if ((result.tags ?? []).includes(tag)) {
-          result.score *= 100
-        }
-      }
-    }
-  }
-
-  // Map the raw results to get usable suggestions
-  return results.map(result => {
-    const note = cacheManager.getDocument(result.id)
-    if (!note) {
-      throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
-    }
-
-    // Remove '#' from tags, for highlighting
-    query.segments.forEach(s => {
-      s.value = s.value.replace(/^#/, '')
-    })
-    // Clean search matches that match quoted expressions,
-    // and inject those expressions instead
-    const foundWords = [
-      // Matching terms from the result,
-      // do not necessarily match the query
-      ...Object.keys(result.match),
-
-      // Quoted expressions
-      ...query.segments.filter(s => s.exact).map(s => s.value),
-
-      // Tags, starting with #
-      ...tags,
-    ].filter(w => w.length > 1)
-
-    // console.log(foundWords)
-    const matches = getMatches(note.content, stringsToRegex(foundWords), query)
-    const resultNote: ResultNote = {
-      score: result.score,
-      foundWords,
-      matches,
-      ...note,
-    }
-    return resultNote
-  })
-}
-
-// #region Read/write minisearch index
-
-export function getMinisearchIndexJSON(): AsPlainObject {
-  return minisearchInstance.toJSON()
-}
-
-export async function addAllToMinisearch(
-  documents: IndexedDocument[]
-): Promise<void> {
-  await minisearchInstance.addAllAsync(documents)
-}
-
-export function addSingleToMinisearch(document: IndexedDocument): void {
-  minisearchInstance.add(document)
-}
-
-export function removeFromMinisearch(document: IndexedDocument): void {
-  minisearchInstance.remove(document)
-}
-
-// #endregion