From dcef2d3719cf416cfad4777fc179b42e8d0ba401 Mon Sep 17 00:00:00 2001
From: Simon Cambier <simon.cambier@protonmail.com>
Date: Fri, 25 Nov 2022 08:28:50 +0100
Subject: [PATCH] WIP refactor to take advantage of minisearch 6.0

---
 src/cache-manager.ts                  |  32 ++++----
 src/components/ModalVault.svelte      |   4 +-
 src/components/ResultItemVault.svelte |   1 +
 src/database.ts                       |  18 ++++-
 src/file-loader.ts                    | 104 +++++++++++++++++++-------
 src/globals.ts                        |   1 +
 src/main.ts                           |  15 ++--
 src/notes-index.ts                    |  26 +++----
 src/search/search-engine.ts           |  58 ++++++++------
 src/tools/utils.ts                    |  10 ++-
 10 files changed, 172 insertions(+), 97 deletions(-)
diff --git a/src/cache-manager.ts b/src/cache-manager.ts
index a71eac1..8f6fdb4 100644
--- a/src/cache-manager.ts
+++ b/src/cache-manager.ts
@@ -6,6 +6,10 @@ import { minisearchOptions } from './search/search-engine'
 import { makeMD5 } from './tools/utils'
 
 class CacheManager {
+  /**
+   * @deprecated
+   * @private
+   */
   private liveDocuments: Map<string, IndexedDocument> = new Map()
   /**
    * Show an empty input field next time the user opens Omnisearch modal
@@ -39,6 +43,7 @@ class CacheManager {
   /**
    * Important: keep this method async for the day it _really_ becomes async.
    * This will avoid a refactor.
+   * @deprecated
    * @param path
    * @param note
    */
@@ -49,19 +54,22 @@ class CacheManager {
     this.liveDocuments.set(path, note)
   }
 
+  /**
+   * @deprecated
+   * @param key
+   */
   public deleteLiveDocument(key: string): void {
     this.liveDocuments.delete(key)
   }
 
+  /**
+   * @deprecated
+   * @param key
+   */
   public getLiveDocument(key: string): IndexedDocument | undefined {
     return this.liveDocuments.get(key)
   }
 
-  public isDocumentOutdated(file: TFile): boolean {
-    const indexedNote = this.getLiveDocument(file.path)
-    return !indexedNote || indexedNote.mtime !== file.stat.mtime
-  }
-
   //#region Minisearch
 
   public getDocumentsChecksum(documents: IndexedDocument[]): string {
@@ -82,7 +90,6 @@ class CacheManager {
   public async getMinisearchCache(): Promise<MiniSearch | null> {
     // Retrieve documents and make their checksum
     const cachedDocs = await database.documents.toArray()
-    const checksum = this.getDocumentsChecksum(cachedDocs.map(d => d.document))
 
     // Add those documents in the live cache
     cachedDocs.forEach(doc =>
@@ -91,13 +98,6 @@ class CacheManager {
 
     // Retrieve the search cache, and verify the checksum
     const cachedIndex = (await database.minisearch.toArray())[0]
-    if (cachedIndex?.checksum !== checksum) {
-      console.warn("Omnisearch - Cache - Checksums don't match, clearing cache")
-      // Invalid (or null) cache, clear everything
-      await database.minisearch.clear()
-      await database.documents.clear()
-      return null
-    }
 
     try {
       return MiniSearch.loadJS(cachedIndex.data, minisearchOptions)
@@ -116,7 +116,7 @@ class CacheManager {
    * @param documents
    */
   public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
-    toDelete: IndexedDocument[]
+    toDelete: string[]
     toAdd: IndexedDocument[]
     toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
   }> {
@@ -128,7 +128,7 @@ class CacheManager {
     // present in `cachedDocs` but not in `documents`
     const toDelete = cachedDocs
       .filter(c => !documents.find(d => d.path === c.path))
-      .map(d => d.document)
+      .map(d => d.path)
 
     // toUpdate: same path, but different mtime
     const toUpdate = cachedDocs
@@ -158,7 +158,7 @@ class CacheManager {
 
     // Delete
     // console.log(`Omnisearch - Cache - Will delete ${toDelete.length} documents`)
-    await database.documents.bulkDelete(toDelete.map(o => o.path))
+    await database.documents.bulkDelete(toDelete)
 
     // Add
     // console.log(`Omnisearch - Cache - Will add ${toAdd.length} documents`)
diff --git a/src/components/ModalVault.svelte b/src/components/ModalVault.svelte
index 1805204..dfe2ce1 100644
--- a/src/components/ModalVault.svelte
+++ b/src/components/ModalVault.svelte
@@ -6,7 +6,7 @@
   import { eventBus, IndexingStep, type ResultNote } from 'src/globals'
   import { createNote, openNote } from 'src/tools/notes'
   import { SearchEngine } from 'src/search/search-engine'
-  import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/tools/utils'
+  import { getCtrlKeyLabel, getExtension, isFilePDF, loopIndex } from 'src/tools/utils'
   import {
     OmnisearchInFileModal,
     type OmnisearchVaultModal,
@@ -189,7 +189,7 @@
   function switchToInFileModal(): void {
     // Do nothing if the selectedNote is a PDF,
     // or if there is 0 match (e.g indexing in progress)
-    if (selectedNote?.path.endsWith('.pdf') || !selectedNote?.matches.length) {
+    if (isFilePDF(selectedNote?.path) || !selectedNote?.matches.length) {
       return
     }
 
diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte
index 75bf665..d6b3708 100644
--- a/src/components/ResultItemVault.svelte
+++ b/src/components/ResultItemVault.svelte
@@ -21,6 +21,7 @@
       // @ts-ignore
       const file = app.vault.getFiles().find(f => f.path === note.path)
       if (file) {
+      // @ts-ignore
         imagePath = app.vault.getResourcePath(file)
       }
     }
diff --git a/src/database.ts b/src/database.ts
index eb94ea4..635a322 100644
--- a/src/database.ts
+++ b/src/database.ts
@@ -32,12 +32,26 @@ export class OmnisearchCache extends Dexie {
   //#region Table declarations
 
   documents!: Dexie.Table<
-    { path: string; mtime: number; document: IndexedDocument },
+    {
+      path: string
+      mtime: number
+      /**
+       * @deprecated
+       */
+      document: IndexedDocument
+    },
     string
   >
   searchHistory!: Dexie.Table<{ id?: number; query: string }, number>
   minisearch!: Dexie.Table<
-    { date: string; checksum: string; data: AsPlainObject },
+    {
+      date: string
+      /**
+       * @deprecated
+       */
+      checksum: string
+      data: AsPlainObject
+    },
     string
   >
 
diff --git a/src/file-loader.ts b/src/file-loader.ts
index f8cb0f3..fa03097 100644
--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -4,14 +4,13 @@ import {
   getAliasesFromMetadata,
   getTagsFromMetadata,
   isFileImage,
+  isFilePDF,
   isFilePlaintext,
   removeDiacritics,
 } from './tools/utils'
-import * as NotesIndex from './notes-index'
 import type { TFile } from 'obsidian'
 import type { IndexedDocument } from './globals'
-import { getNonExistingNotes } from './tools/notes'
-import { getPdfText, getImageText } from 'obsidian-text-extract'
+import { getImageText, getPdfText } from 'obsidian-text-extract'
 
 /**
  * Return all plaintext files as IndexedDocuments
@@ -20,9 +19,9 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
   const allFiles = app.vault.getFiles().filter(f => isFilePlaintext(f.path))
   const data: IndexedDocument[] = []
   for (const file of allFiles) {
-    const doc = await fileToIndexedDocument(file)
+    const doc = await getIndexedDocument(file.path)
     data.push(doc)
-    await cacheManager.updateLiveDocument(file.path, doc)
+    // await cacheManager.updateLiveDocument(file.path, doc)
   }
   return data
 }
@@ -31,7 +30,7 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
  * Return all PDFs as IndexedDocuments.
  */
 export async function getPDFAsDocuments(): Promise<IndexedDocument[]> {
-  const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
+  const files = app.vault.getFiles().filter(f => isFilePDF(f.path))
   return await getBinaryFiles(files)
 }
 
@@ -49,8 +48,8 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
   for (const file of files) {
     input.push(
       new Promise(async (resolve, reject) => {
-        const doc = await fileToIndexedDocument(file)
-        await cacheManager.updateLiveDocument(file.path, doc)
+        const doc = await getIndexedDocument(file.path)
+        // await cacheManager.updateLiveDocument(file.path, doc)
         data.push(doc)
         return resolve(null)
       })
@@ -60,37 +59,34 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
   return data
 }
 
-/**
- * Convert a file into an IndexedDocument.
- * Will use the cache if possible.
- */
-export async function fileToIndexedDocument(
-  file: TFile
+export async function getIndexedDocument(
+  path: string
 ): Promise<IndexedDocument> {
+  const file = app.vault.getFiles().find(f => f.path === path)
+  if (!file) throw new Error(`Invalid file path: "${path}"`)
   let content: string
-  if (isFilePlaintext(file.path)) {
+  if (isFilePlaintext(path)) {
     content = await app.vault.cachedRead(file)
-  } else if (file.path.endsWith('.pdf')) {
+  } else if (isFilePDF(path)) {
     content = await getPdfText(file)
   } else if (isFileImage(file.path)) {
     content = await getImageText(file)
   } else {
-    throw new Error('Invalid file: ' + file.path)
+    throw new Error('Invalid file format: ' + file.path)
   }
-
   content = removeDiacritics(content)
   const metadata = app.metadataCache.getFileCache(file)
 
   // Look for links that lead to non-existing files,
   // and add them to the index.
   if (metadata) {
-    // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
-    const nonExisting = getNonExistingNotes(file, metadata)
-    for (const name of nonExisting.filter(
-      o => !cacheManager.getLiveDocument(o)
-    )) {
-      NotesIndex.addNonExistingToIndex(name, file.path)
-    }
+    // // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
+    // const nonExisting = getNonExistingNotes(file, metadata)
+    // for (const name of nonExisting.filter(
+    //   o => !cacheManager.getLiveDocument(o)
+    // )) {
+    //   NotesIndex.addNonExistingToIndex(name, file.path)
+    // }
 
     // EXCALIDRAW
     // Remove the json code
@@ -117,3 +113,61 @@ export async function fileToIndexedDocument(
     headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
   }
 }
+
+/**
+ * Convert a file into an IndexedDocument.
+ * Will use the cache if possible.
+ */
+// async function fileToIndexedDocument(
+//   file: TFile
+// ): Promise<IndexedDocument> {
+//   let content: string
+//   if (isFilePlaintext(file.path)) {
+//     content = await app.vault.cachedRead(file)
+//   } else if (isFilePDF(file.path)) {
+//     content = await getPdfText(file)
+//   } else if (isFileImage(file.path)) {
+//     content = await getImageText(file)
+//   } else {
+//     throw new Error('Invalid file: ' + file.path)
+//   }
+//
+//   content = removeDiacritics(content)
+//   const metadata = app.metadataCache.getFileCache(file)
+//
+//   // Look for links that lead to non-existing files,
+//   // and add them to the index.
+//   if (metadata) {
+//     // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
+//     const nonExisting = getNonExistingNotes(file, metadata)
+//     for (const name of nonExisting.filter(
+//       o => !cacheManager.getLiveDocument(o)
+//     )) {
+//       NotesIndex.addNonExistingToIndex(name, file.path)
+//     }
+//
+//     // EXCALIDRAW
+//     // Remove the json code
+//     if (metadata.frontmatter?.['excalidraw-plugin']) {
+//       const comments =
+//         metadata.sections?.filter(s => s.type === 'comment') ?? []
+//       for (const { start, end } of comments.map(c => c.position)) {
+//         content =
+//           content.substring(0, start.offset - 1) + content.substring(end.offset)
+//       }
+//     }
+//   }
+//
+//   return {
+//     basename: removeDiacritics(file.basename),
+//     content,
+//     path: file.path,
+//     mtime: file.stat.mtime,
+//
+//     tags: getTagsFromMetadata(metadata),
+//     aliases: getAliasesFromMetadata(metadata).join(''),
+//     headings1: metadata ? extractHeadingsFromCache(metadata, 1).join(' ') : '',
+//     headings2: metadata ? extractHeadingsFromCache(metadata, 2).join(' ') : '',
+//     headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
+//   }
+// }
diff --git a/src/globals.ts b/src/globals.ts
index 8bee954..64562d7 100644
--- a/src/globals.ts
+++ b/src/globals.ts
@@ -37,6 +37,7 @@ export type IndexedDocument = {
   headings2: string
   headings3: string
 
+  // TODO: reimplement this
   doesNotExist?: boolean
   parent?: string
 }
diff --git a/src/main.ts b/src/main.ts
index 7e1e405..113f6f2 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -8,10 +8,11 @@ import { loadSettings, settings, SettingsTab, showExcerpt } from './settings'
 import { eventBus, EventNames, IndexingStep } from './globals'
 import api from './tools/api'
 import { isFilePlaintext, wait } from './tools/utils'
-import * as NotesIndex from './notes-index'
 import * as FileLoader from './file-loader'
 import { OmnisearchCache } from './database'
 import { cacheManager } from './cache-manager'
+import * as NotesIndex from './notes-index'
+import { addToIndexAndMemCache } from "./notes-index";
 
 export default class OmnisearchPlugin extends Plugin {
   private ribbonButton?: HTMLElement
@@ -186,21 +187,15 @@ async function populateIndex(): Promise<void> {
 
     // Add
     await engine.addAllToMinisearch(diffDocs.toAdd)
-    diffDocs.toAdd.forEach(doc =>
-      cacheManager.updateLiveDocument(doc.path, doc)
-    )
 
     // Delete
-    for (const [i, doc] of diffDocs.toDelete.entries()) {
-      await wait(0)
-      engine.removeFromMinisearch(doc)
-      cacheManager.deleteLiveDocument(doc.path)
+    for (const pathToDel of diffDocs.toDelete) {
+      NotesIndex.removeFromIndex(pathToDel)
     }
 
     // Update (delete + add)
     diffDocs.toUpdate.forEach(({ oldDoc, newDoc }) => {
-      engine.removeFromMinisearch(oldDoc)
-      cacheManager.updateLiveDocument(oldDoc.path, newDoc)
+      NotesIndex.removeFromIndex(oldDoc.path)
     })
     await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
   }
diff --git a/src/notes-index.ts b/src/notes-index.ts
index 4b022f8..4c2ecd7 100644
--- a/src/notes-index.ts
+++ b/src/notes-index.ts
@@ -4,7 +4,9 @@ import { removeAnchors } from './tools/notes'
 import { SearchEngine } from './search/search-engine'
 import { cacheManager } from './cache-manager'
 import type { IndexedDocument } from './globals'
-import { fileToIndexedDocument } from './file-loader'
+import { getIndexedDocument } from "./file-loader";
+
+const indexedList: Set<string> = new Set()
 
 /**
  * Adds a file to the search index
@@ -18,21 +20,14 @@ export async function addToIndexAndMemCache(
     return
   }
 
-  // Check if the file was already indexed as non-existent.
-  // If so, remove it from the index, and add it again as a real note.
-  if (cacheManager.getLiveDocument(file.path)?.doesNotExist) {
-    removeFromIndex(file.path)
-  }
-
   try {
-    if (cacheManager.getLiveDocument(file.path)) {
+    if (indexedList.has(file.path)) {
       throw new Error(`${file.basename} is already indexed`)
     }
 
     // Make the document and index it
-    const note = await fileToIndexedDocument(file)
-    SearchEngine.getEngine().addSingleToMinisearch(note)
-    await cacheManager.updateLiveDocument(note.path, note)
+    SearchEngine.getEngine().addSingleToMinisearch(file.path)
+    indexedList.add(file.path)
   } catch (e) {
     // console.trace('Error while indexing ' + file.basename)
     console.error(e)
@@ -65,8 +60,7 @@ export function addNonExistingToIndex(name: string, parent: string): void {
     doesNotExist: true,
     parent,
   }
-  SearchEngine.getEngine().addSingleToMinisearch(note)
-  cacheManager.updateLiveDocument(filename, note)
+  SearchEngine.getEngine().addSingleToMinisearch(note.path)
 }
 
 /**
@@ -77,10 +71,8 @@ export function removeFromIndex(path: string): void {
     console.info(`"${path}" is not an indexable file`)
     return
   }
-  const note = cacheManager.getLiveDocument(path)
-  if (note) {
-    SearchEngine.getEngine().removeFromMinisearch(note)
-    cacheManager.deleteLiveDocument(path)
+  if (indexedList.has(path)) {
+    SearchEngine.getEngine().removeFromMinisearch(path)
 
     // FIXME: only remove non-existing notes if they don't have another parent
     // cacheManager
diff --git a/src/search/search-engine.ts b/src/search/search-engine.ts
index 324197e..c35688b 100644
--- a/src/search/search-engine.ts
+++ b/src/search/search-engine.ts
@@ -2,10 +2,10 @@ import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import {
   chsRegex,
   type IndexedDocument,
+  IndexingStep,
   type ResultNote,
   type SearchMatch,
   SPACE_OR_PUNCTUATION,
-  IndexingStep,
 } from '../globals'
 import {
   removeDiacritics,
@@ -17,8 +17,9 @@ import { settings } from '../settings'
 import { cacheManager } from '../cache-manager'
 import { writable } from 'svelte/store'
 import { Notice } from 'obsidian'
+import { getIndexedDocument } from '../file-loader'
 
-let previousResults: ResultNote[] = []
+let previousResults: SearchResult[] = []
 
 const tokenize = (text: string): string[] => {
   const tokens = text.split(SPACE_OR_PUNCTUATION)
@@ -102,7 +103,10 @@ export class SearchEngine {
     query: Query,
     options: { prefixLength: number }
   ): Promise<SearchResult[]> {
-    if (!query.segmentsToStr()) return []
+    if (query.isEmpty()) {
+      previousResults = []
+      return []
+    }
 
     let results = this.minisearch.search(query.segmentsToStr(), {
       prefix: term => term.length >= options.prefixLength,
@@ -116,6 +120,7 @@ export class SearchEngine {
         headings3: settings.weightH3,
       },
     })
+    if (!results.length) return previousResults
 
     // Downrank files that are in Obsidian's excluded list
     if (settings.respectExcluded) {
@@ -124,19 +129,23 @@ export class SearchEngine {
           app.metadataCache.isUserIgnored &&
           app.metadataCache.isUserIgnored(result.id)
         ) {
-          result.score /= 10 // TODO: make this value configurable or toggleable?
+          result.score /= 10
         }
       })
     }
 
+    const documents = await Promise.all(
+      results.map(async result => await getIndexedDocument(result.id))
+    )
+
     // If the search query contains quotes, filter out results that don't have the exact match
     const exactTerms = query.getExactTerms()
     if (exactTerms.length) {
       results = results.filter(r => {
-        const title =
-          cacheManager.getLiveDocument(r.id)?.path.toLowerCase() ?? ''
+        const document = documents.find(d => d.path === r.id)
+        const title = document?.path.toLowerCase() ?? ''
         const content = stripMarkdownCharacters(
-          cacheManager.getLiveDocument(r.id)?.content ?? ''
+          document?.content ?? ''
         ).toLowerCase()
         return exactTerms.every(q => content.includes(q) || title.includes(q))
       })
@@ -147,16 +156,22 @@ export class SearchEngine {
     if (exclusions.length) {
       results = results.filter(r => {
         const content = stripMarkdownCharacters(
-          cacheManager.getLiveDocument(r.id)?.content ?? ''
+          documents.find(d => d.path === r.id)?.content ?? ''
         ).toLowerCase()
         return exclusions.every(q => !content.includes(q.value))
       })
     }
     // FIXME:
     // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
-    return results.filter(
-      (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
-    )
+    results = results
+      .filter(
+        (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
+      )
+      .slice(0, 50)
+
+    previousResults = results
+
+    return results
   }
 
   /**
@@ -196,10 +211,6 @@ export class SearchEngine {
     query: Query,
     options?: Partial<{ singleFilePath: string | null }>
   ): Promise<ResultNote[]> {
-    if (query.isEmpty()) {
-      previousResults = []
-      return []
-    }
     // Get the raw results
     let results: SearchResult[]
     if (settings.simpleSearch) {
@@ -207,7 +218,6 @@ export class SearchEngine {
     } else {
       results = await this.search(query, { prefixLength: 3 })
     }
-    if (!results.length) return previousResults
 
     // Extract tags from the query
     const tags = query.segments
@@ -233,9 +243,14 @@ export class SearchEngine {
       }
     }
 
+    // TODO: this already called in search(), pass each document in its SearchResult instead?
+    const documents = await Promise.all(
+      results.map(async result => await getIndexedDocument(result.id))
+    )
+
     // Map the raw results to get usable suggestions
     const resultNotes = results.map(result => {
-      let note = cacheManager.getLiveDocument(result.id)
+      let note = documents.find(d => d.path === result.id)
       if (!note) {
         // throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
         console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
@@ -278,7 +293,6 @@ export class SearchEngine {
       }
       return resultNote
     })
-    previousResults = resultNotes
     return resultNotes
   }
 
@@ -291,12 +305,12 @@ export class SearchEngine {
     await this.minisearch.addAllAsync(documents, { chunkSize })
   }
 
-  public addSingleToMinisearch(document: IndexedDocument): void {
-    this.minisearch.add(document)
+  public addSingleToMinisearch(path: string): void {
+    getIndexedDocument(path).then(doc => this.minisearch.add(doc))
   }
 
-  public removeFromMinisearch(document: IndexedDocument): void {
-    this.minisearch.remove(document)
+  public removeFromMinisearch(path: string): void {
+    this.minisearch.discard(path)
   }
 
   // #endregion
diff --git a/src/tools/utils.ts b/src/tools/utils.ts
index 1339d3e..81dec10 100644
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -1,9 +1,9 @@
 import {
   type CachedMetadata,
-  Notice,
-  Platform,
   getAllTags,
+  Notice,
   parseFrontMatterAliases,
+  Platform,
 } from 'obsidian'
 import type { SearchMatch } from '../globals'
 import {
@@ -207,8 +207,8 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
 
 export function isFileIndexable(path: string): boolean {
   return (
-    (settings.PDFIndexing && path.endsWith('.pdf')) ||
     isFilePlaintext(path) ||
+    (settings.PDFIndexing && isFilePDF(path)) ||
     (settings.imagesIndexing && isFileImage(path))
   )
 }
@@ -219,6 +219,10 @@ export function isFileImage(path: string): boolean {
   )
 }
 
+export function isFilePDF(path: string): boolean {
+  return path.endsWith('.pdf')
+}
+
 export function isFilePlaintext(path: string): boolean {
   return getPlaintextExtensions().some(t => path.endsWith(`.${t}`))
 }