WIP refactor to take advantage of minisearch 6.0

2022-11-25 08:28:50 +01:00
parent 2daa718bc9
commit dcef2d3719
10 changed files with 172 additions and 97 deletions
--- a/src/cache-manager.ts
+++ b/src/cache-manager.ts
@@ -6,6 +6,10 @@ import { minisearchOptions } from './search/search-engine'
 import { makeMD5 } from './tools/utils'

 class CacheManager {
+  /**
+   * @deprecated
+   * @private
+   */
  private liveDocuments: Map<string, IndexedDocument> = new Map()
  /**
   * Show an empty input field next time the user opens Omnisearch modal
@@ -39,6 +43,7 @@ class CacheManager {
  /**
   * Important: keep this method async for the day it _really_ becomes async.
   * This will avoid a refactor.
+   * @deprecated
   * @param path
   * @param note
   */
@@ -49,19 +54,22 @@ class CacheManager {
    this.liveDocuments.set(path, note)
  }

+  /**
+   * @deprecated
+   * @param key
+   */
  public deleteLiveDocument(key: string): void {
    this.liveDocuments.delete(key)
  }

+  /**
+   * @deprecated
+   * @param key
+   */
  public getLiveDocument(key: string): IndexedDocument | undefined {
    return this.liveDocuments.get(key)
  }

-  public isDocumentOutdated(file: TFile): boolean {
-    const indexedNote = this.getLiveDocument(file.path)
-    return !indexedNote || indexedNote.mtime !== file.stat.mtime
-  }
-
  //#region Minisearch

  public getDocumentsChecksum(documents: IndexedDocument[]): string {
@@ -82,7 +90,6 @@ class CacheManager {
  public async getMinisearchCache(): Promise<MiniSearch | null> {
    // Retrieve documents and make their checksum
    const cachedDocs = await database.documents.toArray()
-    const checksum = this.getDocumentsChecksum(cachedDocs.map(d => d.document))

    // Add those documents in the live cache
    cachedDocs.forEach(doc =>
@@ -91,13 +98,6 @@ class CacheManager {

    // Retrieve the search cache, and verify the checksum
    const cachedIndex = (await database.minisearch.toArray())[0]
-    if (cachedIndex?.checksum !== checksum) {
-      console.warn("Omnisearch - Cache - Checksums don't match, clearing cache")
-      // Invalid (or null) cache, clear everything
-      await database.minisearch.clear()
-      await database.documents.clear()
-      return null
-    }

    try {
      return MiniSearch.loadJS(cachedIndex.data, minisearchOptions)
@@ -116,7 +116,7 @@ class CacheManager {
   * @param documents
   */
  public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
-    toDelete: IndexedDocument[]
+    toDelete: string[]
    toAdd: IndexedDocument[]
    toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
  }> {
@@ -128,7 +128,7 @@ class CacheManager {
    // present in `cachedDocs` but not in `documents`
    const toDelete = cachedDocs
      .filter(c => !documents.find(d => d.path === c.path))
-      .map(d => d.document)
+      .map(d => d.path)

    // toUpdate: same path, but different mtime
    const toUpdate = cachedDocs
@@ -158,7 +158,7 @@ class CacheManager {

    // Delete
    // console.log(`Omnisearch - Cache - Will delete ${toDelete.length} documents`)
-    await database.documents.bulkDelete(toDelete.map(o => o.path))
+    await database.documents.bulkDelete(toDelete)

    // Add
    // console.log(`Omnisearch - Cache - Will add ${toAdd.length} documents`)
--- a/src/components/ModalVault.svelte
+++ b/src/components/ModalVault.svelte
@@ -6,7 +6,7 @@
  import { eventBus, IndexingStep, type ResultNote } from 'src/globals'
  import { createNote, openNote } from 'src/tools/notes'
  import { SearchEngine } from 'src/search/search-engine'
-  import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/tools/utils'
+  import { getCtrlKeyLabel, getExtension, isFilePDF, loopIndex } from 'src/tools/utils'
  import {
    OmnisearchInFileModal,
    type OmnisearchVaultModal,
@@ -189,7 +189,7 @@
  function switchToInFileModal(): void {
    // Do nothing if the selectedNote is a PDF,
    // or if there is 0 match (e.g indexing in progress)
-    if (selectedNote?.path.endsWith('.pdf') || !selectedNote?.matches.length) {
+    if (isFilePDF(selectedNote?.path) || !selectedNote?.matches.length) {
      return
    }

--- a/src/components/ResultItemVault.svelte
+++ b/src/components/ResultItemVault.svelte
@@ -21,6 +21,7 @@
      // @ts-ignore
      const file = app.vault.getFiles().find(f => f.path === note.path)
      if (file) {
+      // @ts-ignore
        imagePath = app.vault.getResourcePath(file)
      }
    }
--- a/src/database.ts
+++ b/src/database.ts
@@ -32,12 +32,26 @@ export class OmnisearchCache extends Dexie {
  //#region Table declarations

  documents!: Dexie.Table<
-    { path: string; mtime: number; document: IndexedDocument },
+    {
+      path: string
+      mtime: number
+      /**
+       * @deprecated
+       */
+      document: IndexedDocument
+    },
    string
  >
  searchHistory!: Dexie.Table<{ id?: number; query: string }, number>
  minisearch!: Dexie.Table<
-    { date: string; checksum: string; data: AsPlainObject },
+    {
+      date: string
+      /**
+       * @deprecated
+       */
+      checksum: string
+      data: AsPlainObject
+    },
    string
  >

--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -4,14 +4,13 @@ import {
  getAliasesFromMetadata,
  getTagsFromMetadata,
  isFileImage,
+  isFilePDF,
  isFilePlaintext,
  removeDiacritics,
 } from './tools/utils'
-import * as NotesIndex from './notes-index'
 import type { TFile } from 'obsidian'
 import type { IndexedDocument } from './globals'
-import { getNonExistingNotes } from './tools/notes'
-import { getPdfText, getImageText } from 'obsidian-text-extract'
+import { getImageText, getPdfText } from 'obsidian-text-extract'

 /**
 * Return all plaintext files as IndexedDocuments
@@ -20,9 +19,9 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
  const allFiles = app.vault.getFiles().filter(f => isFilePlaintext(f.path))
  const data: IndexedDocument[] = []
  for (const file of allFiles) {
-    const doc = await fileToIndexedDocument(file)
+    const doc = await getIndexedDocument(file.path)
    data.push(doc)
-    await cacheManager.updateLiveDocument(file.path, doc)
+    // await cacheManager.updateLiveDocument(file.path, doc)
  }
  return data
 }
@@ -31,7 +30,7 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
 * Return all PDFs as IndexedDocuments.
 */
 export async function getPDFAsDocuments(): Promise<IndexedDocument[]> {
-  const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
+  const files = app.vault.getFiles().filter(f => isFilePDF(f.path))
  return await getBinaryFiles(files)
 }

@@ -49,8 +48,8 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
  for (const file of files) {
    input.push(
      new Promise(async (resolve, reject) => {
-        const doc = await fileToIndexedDocument(file)
-        await cacheManager.updateLiveDocument(file.path, doc)
+        const doc = await getIndexedDocument(file.path)
+        // await cacheManager.updateLiveDocument(file.path, doc)
        data.push(doc)
        return resolve(null)
      })
@@ -60,37 +59,34 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
  return data
 }

-/**
- * Convert a file into an IndexedDocument.
- * Will use the cache if possible.
- */
-export async function fileToIndexedDocument(
-  file: TFile
+export async function getIndexedDocument(
+  path: string
 ): Promise<IndexedDocument> {
+  const file = app.vault.getFiles().find(f => f.path === path)
+  if (!file) throw new Error(`Invalid file path: "${path}"`)
  let content: string
-  if (isFilePlaintext(file.path)) {
+  if (isFilePlaintext(path)) {
    content = await app.vault.cachedRead(file)
-  } else if (file.path.endsWith('.pdf')) {
+  } else if (isFilePDF(path)) {
    content = await getPdfText(file)
  } else if (isFileImage(file.path)) {
    content = await getImageText(file)
  } else {
-    throw new Error('Invalid file: ' + file.path)
+    throw new Error('Invalid file format: ' + file.path)
  }
-
  content = removeDiacritics(content)
  const metadata = app.metadataCache.getFileCache(file)

  // Look for links that lead to non-existing files,
  // and add them to the index.
  if (metadata) {
-    // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
-    const nonExisting = getNonExistingNotes(file, metadata)
-    for (const name of nonExisting.filter(
-      o => !cacheManager.getLiveDocument(o)
-    )) {
-      NotesIndex.addNonExistingToIndex(name, file.path)
-    }
+    // // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
+    // const nonExisting = getNonExistingNotes(file, metadata)
+    // for (const name of nonExisting.filter(
+    //   o => !cacheManager.getLiveDocument(o)
+    // )) {
+    //   NotesIndex.addNonExistingToIndex(name, file.path)
+    // }

    // EXCALIDRAW
    // Remove the json code
@@ -117,3 +113,61 @@ export async function fileToIndexedDocument(
    headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
  }
 }
+
+/**
+ * Convert a file into an IndexedDocument.
+ * Will use the cache if possible.
+ */
+// async function fileToIndexedDocument(
+//   file: TFile
+// ): Promise<IndexedDocument> {
+//   let content: string
+//   if (isFilePlaintext(file.path)) {
+//     content = await app.vault.cachedRead(file)
+//   } else if (isFilePDF(file.path)) {
+//     content = await getPdfText(file)
+//   } else if (isFileImage(file.path)) {
+//     content = await getImageText(file)
+//   } else {
+//     throw new Error('Invalid file: ' + file.path)
+//   }
+//
+//   content = removeDiacritics(content)
+//   const metadata = app.metadataCache.getFileCache(file)
+//
+//   // Look for links that lead to non-existing files,
+//   // and add them to the index.
+//   if (metadata) {
+//     // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
+//     const nonExisting = getNonExistingNotes(file, metadata)
+//     for (const name of nonExisting.filter(
+//       o => !cacheManager.getLiveDocument(o)
+//     )) {
+//       NotesIndex.addNonExistingToIndex(name, file.path)
+//     }
+//
+//     // EXCALIDRAW
+//     // Remove the json code
+//     if (metadata.frontmatter?.['excalidraw-plugin']) {
+//       const comments =
+//         metadata.sections?.filter(s => s.type === 'comment') ?? []
+//       for (const { start, end } of comments.map(c => c.position)) {
+//         content =
+//           content.substring(0, start.offset - 1) + content.substring(end.offset)
+//       }
+//     }
+//   }
+//
+//   return {
+//     basename: removeDiacritics(file.basename),
+//     content,
+//     path: file.path,
+//     mtime: file.stat.mtime,
+//
+//     tags: getTagsFromMetadata(metadata),
+//     aliases: getAliasesFromMetadata(metadata).join(''),
+//     headings1: metadata ? extractHeadingsFromCache(metadata, 1).join(' ') : '',
+//     headings2: metadata ? extractHeadingsFromCache(metadata, 2).join(' ') : '',
+//     headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
+//   }
+// }
--- a/src/globals.ts
+++ b/src/globals.ts
@@ -37,6 +37,7 @@ export type IndexedDocument = {
  headings2: string
  headings3: string

+  // TODO: reimplement this
  doesNotExist?: boolean
  parent?: string
 }
--- a/src/main.ts
+++ b/src/main.ts
@@ -8,10 +8,11 @@ import { loadSettings, settings, SettingsTab, showExcerpt } from './settings'
 import { eventBus, EventNames, IndexingStep } from './globals'
 import api from './tools/api'
 import { isFilePlaintext, wait } from './tools/utils'
-import * as NotesIndex from './notes-index'
 import * as FileLoader from './file-loader'
 import { OmnisearchCache } from './database'
 import { cacheManager } from './cache-manager'
+import * as NotesIndex from './notes-index'
+import { addToIndexAndMemCache } from "./notes-index";

 export default class OmnisearchPlugin extends Plugin {
  private ribbonButton?: HTMLElement
@@ -186,21 +187,15 @@ async function populateIndex(): Promise<void> {

    // Add
    await engine.addAllToMinisearch(diffDocs.toAdd)
-    diffDocs.toAdd.forEach(doc =>
-      cacheManager.updateLiveDocument(doc.path, doc)
-    )

    // Delete
-    for (const [i, doc] of diffDocs.toDelete.entries()) {
-      await wait(0)
-      engine.removeFromMinisearch(doc)
-      cacheManager.deleteLiveDocument(doc.path)
+    for (const pathToDel of diffDocs.toDelete) {
+      NotesIndex.removeFromIndex(pathToDel)
    }

    // Update (delete + add)
    diffDocs.toUpdate.forEach(({ oldDoc, newDoc }) => {
-      engine.removeFromMinisearch(oldDoc)
-      cacheManager.updateLiveDocument(oldDoc.path, newDoc)
+      NotesIndex.removeFromIndex(oldDoc.path)
    })
    await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
  }
--- a/src/notes-index.ts
+++ b/src/notes-index.ts
@@ -4,7 +4,9 @@ import { removeAnchors } from './tools/notes'
 import { SearchEngine } from './search/search-engine'
 import { cacheManager } from './cache-manager'
 import type { IndexedDocument } from './globals'
-import { fileToIndexedDocument } from './file-loader'
+import { getIndexedDocument } from "./file-loader";
+
+const indexedList: Set<string> = new Set()

 /**
 * Adds a file to the search index
@@ -18,21 +20,14 @@ export async function addToIndexAndMemCache(
    return
  }

-  // Check if the file was already indexed as non-existent.
-  // If so, remove it from the index, and add it again as a real note.
-  if (cacheManager.getLiveDocument(file.path)?.doesNotExist) {
-    removeFromIndex(file.path)
-  }
-
  try {
-    if (cacheManager.getLiveDocument(file.path)) {
+    if (indexedList.has(file.path)) {
      throw new Error(`${file.basename} is already indexed`)
    }

    // Make the document and index it
-    const note = await fileToIndexedDocument(file)
-    SearchEngine.getEngine().addSingleToMinisearch(note)
-    await cacheManager.updateLiveDocument(note.path, note)
+    SearchEngine.getEngine().addSingleToMinisearch(file.path)
+    indexedList.add(file.path)
  } catch (e) {
    // console.trace('Error while indexing ' + file.basename)
    console.error(e)
@@ -65,8 +60,7 @@ export function addNonExistingToIndex(name: string, parent: string): void {
    doesNotExist: true,
    parent,
  }
-  SearchEngine.getEngine().addSingleToMinisearch(note)
-  cacheManager.updateLiveDocument(filename, note)
+  SearchEngine.getEngine().addSingleToMinisearch(note.path)
 }

 /**
@@ -77,10 +71,8 @@ export function removeFromIndex(path: string): void {
    console.info(`"${path}" is not an indexable file`)
    return
  }
-  const note = cacheManager.getLiveDocument(path)
-  if (note) {
-    SearchEngine.getEngine().removeFromMinisearch(note)
-    cacheManager.deleteLiveDocument(path)
+  if (indexedList.has(path)) {
+    SearchEngine.getEngine().removeFromMinisearch(path)

    // FIXME: only remove non-existing notes if they don't have another parent
    // cacheManager
--- a/src/search/search-engine.ts
+++ b/src/search/search-engine.ts
@@ -2,10 +2,10 @@ import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import {
  chsRegex,
  type IndexedDocument,
+  IndexingStep,
  type ResultNote,
  type SearchMatch,
  SPACE_OR_PUNCTUATION,
-  IndexingStep,
 } from '../globals'
 import {
  removeDiacritics,
@@ -17,8 +17,9 @@ import { settings } from '../settings'
 import { cacheManager } from '../cache-manager'
 import { writable } from 'svelte/store'
 import { Notice } from 'obsidian'
+import { getIndexedDocument } from '../file-loader'

-let previousResults: ResultNote[] = []
+let previousResults: SearchResult[] = []

 const tokenize = (text: string): string[] => {
  const tokens = text.split(SPACE_OR_PUNCTUATION)
@@ -102,7 +103,10 @@ export class SearchEngine {
    query: Query,
    options: { prefixLength: number }
  ): Promise<SearchResult[]> {
-    if (!query.segmentsToStr()) return []
+    if (query.isEmpty()) {
+      previousResults = []
+      return []
+    }

    let results = this.minisearch.search(query.segmentsToStr(), {
      prefix: term => term.length >= options.prefixLength,
@@ -116,6 +120,7 @@ export class SearchEngine {
        headings3: settings.weightH3,
      },
    })
+    if (!results.length) return previousResults

    // Downrank files that are in Obsidian's excluded list
    if (settings.respectExcluded) {
@@ -124,19 +129,23 @@ export class SearchEngine {
          app.metadataCache.isUserIgnored &&
          app.metadataCache.isUserIgnored(result.id)
        ) {
-          result.score /= 10 // TODO: make this value configurable or toggleable?
+          result.score /= 10
        }
      })
    }

+    const documents = await Promise.all(
+      results.map(async result => await getIndexedDocument(result.id))
+    )
+
    // If the search query contains quotes, filter out results that don't have the exact match
    const exactTerms = query.getExactTerms()
    if (exactTerms.length) {
      results = results.filter(r => {
-        const title =
-          cacheManager.getLiveDocument(r.id)?.path.toLowerCase() ?? ''
+        const document = documents.find(d => d.path === r.id)
+        const title = document?.path.toLowerCase() ?? ''
        const content = stripMarkdownCharacters(
-          cacheManager.getLiveDocument(r.id)?.content ?? ''
+          document?.content ?? ''
        ).toLowerCase()
        return exactTerms.every(q => content.includes(q) || title.includes(q))
      })
@@ -147,16 +156,22 @@ export class SearchEngine {
    if (exclusions.length) {
      results = results.filter(r => {
        const content = stripMarkdownCharacters(
-          cacheManager.getLiveDocument(r.id)?.content ?? ''
+          documents.find(d => d.path === r.id)?.content ?? ''
        ).toLowerCase()
        return exclusions.every(q => !content.includes(q.value))
      })
    }
    // FIXME:
    // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
-    return results.filter(
+    results = results
+      .filter(
        (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
      )
+      .slice(0, 50)
+
+    previousResults = results
+
+    return results
  }

  /**
@@ -196,10 +211,6 @@ export class SearchEngine {
    query: Query,
    options?: Partial<{ singleFilePath: string | null }>
  ): Promise<ResultNote[]> {
-    if (query.isEmpty()) {
-      previousResults = []
-      return []
-    }
    // Get the raw results
    let results: SearchResult[]
    if (settings.simpleSearch) {
@@ -207,7 +218,6 @@ export class SearchEngine {
    } else {
      results = await this.search(query, { prefixLength: 3 })
    }
-    if (!results.length) return previousResults

    // Extract tags from the query
    const tags = query.segments
@@ -233,9 +243,14 @@ export class SearchEngine {
      }
    }

+    // TODO: this already called in search(), pass each document in its SearchResult instead?
+    const documents = await Promise.all(
+      results.map(async result => await getIndexedDocument(result.id))
+    )
+
    // Map the raw results to get usable suggestions
    const resultNotes = results.map(result => {
-      let note = cacheManager.getLiveDocument(result.id)
+      let note = documents.find(d => d.path === result.id)
      if (!note) {
        // throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
        console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
@@ -278,7 +293,6 @@ export class SearchEngine {
      }
      return resultNote
    })
-    previousResults = resultNotes
    return resultNotes
  }

@@ -291,12 +305,12 @@ export class SearchEngine {
    await this.minisearch.addAllAsync(documents, { chunkSize })
  }

-  public addSingleToMinisearch(document: IndexedDocument): void {
-    this.minisearch.add(document)
+  public addSingleToMinisearch(path: string): void {
+    getIndexedDocument(path).then(doc => this.minisearch.add(doc))
  }

-  public removeFromMinisearch(document: IndexedDocument): void {
-    this.minisearch.remove(document)
+  public removeFromMinisearch(path: string): void {
+    this.minisearch.discard(path)
  }

  // #endregion
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -1,9 +1,9 @@
 import {
  type CachedMetadata,
-  Notice,
-  Platform,
  getAllTags,
+  Notice,
  parseFrontMatterAliases,
+  Platform,
 } from 'obsidian'
 import type { SearchMatch } from '../globals'
 import {
@@ -207,8 +207,8 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {

 export function isFileIndexable(path: string): boolean {
  return (
-    (settings.PDFIndexing && path.endsWith('.pdf')) ||
    isFilePlaintext(path) ||
+    (settings.PDFIndexing && isFilePDF(path)) ||
    (settings.imagesIndexing && isFileImage(path))
  )
 }
@@ -219,6 +219,10 @@ export function isFileImage(path: string): boolean {
  )
 }

+export function isFilePDF(path: string): boolean {
+  return path.endsWith('.pdf')
+}
+
 export function isFilePlaintext(path: string): boolean {
  return getPlaintextExtensions().some(t => path.endsWith(`.${t}`))
 }