The refactor splitting out the text extractor is done.

This commit is contained in:
Simon Cambier
2022-11-05 23:18:28 +01:00
parent 74db2844a9
commit e6c113d83b
8 changed files with 818 additions and 617 deletions

View File

@@ -3,7 +3,7 @@ import type { IndexedDocument } from './globals'
import { database } from './database'
import MiniSearch from 'minisearch'
import { minisearchOptions } from './search/search-engine'
import { makeMD5, wait } from './tools/utils'
import { makeMD5 } from './tools/utils'
import { settings } from './settings'
class CacheManager {
@@ -121,28 +121,34 @@ class CacheManager {
public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
toDelete: IndexedDocument[]
toAdd: IndexedDocument[]
toUpdate: { old: IndexedDocument; new: IndexedDocument }[]
toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
}> {
let cachedDocs = await database.documents.toArray()
// present in `documents` but not in `cachedDocs`
const toAdd = documents.filter(
d => !cachedDocs.find(c => c.path === d.path)
)
// present in `cachedDocs` but not in `documents`
const toDelete = cachedDocs
.filter(c => !documents.find(d => d.path === c.path))
.map(d => d.document)
// toUpdate: same path, but different mtime
const toUpdate = cachedDocs
.filter(c =>
documents.find(d => d.path === c.path && d.mtime !== c.mtime)
.filter(({ mtime: cMtime, path: cPath }) =>
documents.some(
({ mtime: dMtime, path: dPath }) =>
cPath === dPath && dMtime !== cMtime
)
)
.map(c => ({
old: c.document,
new: documents.find(d => d.path === c.path)!,
oldDoc: c.document,
newDoc: documents.find(d => d.path === c.path)!,
}))
return {
toDelete,
toAdd,
toDelete,
toUpdate,
}
}
@@ -167,9 +173,9 @@ class CacheManager {
// console.log(`Omnisearch - Cache - Will update ${toUpdate.length} documents`)
await database.documents.bulkPut(
toUpdate.map(o => ({
document: o.new,
mtime: o.new.mtime,
path: o.new.path,
document: o.newDoc,
mtime: o.newDoc.mtime,
path: o.newDoc.path,
}))
)

View File

@@ -2,7 +2,6 @@ import { App, Modal, TFile } from 'obsidian'
import ModalVault from './ModalVault.svelte'
import ModalInFile from './ModalInFile.svelte'
import { eventBus, EventNames, isInputComposition } from '../globals'
import { settings } from '../settings'
abstract class OmnisearchModal extends Modal {
protected constructor(app: App) {

View File

@@ -32,7 +32,6 @@ export class OmnisearchCache extends Dexie {
//#region Table declarations
pdf!: Dexie.Table<{ path: string; hash: string; text: string }, string>
documents!: Dexie.Table<
{ path: string; mtime: number; document: IndexedDocument },
string
@@ -56,7 +55,6 @@ export class OmnisearchCache extends Dexie {
super(OmnisearchCache.dbName)
// Database structure
this.version(OmnisearchCache.dbVersion).stores({
pdf: 'path, hash, size',
searchHistory: '++id',
documents: 'path',
minisearch: 'date',

View File

@@ -32,22 +32,16 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
* If a PDF isn't cached, it will be read from the disk and added to the IndexedDB
*/
export async function getPDFFiles(): Promise<IndexedDocument[]> {
const fromDisk = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
const fromDb = await database.pdf.toArray()
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
const data: IndexedDocument[] = []
const input = []
for (const file of fromDisk) {
input.push(
NotesIndex.processQueue(async () => {
const doc = await fileToIndexedDocument(
file,
fromDb.find(o => o.path === file.path)?.text
)
await cacheManager.updateLiveDocument(file.path, doc)
data.push(doc)
})
)
for (const file of files) {
input.push(new Promise(async (resolve, reject) => {
const doc = await fileToIndexedDocument(file)
await cacheManager.updateLiveDocument(file.path, doc)
data.push(doc)
return resolve(null)
}))
}
await Promise.all(input)
return data
@@ -56,21 +50,17 @@ export async function getPDFFiles(): Promise<IndexedDocument[]> {
/**
* Convert a file into an IndexedDocument.
* Will use the cache if possible.
* @param file
* @param content If we give a text content, will skip the fetching part
*/
export async function fileToIndexedDocument(
file: TFile,
content?: string
): Promise<IndexedDocument> {
if (!content) {
if (isFilePlaintext(file.path)) {
content = await app.vault.cachedRead(file)
} else if (file.path.endsWith('.pdf')) {
content = await getPdfText(file)
} else {
throw new Error('Invalid file: ' + file.path)
}
let content: string
if (isFilePlaintext(file.path)) {
content = await app.vault.cachedRead(file)
} else if (file.path.endsWith('.pdf')) {
content = await getPdfText(file)
} else {
throw new Error('Invalid file: ' + file.path)
}
content = removeDiacritics(content)

View File

@@ -105,7 +105,7 @@ export default class OmnisearchPlugin extends Plugin {
* Read the files and feed them to Minisearch
*/
async function populateIndex(): Promise<void> {
console.time('Omnisearch - Indexing duration')
console.time('Omnisearch - Indexing total time')
// Initialize minisearch
let engine = SearchEngine.getEngine()
@@ -116,6 +116,7 @@ async function populateIndex(): Promise<void> {
}
// Load plaintext files
console.log('Omnisearch - Fetching notes')
const plainTextFiles = await FileLoader.getPlainTextFiles()
let allFiles = [...plainTextFiles]
// iOS: since there's no cache, directly index the documents
@@ -126,6 +127,7 @@ async function populateIndex(): Promise<void> {
// Load PDFs
if (settings.PDFIndexing) {
console.log('Omnisearch - Fetching PDFs')
const pdfs = await FileLoader.getPDFFiles()
// iOS: since there's no cache, just index the documents
if (Platform.isIosApp) {
@@ -136,37 +138,52 @@ async function populateIndex(): Promise<void> {
allFiles = [...allFiles, ...pdfs]
}
console.log('Omnisearch - Total number of files: ' + allFiles.length)
let needToUpdateCache = false
// Other platforms: make a diff of what's to add/update/delete
if (!Platform.isIosApp) {
console.log('Omnisearch - Checking index cache diff...')
// Check which documents need to be removed/added/updated
const diffDocs = await cacheManager.getDiffDocuments(allFiles)
needToUpdateCache = !!(
diffDocs.toAdd.length ||
diffDocs.toDelete.length ||
diffDocs.toUpdate.length
)
// Add
await engine.addAllToMinisearch(diffDocs.toAdd)
console.log(`Omnisearch - ${diffDocs.toAdd.length} files to add`)
diffDocs.toAdd.forEach(doc =>
cacheManager.updateLiveDocument(doc.path, doc)
)
// Delete
console.log(`Omnisearch - ${diffDocs.toDelete.length} files to remove`)
diffDocs.toDelete.forEach(d => engine.removeFromMinisearch(d))
diffDocs.toDelete.forEach(doc => cacheManager.deleteLiveDocument(doc.path))
// Update (delete + add)
console.log(`Omnisearch - ${diffDocs.toUpdate.length} files to update`)
diffDocs.toUpdate
.map(d => d.old)
.forEach(d => {
engine.removeFromMinisearch(d)
cacheManager.updateLiveDocument(d.path, d)
.forEach(({ oldDoc, newDoc }) => {
engine.removeFromMinisearch(oldDoc)
cacheManager.updateLiveDocument(oldDoc.path, newDoc)
})
await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.new))
await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
}
// Load PDFs into the main search engine, and write cache
// SearchEngine.loadTmpDataIntoMain()
SearchEngine.isIndexing.set(false)
if (!Platform.isIosApp) {
if (!Platform.isIosApp && needToUpdateCache) {
console.log('Omnisearch - Writing cache...')
await SearchEngine.getEngine().writeToCache(allFiles)
}
console.timeEnd('Omnisearch - Indexing duration')
console.timeEnd('Omnisearch - Indexing total time')
}
async function cleanOldCacheFiles() {

View File

@@ -156,7 +156,7 @@ export async function filterAsync<T>(
* @returns
*/
export function stripMarkdownCharacters(text: string): string {
return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (match, p1, p2) => p2)
return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (_match, _p1, p2) => p2)
}
export function getAliasesFromMetadata(