Squashed commit of the following:

commit ac82511ddd17d5472ae3cfea9bbad9754f5a4d62 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Sat Oct 22 08:23:42 2022 +0200 Screw that cache, seriously. commit 8ba40d1be73daaaffea09e07bc56c339266db9b6 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Fri Oct 21 22:36:48 2022 +0200 Stuff commit 27b8fd7dc809be9714a109d3a458eb1276a47e2e Author: Simon Cambier <simon.cambier@protonmail.com> Date: Fri Oct 21 22:22:20 2022 +0200 Moved files commit fb1349c914907e586e103ca54fb04b9ddd45ef5d Author: Simon Cambier <simon.cambier@protonmail.com> Date: Thu Oct 20 22:25:29 2022 +0200 Removed duplicate code commit e7371138e60cbe4155cfd4fb44e3ee1d2e3ee088 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Thu Oct 20 21:50:09 2022 +0200 Moved a bunch of files commit 2ee1b2a0e799d4b41ab3a444d8cc44dfff5b5623 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Thu Oct 20 21:32:21 2022 +0200 Removed useless code commit 76c530dfb9adbad1bbe9079de2330fe43a044249 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Thu Oct 20 20:44:11 2022 +0200 Split file reading and indexing
2022-10-22 08:25:34 +02:00
parent 1376cea282
commit c2ecdd79ad
25 changed files with 338 additions and 403 deletions
--- a/src/pdf/pdf-manager.ts
+++ b/src/pdf/pdf-manager.ts
@@ -0,0 +1,107 @@
+import type { TFile } from 'obsidian'
+import WebWorker from 'web-worker:./pdf-worker.ts'
+import { makeMD5 } from '../tools/utils'
+import { database } from '../database'
+
+/** Maximum time, in milliseconds, a worker may take to answer one message. */
+const workerTimeout = 120_000
+
+/**
+ * Wraps a web worker used for PDF text extraction and keeps a static pool
+ * of these wrappers so that idle workers are reused.
+ */
+class PDFWorker {
+  private static pool: PDFWorker[] = []
+
+  /**
+   * Returns a pooled worker that is not currently running a job, or creates
+   * a new one (and adds it to the pool) when all pooled workers are busy.
+   */
+  static getWorker(): PDFWorker {
+    const free = PDFWorker.pool.find(w => !w.running)
+    if (free) {
+      return free
+    }
+    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
+    PDFWorker.pool.push(worker)
+    return worker
+  }
+
+  private running = false
+
+  private constructor(private worker: Worker) {}
+
+  /**
+   * Posts `msg` to the underlying worker and waits for its answer.
+   *
+   * @param msg - payload forwarded as-is to the worker
+   * @returns the message event sent back by the worker
+   * @throws rejects with the string 'timeout' when the worker has not
+   *   answered after `workerTimeout` milliseconds; the worker is then
+   *   terminated and removed from the pool
+   */
+  public async run(msg: any): Promise<any> {
+    return new Promise((resolve, reject) => {
+      this.running = true
+
+      const timeout = setTimeout(() => {
+        this.worker.terminate()
+        // A terminated worker never answers again. Drop it from the pool,
+        // otherwise getWorker() would keep handing it out and every later
+        // job sent to it would also run into the timeout.
+        PDFWorker.pool = PDFWorker.pool.filter(w => w !== this)
+        this.worker.onmessage = null
+        console.warn('Omnisearch - Worker timeout')
+        this.running = false
+        reject('timeout')
+      }, workerTimeout)
+
+      // Install the handler before posting, so the listener is in place
+      // by the time the worker can reply
+      this.worker.onmessage = evt => {
+        clearTimeout(timeout)
+        this.running = false
+        resolve(evt)
+      }
+      this.worker.postMessage(msg)
+    })
+  }
+}
+
+/**
+ * Extracts the text of PDF files through a PDFWorker, and caches the result
+ * in `database.pdf`.
+ */
+class PDFManager {
+  /**
+   * Returns the text of the given PDF file.
+   *
+   * The cache is looked up first by path & size, then by content hash.
+   * On a miss, the text is extracted by a worker and stored in the cache.
+   *
+   * @param file - the PDF file to read
+   * @returns the extracted text with newlines and repeated spaces collapsed
+   *   to single spaces, or an empty string when the extraction fails
+   *   (unreadable PDF or worker timeout)
+   */
+  public async getPdfText(file: TFile): Promise<string> {
+    // 1) Check if we can find by path & size
+    const docByPath = await database.pdf.get({
+      path: file.path,
+      size: file.stat.size,
+    })
+
+    if (docByPath) {
+      return docByPath.text
+    }
+
+    // 2) Check by hash
+    const data = new Uint8Array(await app.vault.readBinary(file))
+    const hash = makeMD5(data)
+    const docByHash = await database.pdf.get(hash)
+    if (docByHash) {
+      return docByHash.text
+    }
+
+    // 3) The PDF is not cached, extract it
+    const worker = PDFWorker.getWorker()
+    let text = ''
+    try {
+      const res = await worker.run({ data, name: file.basename })
+      text = (res.data.text as string)
+        // Replace \n with spaces
+        .replace(/\n/g, ' ')
+        // Trim multiple spaces
+        .replace(/ +/g, ' ')
+        .trim()
+    } catch (e) {
+      // In case of error (unreadable PDF or timeout) just add
+      // an empty string to the cache
+      text = ''
+    }
+
+    // Add it to the cache. This is best-effort: a failed write must neither
+    // lose the extracted text nor leave the caller waiting forever.
+    try {
+      await database.pdf.add({
+        hash,
+        text,
+        path: file.path,
+        size: file.stat.size,
+      })
+    } catch (e) {
+      console.warn('Omnisearch - Could not cache the text of ' + file.path, e)
+    }
+    return text
+  }
+
+  /**
+   * Looks for outdated cache entries, i.e. entries whose path no longer
+   * exists in the vault.
+   *
+   * For now the outdated entries are only logged, nothing is deleted.
+   * The returned promise resolves once every entry has been checked.
+   */
+  public async cleanCache(): Promise<void> {
+    const checks: Promise<void>[] = []
+    // The callback's own promise is not awaited by each() (presumably a
+    // Dexie table - confirm), so collect the checks and await them afterwards
+    await database.pdf.each(item => {
+      checks.push(
+        (async () => {
+          if (!(await app.vault.adapter.exists(item.path))) {
+            console.log(item.path + ' does not exist')
+          }
+        })()
+      )
+    })
+    await Promise.all(checks)
+  }
+}
+
+export const pdfManager = new PDFManager()
--- a/src/pdf/pdf-worker.ts
+++ b/src/pdf/pdf-worker.ts
@@ -0,0 +1,21 @@
+import rustPlugin from '../../pkg/obsidian_search_bg.wasm'
+import * as plugin from '../../pkg'
+
+// Decoded once at worker startup, then reused for every message
+const decodedPlugin = decodeBase64(rustPlugin as any)
+
+/**
+ * Handles one extraction request.
+ *
+ * Reads the PDF bytes from `evt.data.data`, and always answers with a
+ * `{ text }` message: the extracted text, or an empty string on failure.
+ */
+onmessage = async evt => {
+  try {
+    // The initialization is inside the try block so that a failure still
+    // sends an answer; otherwise the caller would get no reply at all.
+    // NOTE(review): the plugin is initialized on every message, as before -
+    // confirm whether a single initialization would be enough.
+    const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0))
+    await plugin.default(Promise.resolve(buffer))
+    const text = plugin.extract_pdf_text(evt.data.data as Uint8Array)
+    self.postMessage({ text })
+  } catch (e) {
+    console.warn('Omnisearch - Could not extract text from ' + evt.data.name)
+    self.postMessage({ text: '' })
+  }
+}
+
+/**
+ * Decodes a base64-encoded string with `atob`.
+ *
+ * @param data - the base64-encoded input
+ * @returns the decoded string
+ */
+function decodeBase64(data: string): string {
+  const decoded = atob(data)
+  return decoded
+}