#58 - Using a pool of web workers

2022-10-18 22:20:26 +02:00
parent 31bf65283f
commit 4c212a798e
3 changed files with 55 additions and 25 deletions
--- a/src/notes-index.ts
+++ b/src/notes-index.ts
@@ -180,10 +180,6 @@ export async function indexPDFs() {
          await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
        })
      )
-      // pdfQueue.add(async () => {
-      //   await addToIndexAndCache(file)
-      //   await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
-      // })
    }
    await Promise.all(input)
    // await pdfQueue.onEmpty()
--- a/src/pdf-manager.ts
+++ b/src/pdf-manager.ts
@@ -1,11 +1,47 @@
 import type { TFile } from 'obsidian'
-import PDFWorker from 'web-worker:./pdf-worker.ts'
+import WebWorker from 'web-worker:./pdf-worker.ts'
 import { makeMD5 } from './utils'
 import { database } from './database'
-import { settings } from './settings'

 const workerTimeout = 120_000

+/** A pooled web worker that extracts the text of one PDF at a time. */
+class PDFWorker {
+  private static pool: PDFWorker[] = []
+  /** Returns an idle pooled worker, or creates and pools a new one when all are busy. */
+  static getWorker(): PDFWorker {
+    const free = PDFWorker.pool.find(w => !w.running)
+    if (free) return free
+    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
+    PDFWorker.pool.push(worker)
+    return worker
+  }
+
+  /** True while a message is in flight; getWorker() skips running workers. */
+  private running = false
+
+  private constructor(private readonly worker: Worker) {}
+  /** Posts `msg` to the worker; resolves with its reply event, rejects after `workerTimeout` ms. */
+  public run(msg: unknown): Promise<MessageEvent> {
+    return new Promise((resolve, reject) => {
+      this.running = true
+      const timeout = setTimeout(() => {
+        // A terminated worker never replies again: evict it so getWorker() cannot reuse it.
+        PDFWorker.pool = PDFWorker.pool.filter(w => w !== this)
+        this.worker.terminate()
+        console.warn('Omnisearch - Worker timeout')
+        reject(new Error('timeout'))
+      }, workerTimeout)
+      this.worker.onmessage = evt => {
+        clearTimeout(timeout)
+        this.running = false
+        resolve(evt)
+      }
+      this.worker.postMessage(msg)
+    })
+  }
+}
+
 class PDFManager {
  public async getPdfText(file: TFile): Promise<string> {
    // 1) Check if we can find by path & size
@@ -27,34 +63,31 @@ class PDFManager {
    }

    // 3) The PDF is not cached, extract it
-    const worker = new PDFWorker({ name: 'PDF Text Extractor' })
+    const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' })
    return new Promise(async (resolve, reject) => {
-      // @ts-ignore
-      file.stat.size
-
-      // In case of a timeout, we just return an empty line.
-      // If we don't, it will try to reindex at each restart.
-      const timeout = setTimeout(() => {
-        worker.terminate()
-        console.warn('Omnisearch - Worker timeout to extract text from ' + file.basename)
-        resolve('')
-      }, workerTimeout)
-
-      worker.postMessage({ data, name: file.basename })
-      worker.onmessage = (evt: any) => {
-        const text = (evt.data.text as string)
+      try {
+        const res = await worker.run({ data, name: file.basename })
+        const text = (res.data.text as string)
          // Replace \n with spaces
          .replace(/\n/g, ' ')
          // Trim multiple spaces
          .replace(/ +/g, ' ')
          .trim()
+
+        // Add it to the cache
        database.pdf
          .add({ hash, text, path: file.path, size: file.stat.size })
          .then(() => {
-            clearTimeout(timeout)
            resolve(text)
          })
-        worker.terminate()
+      } catch (e) {
+        // In case of error (unreadable PDF or timeout) just add
+        // an empty string to the cache
+        database.pdf
+          .add({ hash, text: '', path: file.path, size: file.stat.size })
+          .then(() => {
+            resolve('')
+          })
      }
    })
  }
--- a/src/settings.ts
+++ b/src/settings.ts
@@ -178,9 +178,10 @@ export class SettingsTab extends PluginSettingTab {
    indexPDFsDesc.createSpan({}, span => {
      span.innerHTML = `Omnisearch will include PDFs in search results.
      <ul>
+        <li>⚠️ Depending on their size, PDFs can take anywhere from a few seconds to 2 minutes to be processed.</li>
        <li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
+        <li>⚠️ Some PDFs can't be processed correctly and will return an empty text.</li>
        <li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
-        <li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
      </ul>
       <strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
    })
@@ -340,7 +341,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
  PDFIndexing: false,
  backgroundProcesses: Platform.isMobileApp
    ? 1
-    : Math.max(1, Math.floor(require('os').cpus().length / 2)),
+    : Math.max(1, Math.floor(require('os').cpus().length - 2)),

  showIndexingNotices: false,
  showShortName: false,