Embedding PDF.js in Omnisearch to avoid crashes

2022-10-03 13:32:16 +02:00
parent a6659d78a5
commit c497b91651
7 changed files with 84 additions and 62 deletions
@@ -1,8 +1,10 @@
-import { Notice, TAbstractFile, TFile } from 'obsidian'
+import {Notice, TAbstractFile, TFile} from 'obsidian'
 import {
+  canIndexPDFs,
  extractHeadingsFromCache,
  getAliasesFromMetadata,
-  getTagsFromMetadata, isFileIndexable,
+  getTagsFromMetadata,
+  isFileIndexable,
  isFilePlaintext,
  removeDiacritics,
  wait,
@@ -16,11 +18,11 @@ import {
  removeNoteFromCache,
  saveNotesCacheToFile,
 } from './notes'
-import { getPdfText } from './pdf-parser'
-import type { IndexedNote } from './globals'
-import { searchIndexFilePath } from './globals'
-import { settings } from './settings'
-import { minisearchInstance } from './search'
+import {getPdfText} from './pdf-parser'
+import type {IndexedNote} from './globals'
+import {searchIndexFilePath} from './globals'
+import {settings} from './settings'
+import {minisearchInstance} from './search'

 let isIndexChanged: boolean

@@ -178,3 +180,31 @@ export async function saveIndexToFile(): Promise<void> {
    isIndexChanged = false
  }
 }
+
+export async function indexPDFs() { // (Re)indexes every PDF file of the vault; does nothing unless canIndexPDFs() returns true
+  if (canIndexPDFs()) {
+    const start = new Date().getTime() // start timestamp in ms, used for the duration reported in the final message
+    const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf')) // case-sensitive match: only paths ending in lowercase '.pdf' are selected
+    if (files.length > 50) { // large batch: warn up front; this Notice is shown regardless of settings.showIndexingNotices
+      new Notice(`⚠️ Omnisearch is indexing ${files.length} PDFs. You can experience slowdowns while this work is in progress.`)
+    }
+
+    const promises: Promise<void>[] = []
+    for (const file of files) {
+      if (getNoteFromCache(file.path)) { // a cached entry exists for this path: remove it from the index before re-adding the file
+        removeFromIndex(file.path)
+      }
+      promises.push(addToIndex(file)) // not awaited here, so addToIndex() is started for every file before any result is waited on
+    }
+    await Promise.all(promises) // resolves once every addToIndex() promise has resolved; rejects as soon as one of them rejects
+
+    // Notice & log: the console message is always written, the Notice only when settings.showIndexingNotices is set
+    const message = `Omnisearch - Indexed ${files.length} PDFs in ${
+      new Date().getTime() - start
+    }ms`
+    if (settings.showIndexingNotices) {
+      new Notice(message)
+    }
+    console.log(message)
+  }
+}
@@ -1,11 +1,11 @@
 import type { TFile } from 'obsidian'
-import { loadPdfJs } from 'obsidian'
+import PDFJs from 'pdfjs-dist'
+import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'

-let PDFJs: any = null
+PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker // module-level setup: use the worker entry imported from 'pdfjs-dist/build/pdf.worker.entry' as the PDF.js worker source

 // https://stackoverflow.com/a/59929946
 export async function getPdfText(file: TFile): Promise<string> {
-  PDFJs = PDFJs ?? (await loadPdfJs())
  const data = await app.vault.readBinary(file)
  const doc = await PDFJs.getDocument(data).promise
  const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => {
@@ -24,7 +24,7 @@ import {
  loadNotesCache,
  resetNotesCache,
 } from './notes'
-import { addToIndex, removeFromIndex, saveIndexToFile } from './notes-index'
+import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index'

 export let minisearchInstance: MiniSearch<IndexedNote>

@@ -134,38 +134,6 @@ export async function initGlobalSearchIndex(): Promise<void> {
  }
 }

-async function indexPDFs() {
-  if (canIndexPDFs()) {
-    const start = new Date().getTime()
-    console.warn(
-      "Omnisearch - Warnings on 'pdf.worker.min' are due to some issues while reading PDFs file and can usually be ignored."
-    )
-    const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
-    let promises: Promise<void>[] = []
-    for (const [i, file] of files.entries()) {
-      if (getNoteFromCache(file.path)) {
-        removeFromIndex(file.path)
-      }
-      promises.push(addToIndex(file))
-      if (i % 10 === 0) {
-        await wait(1)
-        await Promise.all(promises)
-        promises = []
-      }
-    }
-    await Promise.all(promises)
-
-    // Notice & log
-    const message = `Omnisearch - Indexed ${files.length} PDFs in ${
-      new Date().getTime() - start
-    }ms`
-    if (settings.showIndexingNotices) {
-      new Notice(message)
-    }
-    console.log(message)
-  }
-}
-
 /**
 * Searches the index for the given query,
 * and returns an array of raw results
@@ -106,23 +106,23 @@ export class SettingsTab extends PluginSettingTab {
          })
      })

-    // // Index PDFs
-    // const indexPDFsDesc = new DocumentFragment()
-    // indexPDFsDesc.createSpan({}, span => {
-    //   span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
-    //     This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
-    //     PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
-    //     <strong>Needs a restart to fully take effect.</strong>`
-    // })
-    // new Setting(containerEl)
-    //   .setName('BETA - Index PDFs')
-    //   .setDesc(indexPDFsDesc)
-    //   .addToggle(toggle =>
-    //     toggle.setValue(settings.indexPDFs).onChange(async v => {
-    //       settings.indexPDFs = v
-    //       await saveSettings(this.plugin)
-    //     })
-    //   )
+    // Index PDFs
+    const indexPDFsDesc = new DocumentFragment()
+    indexPDFsDesc.createSpan({}, span => {
+      span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
+        This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
+        PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
+        <strong>Needs a restart to fully take effect.</strong>`
+    })
+    new Setting(containerEl)
+      .setName('BETA - Index PDFs')
+      .setDesc(indexPDFsDesc)
+      .addToggle(toggle =>
+        toggle.setValue(settings.indexPDFs).onChange(async v => {
+          settings.indexPDFs = v
+          await saveSettings(this.plugin)
+        })
+      )

    // Store index
    const serializedIndexDesc = new DocumentFragment()
@@ -173,7 +173,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
 }

 export function canIndexPDFs(): boolean {
-  return false
+  return settings.indexPDFs // PDF indexing now follows the indexPDFs setting (presumably the flag written by the "BETA - Index PDFs" toggle; confirm both modules share the same settings object)
 }

 export function isFileIndexable(path: string): boolean {