From c497b91651504744e78f8ea2216754e5f5eddc3c Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Mon, 3 Oct 2022 13:32:16 +0200 Subject: [PATCH] Embedding PDF.js in Omnisearch to avoid crashes --- package.json | 3 ++- pnpm-lock.yaml | 23 +++++++++++++++++++++++ src/notes-index.ts | 44 +++++++++++++++++++++++++++++++++++++------- src/pdf-parser.ts | 6 +++--- src/search.ts | 34 +--------------------------------- src/settings.ts | 34 +++++++++++++++++----------------- src/utils.ts | 2 +- 7 files changed, 84 insertions(+), 62 deletions(-) diff --git a/package.json b/package.json index df2d8d1..3b1ebf8 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,8 @@ }, "dependencies": { "@vanakat/plugin-api": "^0.1.0", - "minisearch": "^5.0.0" + "minisearch": "^5.0.0", + "pdfjs-dist": "^2.16.105" }, "pnpm": { "overrides": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 252dcb8..d34b019 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -19,6 +19,7 @@ specifiers: jest: ^27.5.1 minisearch: ^5.0.0 obsidian: latest + pdfjs-dist: ^2.16.105 prettier: ^2.7.1 prettier-plugin-svelte: ^2.7.0 svelte: ^3.50.1 @@ -30,6 +31,7 @@ specifiers: dependencies: '@vanakat/plugin-api': 0.1.0 minisearch: 5.0.0 + pdfjs-dist: 2.16.105 devDependencies: '@babel/preset-env': 7.19.0 @@ -2436,6 +2438,10 @@ packages: webidl-conversions: 5.0.0 dev: true + /dommatrix/1.0.3: + resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==} + dev: false + /electron-to-chromium/1.4.247: resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==} dev: true @@ -3920,6 +3926,18 @@ packages: resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} engines: {node: '>=8'} + /pdfjs-dist/2.16.105: + resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==} + peerDependencies: + worker-loader: ^3.0.8 + peerDependenciesMeta: + worker-loader: + optional: true + dependencies: + dommatrix: 1.0.3 + web-streams-polyfill: 3.2.1 + dev: false + /picocolors/1.0.0: resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} dev: true @@ -4559,6 +4577,11 @@ packages: makeerror: 1.0.12 dev: true + /web-streams-polyfill/3.2.1: + resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==} + engines: {node: '>= 8'} + dev: false + /webidl-conversions/5.0.0: resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==} engines: {node: '>=8'} diff --git a/src/notes-index.ts b/src/notes-index.ts index c9592e9..a7a3c9c 100644 --- a/src/notes-index.ts +++ b/src/notes-index.ts @@ -1,8 +1,10 @@ -import { Notice, TAbstractFile, TFile } from 'obsidian' +import {Notice, TAbstractFile, TFile} from 'obsidian' import { + canIndexPDFs, extractHeadingsFromCache, getAliasesFromMetadata, - getTagsFromMetadata, isFileIndexable, + getTagsFromMetadata, + isFileIndexable, isFilePlaintext, removeDiacritics, wait, @@ -16,11 +18,11 @@ import { removeNoteFromCache, saveNotesCacheToFile, } from './notes' -import { getPdfText } from './pdf-parser' -import type { IndexedNote } from './globals' -import { searchIndexFilePath } from './globals' -import { settings } from './settings' -import { minisearchInstance } from './search' +import {getPdfText} from './pdf-parser' +import type {IndexedNote} from './globals' +import {searchIndexFilePath} from './globals' +import {settings} from './settings' +import {minisearchInstance} from './search' let isIndexChanged: boolean @@ -178,3 +180,31 @@ export async function saveIndexToFile(): Promise { isIndexChanged = false } } + +export async function indexPDFs() { + if (canIndexPDFs()) { + const start = new Date().getTime() + const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf')) + if (files.length > 50) { + new Notice(`⚠️ Omnisearch is indexing ${files.length} PDFs. You can experience slowdowns while this work is in progress.`) + } + + const promises: Promise[] = [] + for (const file of files) { + if (getNoteFromCache(file.path)) { + removeFromIndex(file.path) + } + promises.push(addToIndex(file)) + } + await Promise.all(promises) + + // Notice & log + const message = `Omnisearch - Indexed ${files.length} PDFs in ${ + new Date().getTime() - start + }ms` + if (settings.showIndexingNotices) { + new Notice(message) + } + console.log(message) + } +} \ No newline at end of file diff --git a/src/pdf-parser.ts b/src/pdf-parser.ts index 886f25c..d76c530 100644 --- a/src/pdf-parser.ts +++ b/src/pdf-parser.ts @@ -1,11 +1,11 @@ import type { TFile } from 'obsidian' -import { loadPdfJs } from 'obsidian' +import PDFJs from 'pdfjs-dist' +import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry' -let PDFJs: any = null +PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker // https://stackoverflow.com/a/59929946 export async function getPdfText(file: TFile): Promise { - PDFJs = PDFJs ?? (await loadPdfJs()) const data = await app.vault.readBinary(file) const doc = await PDFJs.getDocument(data).promise const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => { diff --git a/src/search.ts b/src/search.ts index ce3251d..97c9fd7 100644 --- a/src/search.ts +++ b/src/search.ts @@ -24,7 +24,7 @@ import { loadNotesCache, resetNotesCache, } from './notes' -import { addToIndex, removeFromIndex, saveIndexToFile } from './notes-index' +import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index' export let minisearchInstance: MiniSearch @@ -134,38 +134,6 @@ export async function initGlobalSearchIndex(): Promise { } } -async function indexPDFs() { - if (canIndexPDFs()) { - const start = new Date().getTime() - console.warn( - "Omnisearch - Warnings on 'pdf.worker.min' are due to some issues while reading PDFs file and can usually be ignored." - ) - const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf')) - let promises: Promise[] = [] - for (const [i, file] of files.entries()) { - if (getNoteFromCache(file.path)) { - removeFromIndex(file.path) - } - promises.push(addToIndex(file)) - if (i % 10 === 0) { - await wait(1) - await Promise.all(promises) - promises = [] - } - } - await Promise.all(promises) - - // Notice & log - const message = `Omnisearch - Indexed ${files.length} PDFs in ${ - new Date().getTime() - start - }ms` - if (settings.showIndexingNotices) { - new Notice(message) - } - console.log(message) - } -} - /** * Searches the index for the given query, * and returns an array of raw results diff --git a/src/settings.ts b/src/settings.ts index f8ce735..5681849 100644 --- a/src/settings.ts +++ b/src/settings.ts @@ -106,23 +106,23 @@ export class SettingsTab extends PluginSettingTab { }) }) - // // Index PDFs - // const indexPDFsDesc = new DocumentFragment() - // indexPDFsDesc.createSpan({}, span => { - // span.innerHTML = `Omnisearch will index your PDFs, and return them in search results. - // This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.
- // PDFs being quite slow to index, it is strongly recommended to also enable "Store index in file".
- // Needs a restart to fully take effect.` - // }) - // new Setting(containerEl) - // .setName('BETA - Index PDFs') - // .setDesc(indexPDFsDesc) - // .addToggle(toggle => - // toggle.setValue(settings.indexPDFs).onChange(async v => { - // settings.indexPDFs = v - // await saveSettings(this.plugin) - // }) - // ) + // Index PDFs + const indexPDFsDesc = new DocumentFragment() + indexPDFsDesc.createSpan({}, span => { + span.innerHTML = `Omnisearch will index your PDFs, and return them in search results. + This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.
+ PDFs being quite slow to index, it is strongly recommended to also enable "Store index in file".
+ Needs a restart to fully take effect.` + }) + new Setting(containerEl) + .setName('BETA - Index PDFs') + .setDesc(indexPDFsDesc) + .addToggle(toggle => + toggle.setValue(settings.indexPDFs).onChange(async v => { + settings.indexPDFs = v + await saveSettings(this.plugin) + }) + ) // Store index const serializedIndexDesc = new DocumentFragment() diff --git a/src/utils.ts b/src/utils.ts index 4a8fcf4..4396f24 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -173,7 +173,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' { } export function canIndexPDFs(): boolean { - return false + return settings.indexPDFs } export function isFileIndexable(path: string): boolean {