Embedding PDF.js in Omnisearch to avoid crashes

This commit is contained in:
Simon Cambier
2022-10-03 13:32:16 +02:00
parent a6659d78a5
commit c497b91651
7 changed files with 84 additions and 62 deletions

View File

@@ -37,7 +37,8 @@
}, },
"dependencies": { "dependencies": {
"@vanakat/plugin-api": "^0.1.0", "@vanakat/plugin-api": "^0.1.0",
"minisearch": "^5.0.0" "minisearch": "^5.0.0",
"pdfjs-dist": "^2.16.105"
}, },
"pnpm": { "pnpm": {
"overrides": { "overrides": {

23
pnpm-lock.yaml generated
View File

@@ -19,6 +19,7 @@ specifiers:
jest: ^27.5.1 jest: ^27.5.1
minisearch: ^5.0.0 minisearch: ^5.0.0
obsidian: latest obsidian: latest
pdfjs-dist: ^2.16.105
prettier: ^2.7.1 prettier: ^2.7.1
prettier-plugin-svelte: ^2.7.0 prettier-plugin-svelte: ^2.7.0
svelte: ^3.50.1 svelte: ^3.50.1
@@ -30,6 +31,7 @@ specifiers:
dependencies: dependencies:
'@vanakat/plugin-api': 0.1.0 '@vanakat/plugin-api': 0.1.0
minisearch: 5.0.0 minisearch: 5.0.0
pdfjs-dist: 2.16.105
devDependencies: devDependencies:
'@babel/preset-env': 7.19.0 '@babel/preset-env': 7.19.0
@@ -2436,6 +2438,10 @@ packages:
webidl-conversions: 5.0.0 webidl-conversions: 5.0.0
dev: true dev: true
/dommatrix/1.0.3:
resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==}
dev: false
/electron-to-chromium/1.4.247: /electron-to-chromium/1.4.247:
resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==} resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==}
dev: true dev: true
@@ -3920,6 +3926,18 @@ packages:
resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==}
engines: {node: '>=8'} engines: {node: '>=8'}
/pdfjs-dist/2.16.105:
resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==}
peerDependencies:
worker-loader: ^3.0.8
peerDependenciesMeta:
worker-loader:
optional: true
dependencies:
dommatrix: 1.0.3
web-streams-polyfill: 3.2.1
dev: false
/picocolors/1.0.0: /picocolors/1.0.0:
resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
dev: true dev: true
@@ -4559,6 +4577,11 @@ packages:
makeerror: 1.0.12 makeerror: 1.0.12
dev: true dev: true
/web-streams-polyfill/3.2.1:
resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==}
engines: {node: '>= 8'}
dev: false
/webidl-conversions/5.0.0: /webidl-conversions/5.0.0:
resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==} resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==}
engines: {node: '>=8'} engines: {node: '>=8'}

View File

@@ -1,8 +1,10 @@
import {Notice, TAbstractFile, TFile} from 'obsidian' import {Notice, TAbstractFile, TFile} from 'obsidian'
import { import {
canIndexPDFs,
extractHeadingsFromCache, extractHeadingsFromCache,
getAliasesFromMetadata, getAliasesFromMetadata,
getTagsFromMetadata, isFileIndexable, getTagsFromMetadata,
isFileIndexable,
isFilePlaintext, isFilePlaintext,
removeDiacritics, removeDiacritics,
wait, wait,
@@ -178,3 +180,31 @@ export async function saveIndexToFile(): Promise<void> {
isIndexChanged = false isIndexChanged = false
} }
} }
/**
 * Indexes every vault file whose path ends with `.pdf`.
 *
 * Does nothing unless `canIndexPDFs()` returns true. When more than 50 PDFs
 * are found, a warning Notice is displayed before the work starts. All files
 * are submitted to `addToIndex` up front and awaited together; once they have
 * all settled successfully, the elapsed time is logged to the console and,
 * if `settings.showIndexingNotices` is set, shown in a Notice.
 */
export async function indexPDFs() {
  // Guard clause: bail out when PDF indexing is not enabled
  if (!canIndexPDFs()) {
    return
  }

  const startedAt = Date.now()
  const pdfFiles = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))

  if (pdfFiles.length > 50) {
    new Notice(`⚠️ Omnisearch is indexing ${pdfFiles.length} PDFs. You can experience slowdowns while this work is in progress.`)
  }

  // Start indexing every PDF, removing the existing index entry first
  // for files that are already present in the notes cache
  const pending: Promise<void>[] = pdfFiles.map(pdf => {
    if (getNoteFromCache(pdf.path)) {
      removeFromIndex(pdf.path)
    }
    return addToIndex(pdf)
  })
  await Promise.all(pending)

  // Notice & log
  const elapsedMs = Date.now() - startedAt
  const summary = `Omnisearch - Indexed ${pdfFiles.length} PDFs in ${elapsedMs}ms`
  if (settings.showIndexingNotices) {
    new Notice(summary)
  }
  console.log(summary)
}

View File

@@ -1,11 +1,11 @@
import type { TFile } from 'obsidian' import type { TFile } from 'obsidian'
import { loadPdfJs } from 'obsidian' import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
let PDFJs: any = null PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946 // https://stackoverflow.com/a/59929946
export async function getPdfText(file: TFile): Promise<string> { export async function getPdfText(file: TFile): Promise<string> {
PDFJs = PDFJs ?? (await loadPdfJs())
const data = await app.vault.readBinary(file) const data = await app.vault.readBinary(file)
const doc = await PDFJs.getDocument(data).promise const doc = await PDFJs.getDocument(data).promise
const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => { const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => {

View File

@@ -24,7 +24,7 @@ import {
loadNotesCache, loadNotesCache,
resetNotesCache, resetNotesCache,
} from './notes' } from './notes'
import { addToIndex, removeFromIndex, saveIndexToFile } from './notes-index' import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index'
export let minisearchInstance: MiniSearch<IndexedNote> export let minisearchInstance: MiniSearch<IndexedNote>
@@ -134,38 +134,6 @@ export async function initGlobalSearchIndex(): Promise<void> {
} }
} }
/**
 * Indexes every vault file whose path ends with `.pdf`, in small batches.
 *
 * Does nothing unless `canIndexPDFs()` returns true. Files are handed to
 * `addToIndex` one by one; the pending promises are awaited and reset each
 * time the loop index is a multiple of 10, and any remainder is awaited after
 * the loop. The total duration is then logged and optionally shown in a Notice.
 */
async function indexPDFs() {
if (canIndexPDFs()) {
const start = new Date().getTime()
// Tell the user up front that 'pdf.worker.min' warnings are expected
console.warn(
"Omnisearch - Warnings on 'pdf.worker.min' are due to some issues while reading PDFs file and can usually be ignored."
)
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
// Promises of the batch currently in flight
let promises: Promise<void>[] = []
for (const [i, file] of files.entries()) {
// A file already present in the notes cache is removed from the index before being re-added
if (getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
promises.push(addToIndex(file))
// Flush the batch at indices 0, 10, 20, ... — note the very first flush
// therefore contains a single file, and later ones up to ten
if (i % 10 === 0) {
// NOTE(review): wait(1) presumably pauses briefly so the UI can breathe — confirm against the helper
await wait(1)
await Promise.all(promises)
promises = []
}
}
// Await whatever is left from the last, partial batch
await Promise.all(promises)
// Notice & log
const message = `Omnisearch - Indexed ${files.length} PDFs in ${
new Date().getTime() - start
}ms`
if (settings.showIndexingNotices) {
new Notice(message)
}
console.log(message)
}
}
/** /**
* Searches the index for the given query, * Searches the index for the given query,
* and returns an array of raw results * and returns an array of raw results

View File

@@ -106,23 +106,23 @@ export class SettingsTab extends PluginSettingTab {
}) })
}) })
// // Index PDFs // Index PDFs
// const indexPDFsDesc = new DocumentFragment() const indexPDFsDesc = new DocumentFragment()
// indexPDFsDesc.createSpan({}, span => { indexPDFsDesc.createSpan({}, span => {
// span.innerHTML = `Omnisearch will index your PDFs, and return them in search results. span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
// This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br> This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
// PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br> PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
// <strong>Needs a restart to fully take effect.</strong>` <strong>Needs a restart to fully take effect.</strong>`
// }) })
// new Setting(containerEl) new Setting(containerEl)
// .setName('BETA - Index PDFs') .setName('BETA - Index PDFs')
// .setDesc(indexPDFsDesc) .setDesc(indexPDFsDesc)
// .addToggle(toggle => .addToggle(toggle =>
// toggle.setValue(settings.indexPDFs).onChange(async v => { toggle.setValue(settings.indexPDFs).onChange(async v => {
// settings.indexPDFs = v settings.indexPDFs = v
// await saveSettings(this.plugin) await saveSettings(this.plugin)
// }) })
// ) )
// Store index // Store index
const serializedIndexDesc = new DocumentFragment() const serializedIndexDesc = new DocumentFragment()

View File

@@ -173,7 +173,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
} }
export function canIndexPDFs(): boolean { export function canIndexPDFs(): boolean {
return false return settings.indexPDFs
} }
export function isFileIndexable(path: string): boolean { export function isFileIndexable(path: string): boolean {