Embedding PDF.js in Omnisearch to avoid crashes

This commit is contained in:
Simon Cambier
2022-10-03 13:32:16 +02:00
parent a6659d78a5
commit c497b91651
7 changed files with 84 additions and 62 deletions

View File

@@ -37,7 +37,8 @@
},
"dependencies": {
"@vanakat/plugin-api": "^0.1.0",
"minisearch": "^5.0.0"
"minisearch": "^5.0.0",
"pdfjs-dist": "^2.16.105"
},
"pnpm": {
"overrides": {

23
pnpm-lock.yaml generated
View File

@@ -19,6 +19,7 @@ specifiers:
jest: ^27.5.1
minisearch: ^5.0.0
obsidian: latest
pdfjs-dist: ^2.16.105
prettier: ^2.7.1
prettier-plugin-svelte: ^2.7.0
svelte: ^3.50.1
@@ -30,6 +31,7 @@ specifiers:
dependencies:
'@vanakat/plugin-api': 0.1.0
minisearch: 5.0.0
pdfjs-dist: 2.16.105
devDependencies:
'@babel/preset-env': 7.19.0
@@ -2436,6 +2438,10 @@ packages:
webidl-conversions: 5.0.0
dev: true
/dommatrix/1.0.3:
resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==}
dev: false
/electron-to-chromium/1.4.247:
resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==}
dev: true
@@ -3920,6 +3926,18 @@ packages:
resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==}
engines: {node: '>=8'}
/pdfjs-dist/2.16.105:
resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==}
peerDependencies:
worker-loader: ^3.0.8
peerDependenciesMeta:
worker-loader:
optional: true
dependencies:
dommatrix: 1.0.3
web-streams-polyfill: 3.2.1
dev: false
/picocolors/1.0.0:
resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
dev: true
@@ -4559,6 +4577,11 @@ packages:
makeerror: 1.0.12
dev: true
/web-streams-polyfill/3.2.1:
resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==}
engines: {node: '>= 8'}
dev: false
/webidl-conversions/5.0.0:
resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==}
engines: {node: '>=8'}

View File

@@ -1,8 +1,10 @@
import {Notice, TAbstractFile, TFile} from 'obsidian'
import {
canIndexPDFs,
extractHeadingsFromCache,
getAliasesFromMetadata,
getTagsFromMetadata, isFileIndexable,
getTagsFromMetadata,
isFileIndexable,
isFilePlaintext,
removeDiacritics,
wait,
@@ -178,3 +180,31 @@ export async function saveIndexToFile(): Promise<void> {
isIndexChanged = false
}
}
/**
 * Indexes every PDF of the vault, provided that `canIndexPDFs()` allows it.
 *
 * Files are selected by a path ending in `.pdf`. A file that already has an
 * entry in the notes cache is removed from the index before being added
 * again. All `addToIndex` calls are started up front and awaited together.
 * Once they have settled, the elapsed time is logged to the console and,
 * when `settings.showIndexingNotices` is set, also shown in a Notice.
 */
export async function indexPDFs() {
  // Nothing to do when PDF indexing is not allowed
  if (!canIndexPDFs()) {
    return
  }

  const startedAt = Date.now()
  const pdfFiles = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))

  // Warn up front when the amount of PDFs is large
  if (pdfFiles.length > 50) {
    new Notice(`⚠️ Omnisearch is indexing ${pdfFiles.length} PDFs. You can experience slowdowns while this work is in progress.`)
  }

  // Kick off the (re)indexing of each PDF, then wait for all of them
  const pending: Promise<void>[] = pdfFiles.map(pdf => {
    if (getNoteFromCache(pdf.path)) {
      // Drop the stale entry before re-adding the file
      removeFromIndex(pdf.path)
    }
    return addToIndex(pdf)
  })
  await Promise.all(pending)

  // Report the outcome
  const elapsedMs = Date.now() - startedAt
  const message = `Omnisearch - Indexed ${pdfFiles.length} PDFs in ${elapsedMs}ms`
  if (settings.showIndexingNotices) {
    new Notice(message)
  }
  console.log(message)
}

View File

@@ -1,11 +1,11 @@
import type { TFile } from 'obsidian'
import { loadPdfJs } from 'obsidian'
import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
// Module-level holder for the PDF.js library object, initially null.
// NOTE(review): `PDFJs` is also bound by the `import PDFJs from 'pdfjs-dist'`
// line above; the two declarations look like the old and new sides of the
// same change — confirm which one is meant to remain.
let PDFJs: any = null
// Point PDF.js at the worker entry imported from 'pdfjs-dist/build/pdf.worker.entry'.
PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946
export async function getPdfText(file: TFile): Promise<string> {
PDFJs = PDFJs ?? (await loadPdfJs())
const data = await app.vault.readBinary(file)
const doc = await PDFJs.getDocument(data).promise
const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => {

View File

@@ -24,7 +24,7 @@ import {
loadNotesCache,
resetNotesCache,
} from './notes'
import { addToIndex, removeFromIndex, saveIndexToFile } from './notes-index'
import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index'
export let minisearchInstance: MiniSearch<IndexedNote>
@@ -134,38 +134,6 @@ export async function initGlobalSearchIndex(): Promise<void> {
}
}
/**
 * Indexes the PDFs of the vault in small batches, when `canIndexPDFs()` allows it.
 *
 * Files are selected by a path ending in `.pdf`. A file that already has an
 * entry in the notes cache is removed from the index before being added again.
 * When done, the elapsed time is logged and, if `settings.showIndexingNotices`
 * is set, shown in a Notice.
 *
 * NOTE(review): this module also imports `indexPDFs` from './notes-index';
 * this local declaration looks like the superseded copy — confirm.
 */
async function indexPDFs() {
if (canIndexPDFs()) {
const start = new Date().getTime()
console.warn(
"Omnisearch - Warnings on 'pdf.worker.min' are due to some issues while reading PDFs file and can usually be ignored."
)
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
// Promises of the current batch; emptied after each flush
let promises: Promise<void>[] = []
for (const [i, file] of files.entries()) {
// Drop the stale index entry before re-adding the file
if (getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
promises.push(addToIndex(file))
// Flush whenever the index is a multiple of 10. This includes i === 0,
// so the first batch holds a single file and later ones hold up to 10.
if (i % 10 === 0) {
// presumably a short pause between batches — confirm what `wait` does
await wait(1)
await Promise.all(promises)
promises = []
}
}
// Wait for whatever is left in the last, possibly partial, batch
await Promise.all(promises)
// Notice & log
const message = `Omnisearch - Indexed ${files.length} PDFs in ${
new Date().getTime() - start
}ms`
if (settings.showIndexingNotices) {
new Notice(message)
}
console.log(message)
}
}
/**
* Searches the index for the given query,
* and returns an array of raw results

View File

@@ -106,23 +106,23 @@ export class SettingsTab extends PluginSettingTab {
})
})
// // Index PDFs
// const indexPDFsDesc = new DocumentFragment()
// indexPDFsDesc.createSpan({}, span => {
// span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
// This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
// PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
// <strong>Needs a restart to fully take effect.</strong>`
// })
// new Setting(containerEl)
// .setName('BETA - Index PDFs')
// .setDesc(indexPDFsDesc)
// .addToggle(toggle =>
// toggle.setValue(settings.indexPDFs).onChange(async v => {
// settings.indexPDFs = v
// await saveSettings(this.plugin)
// })
// )
// Index PDFs
const indexPDFsDesc = new DocumentFragment()
indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
<strong>Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('BETA - Index PDFs')
.setDesc(indexPDFsDesc)
.addToggle(toggle =>
toggle.setValue(settings.indexPDFs).onChange(async v => {
settings.indexPDFs = v
await saveSettings(this.plugin)
})
)
// Store index
const serializedIndexDesc = new DocumentFragment()

View File

@@ -173,7 +173,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
}
/**
 * Tells whether PDF indexing is enabled.
 *
 * The stale hard-coded `return false` that preceded this statement made the
 * user setting unreachable, so the function always reported PDFs as
 * non-indexable regardless of the toggle.
 *
 * @returns the current value of `settings.indexPDFs`
 */
export function canIndexPDFs(): boolean {
  return settings.indexPDFs
}
export function isFileIndexable(path: string): boolean {