Embedding PDF.js in Omnisearch to avoid crashes

This commit is contained in:
Simon Cambier
2022-10-03 13:32:16 +02:00
parent a6659d78a5
commit c497b91651
7 changed files with 84 additions and 62 deletions

View File

@@ -1,8 +1,10 @@
import { Notice, TAbstractFile, TFile } from 'obsidian'
import {Notice, TAbstractFile, TFile} from 'obsidian'
import {
canIndexPDFs,
extractHeadingsFromCache,
getAliasesFromMetadata,
getTagsFromMetadata, isFileIndexable,
getTagsFromMetadata,
isFileIndexable,
isFilePlaintext,
removeDiacritics,
wait,
@@ -16,11 +18,11 @@ import {
removeNoteFromCache,
saveNotesCacheToFile,
} from './notes'
import { getPdfText } from './pdf-parser'
import type { IndexedNote } from './globals'
import { searchIndexFilePath } from './globals'
import { settings } from './settings'
import { minisearchInstance } from './search'
import {getPdfText} from './pdf-parser'
import type {IndexedNote} from './globals'
import {searchIndexFilePath} from './globals'
import {settings} from './settings'
import {minisearchInstance} from './search'
let isIndexChanged: boolean
@@ -178,3 +180,31 @@ export async function saveIndexToFile(): Promise<void> {
isIndexChanged = false
}
}
/**
 * Indexes the vault's PDF files, provided `canIndexPDFs()` allows it.
 *
 * Every file whose path ends with `.pdf` is (re)indexed: if the file already
 * has an entry in the notes cache, it is removed from the index first, then
 * the file is handed to `addToIndex`. All `addToIndex` promises are created
 * up front and awaited together.
 *
 * A warning notice is displayed when more than 50 PDFs are about to be
 * indexed. Once done, the elapsed time is logged to the console and, if
 * `settings.showIndexingNotices` is set, also shown as a notice.
 */
export async function indexPDFs() {
  // Nothing to do when PDF indexing is not allowed
  if (!canIndexPDFs()) {
    return
  }

  const startedAt = Date.now()
  const pdfFiles = app.vault
    .getFiles()
    .filter(candidate => candidate.path.endsWith('.pdf'))

  // Let the user know that a large batch may slow things down
  if (pdfFiles.length > 50) {
    new Notice(`⚠️ Omnisearch is indexing ${pdfFiles.length} PDFs. You can experience slowdowns while this work is in progress.`)
  }

  // Kick off the indexing of each PDF, dropping its stale index entry first
  const pending: Promise<void>[] = pdfFiles.map(pdf => {
    if (getNoteFromCache(pdf.path)) {
      removeFromIndex(pdf.path)
    }
    return addToIndex(pdf)
  })
  await Promise.all(pending)

  // Notice & log
  const elapsedMs = Date.now() - startedAt
  const message = `Omnisearch - Indexed ${pdfFiles.length} PDFs in ${elapsedMs}ms`
  if (settings.showIndexingNotices) {
    new Notice(message)
  }
  console.log(message)
}

View File

@@ -1,11 +1,11 @@
import type { TFile } from 'obsidian'
import { loadPdfJs } from 'obsidian'
import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
let PDFJs: any = null
PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946
export async function getPdfText(file: TFile): Promise<string> {
PDFJs = PDFJs ?? (await loadPdfJs())
const data = await app.vault.readBinary(file)
const doc = await PDFJs.getDocument(data).promise
const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => {

View File

@@ -24,7 +24,7 @@ import {
loadNotesCache,
resetNotesCache,
} from './notes'
import { addToIndex, removeFromIndex, saveIndexToFile } from './notes-index'
import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index'
export let minisearchInstance: MiniSearch<IndexedNote>
@@ -134,38 +134,6 @@ export async function initGlobalSearchIndex(): Promise<void> {
}
}
/**
 * Indexes the vault's PDF files in small batches, provided `canIndexPDFs()`
 * allows it.
 *
 * Every file whose path ends with `.pdf` is (re)indexed: a file that already
 * has an entry in the notes cache is removed from the index before being
 * passed to `addToIndex`. The elapsed time is logged to the console and, if
 * `settings.showIndexingNotices` is set, also shown as a notice.
 */
async function indexPDFs() {
if (canIndexPDFs()) {
const start = new Date().getTime()
// Warn upfront about the 'pdf.worker.min' messages that may show up in the console
console.warn(
"Omnisearch - Warnings on 'pdf.worker.min' are due to some issues while reading PDFs file and can usually be ignored."
)
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
// Promises of the batch currently in flight
let promises: Promise<void>[] = []
for (const [i, file] of files.entries()) {
// Drop the stale index entry before re-indexing the file
if (getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
promises.push(addToIndex(file))
// Flush the pending batch whenever the index is a multiple of 10.
// NOTE(review): this is also true for i === 0, so the first batch holds a
// single file and the following ones hold 10 — confirm this is intended.
if (i % 10 === 0) {
// presumably a short pause to let other work run; verify `wait` in utils
await wait(1)
await Promise.all(promises)
promises = []
}
}
// Wait for the files left over from the last, incomplete batch
await Promise.all(promises)
// Notice & log
const message = `Omnisearch - Indexed ${files.length} PDFs in ${
new Date().getTime() - start
}ms`
if (settings.showIndexingNotices) {
new Notice(message)
}
console.log(message)
}
}
/**
* Searches the index for the given query,
* and returns an array of raw results

View File

@@ -106,23 +106,23 @@ export class SettingsTab extends PluginSettingTab {
})
})
// // Index PDFs
// const indexPDFsDesc = new DocumentFragment()
// indexPDFsDesc.createSpan({}, span => {
// span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
// This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
// PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
// <strong>Needs a restart to fully take effect.</strong>`
// })
// new Setting(containerEl)
// .setName('BETA - Index PDFs')
// .setDesc(indexPDFsDesc)
// .addToggle(toggle =>
// toggle.setValue(settings.indexPDFs).onChange(async v => {
// settings.indexPDFs = v
// await saveSettings(this.plugin)
// })
// )
// Index PDFs
const indexPDFsDesc = new DocumentFragment()
indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
<strong>Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('BETA - Index PDFs')
.setDesc(indexPDFsDesc)
.addToggle(toggle =>
toggle.setValue(settings.indexPDFs).onChange(async v => {
settings.indexPDFs = v
await saveSettings(this.plugin)
})
)
// Store index
const serializedIndexDesc = new DocumentFragment()

View File

@@ -173,7 +173,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
}
/**
 * Tells whether PDF indexing is enabled.
 *
 * The answer comes straight from the `indexPDFs` user setting; the previous
 * hard-coded `return false` made that setting unreachable and kept PDF
 * indexing permanently disabled.
 *
 * @returns the current value of `settings.indexPDFs`
 */
export function canIndexPDFs(): boolean {
  return settings.indexPDFs
}
export function isFileIndexable(path: string): boolean {