#58 - Using a pool of web workers

This commit is contained in:
Simon Cambier
2022-10-18 22:20:26 +02:00
parent 31bf65283f
commit 4c212a798e
3 changed files with 55 additions and 25 deletions

View File

@@ -180,10 +180,6 @@ export async function indexPDFs() {
await cacheManager.writeMinisearchIndex(Search.minisearchInstance) await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
}) })
) )
// pdfQueue.add(async () => {
// await addToIndexAndCache(file)
// await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
// })
} }
await Promise.all(input) await Promise.all(input)
// await pdfQueue.onEmpty() // await pdfQueue.onEmpty()

View File

@@ -1,11 +1,47 @@
import type { TFile } from 'obsidian' import type { TFile } from 'obsidian'
import PDFWorker from 'web-worker:./pdf-worker.ts' import WebWorker from 'web-worker:./pdf-worker.ts'
import { makeMD5 } from './utils' import { makeMD5 } from './utils'
import { database } from './database' import { database } from './database'
import { settings } from './settings'
const workerTimeout = 120_000 const workerTimeout = 120_000
/**
 * A reusable wrapper around a single PDF text-extraction web worker.
 *
 * Instances are kept in a static pool: `getWorker()` hands out an idle
 * instance when one exists and only spawns a new web worker otherwise.
 * A worker that times out or raises an error is terminated and removed
 * from the pool, so a dead worker is never handed out again.
 */
class PDFWorker {
  /** Every live worker created so far, busy or idle. */
  private static pool: PDFWorker[] = []

  /**
   * Returns an idle worker from the pool, creating (and pooling) a new one
   * when all existing workers are busy.
   */
  static getWorker(): PDFWorker {
    const free = PDFWorker.pool.find(w => !w.running)
    if (free) {
      return free
    }
    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
    PDFWorker.pool.push(worker)
    return worker
  }

  /** True while a `run()` call is waiting for the worker's answer. */
  private running = false

  private constructor(private worker: Worker) {}

  /**
   * Posts `msg` to the worker and waits for its first answer.
   *
   * @param msg payload forwarded as-is through `postMessage`
   * @returns the `MessageEvent` sent back by the worker
   * @throws rejects with an `Error` when the worker reports an error or does
   *   not answer within `workerTimeout` milliseconds; in both cases the
   *   worker is terminated and dropped from the pool
   */
  public async run(msg: any): Promise<any> {
    return new Promise((resolve, reject) => {
      this.running = true

      // Common teardown: stop the timer, detach the handlers so a late event
      // cannot leak into a later run, and mark the worker as idle.
      const settle = (): void => {
        clearTimeout(timeout)
        this.worker.onmessage = null
        this.worker.onerror = null
        this.running = false
      }

      const timeout = setTimeout(() => {
        console.warn('Omnisearch - Worker timeout')
        settle()
        this.discard()
        reject(new Error('timeout'))
      }, workerTimeout)

      // Attach the handlers before posting so no answer can be missed.
      this.worker.onmessage = evt => {
        settle()
        resolve(evt)
      }
      this.worker.onerror = evt => {
        // Fail right away instead of waiting for the timeout to expire.
        settle()
        this.discard()
        reject(new Error(evt.message || 'worker error'))
      }

      this.worker.postMessage(msg)
    })
  }

  /**
   * Terminates the underlying web worker and removes this instance from the
   * pool, so that `getWorker()` cannot return a terminated worker.
   */
  private discard(): void {
    this.worker.terminate()
    const index = PDFWorker.pool.indexOf(this)
    if (index !== -1) {
      PDFWorker.pool.splice(index, 1)
    }
  }
}
class PDFManager { class PDFManager {
public async getPdfText(file: TFile): Promise<string> { public async getPdfText(file: TFile): Promise<string> {
// 1) Check if we can find by path & size // 1) Check if we can find by path & size
@@ -27,34 +63,31 @@ class PDFManager {
} }
// 3) The PDF is not cached, extract it // 3) The PDF is not cached, extract it
const worker = new PDFWorker({ name: 'PDF Text Extractor' }) const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' })
return new Promise(async (resolve, reject) => { return new Promise(async (resolve, reject) => {
// @ts-ignore try {
file.stat.size const res = await worker.run({ data, name: file.basename })
const text = (res.data.text as string)
// In case of a timeout, we just return an empty line.
// If we don't, it will try to reindex at each restart.
const timeout = setTimeout(() => {
worker.terminate()
console.warn('Omnisearch - Worker timeout to extract text from ' + file.basename)
resolve('')
}, workerTimeout)
worker.postMessage({ data, name: file.basename })
worker.onmessage = (evt: any) => {
const text = (evt.data.text as string)
// Replace \n with spaces // Replace \n with spaces
.replace(/\n/g, ' ') .replace(/\n/g, ' ')
// Trim multiple spaces // Trim multiple spaces
.replace(/ +/g, ' ') .replace(/ +/g, ' ')
.trim() .trim()
// Add it to the cache
database.pdf database.pdf
.add({ hash, text, path: file.path, size: file.stat.size }) .add({ hash, text, path: file.path, size: file.stat.size })
.then(() => { .then(() => {
clearTimeout(timeout)
resolve(text) resolve(text)
}) })
worker.terminate() } catch (e) {
// In case of error (unreadable PDF or timeout) just add
// an empty string to the cache
database.pdf
.add({ hash, text: '', path: file.path, size: file.stat.size })
.then(() => {
resolve('')
})
} }
}) })
} }

View File

@@ -178,9 +178,10 @@ export class SettingsTab extends PluginSettingTab {
indexPDFsDesc.createSpan({}, span => { indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will include PDFs in search results. span.innerHTML = `Omnisearch will include PDFs in search results.
<ul> <ul>
<li>⚠️ Depending on their size, PDFs can take anywhere from a few seconds to 2 minutes to be processed.</li>
<li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li> <li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
<li>⚠️ Some PDFs can't be processed correctly and will return an empty text.</li>
<li>This feature is currently a work-in-progress, please report issues that you might experience.</li> <li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
<li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
</ul> </ul>
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>` <strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
}) })
@@ -340,7 +341,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
PDFIndexing: false, PDFIndexing: false,
backgroundProcesses: Platform.isMobileApp backgroundProcesses: Platform.isMobileApp
? 1 ? 1
: Math.max(1, Math.floor(require('os').cpus().length / 2)), : Math.max(1, Math.floor(require('os').cpus().length - 2)),
showIndexingNotices: false, showIndexingNotices: false,
showShortName: false, showShortName: false,