#58 - Using a pool of web workers
This commit is contained in:
@@ -180,10 +180,6 @@ export async function indexPDFs() {
|
|||||||
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
// pdfQueue.add(async () => {
|
|
||||||
// await addToIndexAndCache(file)
|
|
||||||
// await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
|
||||||
// })
|
|
||||||
}
|
}
|
||||||
await Promise.all(input)
|
await Promise.all(input)
|
||||||
// await pdfQueue.onEmpty()
|
// await pdfQueue.onEmpty()
|
||||||
|
|||||||
@@ -1,11 +1,47 @@
|
|||||||
import type { TFile } from 'obsidian'
|
import type { TFile } from 'obsidian'
|
||||||
import PDFWorker from 'web-worker:./pdf-worker.ts'
|
import WebWorker from 'web-worker:./pdf-worker.ts'
|
||||||
import { makeMD5 } from './utils'
|
import { makeMD5 } from './utils'
|
||||||
import { database } from './database'
|
import { database } from './database'
|
||||||
import { settings } from './settings'
|
|
||||||
|
|
||||||
const workerTimeout = 120_000
|
const workerTimeout = 120_000
|
||||||
|
|
||||||
|
/**
 * Wrapper around a single PDF text-extraction web worker, handed out from a
 * shared pool so that workers are reused across jobs instead of being
 * created for every PDF.
 *
 * A wrapper is considered busy from the moment `run()` is called until the
 * worker answers or the job times out.
 */
class PDFWorker {
  /** Every usable worker created so far, busy or idle. */
  private static pool: PDFWorker[] = []

  /**
   * Returns an idle worker from the pool, or creates (and registers) a new
   * one when all pooled workers are busy.
   *
   * The returned worker is only marked busy once `run()` is called on it, so
   * callers are expected to call `run()` right away.
   */
  static getWorker(): PDFWorker {
    const free = PDFWorker.pool.find(w => !w.running)
    if (free) {
      return free
    }
    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
    PDFWorker.pool.push(worker)
    return worker
  }

  /** True while a job is in flight on this worker. */
  private running = false

  private constructor(private readonly worker: Worker) {}

  /**
   * Posts `msg` to the underlying worker and waits for its first answer.
   *
   * @param msg - Payload forwarded as-is to the worker via `postMessage`.
   * @returns The message event sent back by the worker.
   * @throws Rejects with an `Error('timeout')` when the worker has not
   * answered within `workerTimeout` milliseconds. In that case the worker is
   * terminated and removed from the pool.
   */
  public async run(msg: any): Promise<any> {
    return new Promise((resolve, reject) => {
      this.running = true

      const timeout = setTimeout(() => {
        this.worker.onmessage = null
        this.worker.terminate()
        // A terminated worker can never answer again: evict it from the pool
        // so that getWorker() does not hand it out for a later job.
        const index = PDFWorker.pool.indexOf(this)
        if (index !== -1) {
          PDFWorker.pool.splice(index, 1)
        }
        this.running = false
        console.warn('Omnisearch - Worker timeout')
        reject(new Error('timeout'))
      }, workerTimeout)

      // Attach the handler before posting, so the answer can never be missed.
      this.worker.onmessage = evt => {
        clearTimeout(timeout)
        // Detach the handler so it does not outlive this job.
        this.worker.onmessage = null
        this.running = false
        resolve(evt)
      }
      this.worker.postMessage(msg)
    })
  }
}
|
||||||
|
|
||||||
class PDFManager {
|
class PDFManager {
|
||||||
public async getPdfText(file: TFile): Promise<string> {
|
public async getPdfText(file: TFile): Promise<string> {
|
||||||
// 1) Check if we can find by path & size
|
// 1) Check if we can find by path & size
|
||||||
@@ -27,34 +63,31 @@ class PDFManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 3) The PDF is not cached, extract it
|
// 3) The PDF is not cached, extract it
|
||||||
const worker = new PDFWorker({ name: 'PDF Text Extractor' })
|
const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' })
|
||||||
return new Promise(async (resolve, reject) => {
|
return new Promise(async (resolve, reject) => {
|
||||||
// @ts-ignore
|
try {
|
||||||
file.stat.size
|
const res = await worker.run({ data, name: file.basename })
|
||||||
|
const text = (res.data.text as string)
|
||||||
// In case of a timeout, we just return an empty line.
|
|
||||||
// If we don't, it will try to reindex at each restart.
|
|
||||||
const timeout = setTimeout(() => {
|
|
||||||
worker.terminate()
|
|
||||||
console.warn('Omnisearch - Worker timeout to extract text from ' + file.basename)
|
|
||||||
resolve('')
|
|
||||||
}, workerTimeout)
|
|
||||||
|
|
||||||
worker.postMessage({ data, name: file.basename })
|
|
||||||
worker.onmessage = (evt: any) => {
|
|
||||||
const text = (evt.data.text as string)
|
|
||||||
// Replace \n with spaces
|
// Replace \n with spaces
|
||||||
.replace(/\n/g, ' ')
|
.replace(/\n/g, ' ')
|
||||||
// Trim multiple spaces
|
// Trim multiple spaces
|
||||||
.replace(/ +/g, ' ')
|
.replace(/ +/g, ' ')
|
||||||
.trim()
|
.trim()
|
||||||
|
|
||||||
|
// Add it to the cache
|
||||||
database.pdf
|
database.pdf
|
||||||
.add({ hash, text, path: file.path, size: file.stat.size })
|
.add({ hash, text, path: file.path, size: file.stat.size })
|
||||||
.then(() => {
|
.then(() => {
|
||||||
clearTimeout(timeout)
|
|
||||||
resolve(text)
|
resolve(text)
|
||||||
})
|
})
|
||||||
worker.terminate()
|
} catch (e) {
|
||||||
|
// In case of error (unreadable PDF or timeout) just add
|
||||||
|
// an empty string to the cache
|
||||||
|
database.pdf
|
||||||
|
.add({ hash, text: '', path: file.path, size: file.stat.size })
|
||||||
|
.then(() => {
|
||||||
|
resolve('')
|
||||||
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -178,9 +178,10 @@ export class SettingsTab extends PluginSettingTab {
|
|||||||
indexPDFsDesc.createSpan({}, span => {
|
indexPDFsDesc.createSpan({}, span => {
|
||||||
span.innerHTML = `Omnisearch will include PDFs in search results.
|
span.innerHTML = `Omnisearch will include PDFs in search results.
|
||||||
<ul>
|
<ul>
|
||||||
|
<li>⚠️ Depending on their size, PDFs can take anywhere from a few seconds to 2 minutes to be processed.</li>
|
||||||
<li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
|
<li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
|
||||||
|
<li>⚠️ Some PDFs can't be processed correctly and will return an empty text.</li>
|
||||||
<li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
|
<li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
|
||||||
<li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
|
|
||||||
</ul>
|
</ul>
|
||||||
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
|
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
|
||||||
})
|
})
|
||||||
@@ -340,7 +341,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
|
|||||||
PDFIndexing: false,
|
PDFIndexing: false,
|
||||||
backgroundProcesses: Platform.isMobileApp
|
backgroundProcesses: Platform.isMobileApp
|
||||||
? 1
|
? 1
|
||||||
: Math.max(1, Math.floor(require('os').cpus().length / 2)),
|
: Math.max(1, Math.floor(require('os').cpus().length - 2)),
|
||||||
|
|
||||||
showIndexingNotices: false,
|
showIndexingNotices: false,
|
||||||
showShortName: false,
|
showShortName: false,
|
||||||
|
|||||||
Reference in New Issue
Block a user