Squashed commit of the following:

commit ac82511ddd17d5472ae3cfea9bbad9754f5a4d62
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Oct 22 08:23:42 2022 +0200

    Screw that cache, seriously.

commit 8ba40d1be73daaaffea09e07bc56c339266db9b6
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:36:48 2022 +0200

    Stuff

commit 27b8fd7dc809be9714a109d3a458eb1276a47e2e
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:22:20 2022 +0200

    Moved files

commit fb1349c914907e586e103ca54fb04b9ddd45ef5d
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 22:25:29 2022 +0200

    Removed duplicate code

commit e7371138e60cbe4155cfd4fb44e3ee1d2e3ee088
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:50:09 2022 +0200

    Moved a bunch of files

commit 2ee1b2a0e799d4b41ab3a444d8cc44dfff5b5623
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:32:21 2022 +0200

    Removed useless code

commit 76c530dfb9adbad1bbe9079de2330fe43a044249
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 20:44:11 2022 +0200

    Split file reading and indexing
This commit is contained in:
Simon Cambier
2022-10-22 08:25:34 +02:00
parent 1376cea282
commit c2ecdd79ad
25 changed files with 338 additions and 403 deletions

107
src/pdf/pdf-manager.ts Normal file
View File

@@ -0,0 +1,107 @@
import type { TFile } from 'obsidian'
import WebWorker from 'web-worker:./pdf-worker.ts'
import { makeMD5 } from '../tools/utils'
import { database } from '../database'
/**
 * Delay, in milliseconds, handed to `setTimeout` when a worker starts a job;
 * a job still unanswered after this long is treated as timed out.
 */
const workerTimeout = 2 * 60 * 1_000
/**
 * Pooled wrapper around the PDF text extraction web worker.
 *
 * Instances are obtained through `PDFWorker.getWorker()`, which reuses an idle
 * worker when one exists and spawns a new one otherwise. A worker that times
 * out or raises an error event is terminated and removed from the pool, so it
 * is never handed out again.
 */
class PDFWorker {
  /** Every live worker created so far, busy or idle. */
  private static pool: PDFWorker[] = []

  /**
   * Returns an idle worker from the pool, creating one if all are busy.
   */
  static getWorker(): PDFWorker {
    const free = PDFWorker.pool.find(w => !w.running)
    if (free) {
      return free
    }
    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
    PDFWorker.pool.push(worker)
    return worker
  }

  /** True while a message is being processed by the underlying worker. */
  private running = false

  private constructor(private worker: Worker) {}

  /**
   * Posts `msg` to the worker and resolves with the first message event it
   * sends back.
   *
   * @param msg payload forwarded as-is to `Worker.postMessage`
   * @returns the `MessageEvent` posted back by the worker
   * @throws (rejects) with an `Error` when the worker does not answer within
   * `workerTimeout` milliseconds, or when it emits an error event
   */
  public async run(msg: any): Promise<any> {
    return new Promise((resolve, reject) => {
      this.running = true

      // Common cleanup for every way the job can end.
      const finish = (): void => {
        clearTimeout(timeout)
        this.worker.onmessage = null
        this.worker.onerror = null
        this.running = false
      }

      const timeout = setTimeout(() => {
        finish()
        // A terminated worker can never answer again: drop it from the pool
        // instead of letting getWorker() hand it out a second time.
        this.discard()
        console.warn('Omnisearch - Worker timeout')
        reject(new Error('timeout'))
      }, workerTimeout)

      // Attach the handlers before posting so no reply can be missed.
      this.worker.onmessage = evt => {
        finish()
        resolve(evt)
      }
      this.worker.onerror = evt => {
        finish()
        this.discard()
        reject(new Error(evt.message || 'worker error'))
      }

      this.worker.postMessage(msg)
    })
  }

  /** Terminates the underlying worker and removes this wrapper from the pool. */
  private discard(): void {
    this.worker.terminate()
    const index = PDFWorker.pool.indexOf(this)
    if (index !== -1) {
      PDFWorker.pool.splice(index, 1)
    }
  }
}
/**
 * Extracts the text of PDF files, caching results in `database.pdf`.
 */
class PDFManager {
  /**
   * Returns the text of `file`, reading it from the cache when possible.
   *
   * The cache is queried first by path and size, then by the MD5 hash of the
   * file contents. On a miss the file is sent to a `PDFWorker`; the extracted
   * text has its newlines replaced by spaces, runs of spaces collapsed, and is
   * trimmed before being cached.
   *
   * @param file the PDF file to read
   * @returns the extracted text, or an empty string when extraction fails
   * (unreadable PDF or worker timeout)
   */
  public async getPdfText(file: TFile): Promise<string> {
    // 1) Check if we can find by path & size
    const docByPath = await database.pdf.get({
      path: file.path,
      size: file.stat.size,
    })
    if (docByPath) {
      return docByPath.text
    }

    // 2) Check by hash
    const data = new Uint8Array(await app.vault.readBinary(file))
    const hash = makeMD5(data)
    const docByHash = await database.pdf.get(hash)
    if (docByHash) {
      return docByHash.text
    }

    // 3) The PDF is not cached, extract it
    let text = ''
    try {
      const res = await PDFWorker.getWorker().run({ data, name: file.basename })
      text = (res.data.text as string)
        // Replace \n with spaces
        .replace(/\n/g, ' ')
        // Trim multiple spaces
        .replace(/ +/g, ' ')
        .trim()
    } catch (e) {
      // In case of error (unreadable PDF or timeout) an empty string is
      // cached, so the file is not re-processed on every call.
      console.warn('Omnisearch - Could not extract text from ' + file.path, e)
      text = ''
    }

    // Add it to the cache. A failed write must not prevent the caller from
    // getting the text (nor leave the returned promise pending forever).
    try {
      await database.pdf.add({
        hash,
        text,
        path: file.path,
        size: file.stat.size,
      })
    } catch (e) {
      console.warn('Omnisearch - Could not cache text of ' + file.path, e)
    }
    return text
  }

  /**
   * Walks the cache looking for entries whose file no longer exists.
   *
   * NOTE(review): despite the method name, nothing is deleted here; entries
   * whose path is missing from the vault are only logged to the console.
   */
  public async cleanCache(): Promise<void> {
    // Awaited so the returned promise covers the table iteration.
    // NOTE(review): whether `each` also waits for the async callbacks depends
    // on the database library - confirm.
    await database.pdf.each(async item => {
      if (!(await app.vault.adapter.exists(item.path))) {
        console.log(item.path + ' does not exist')
      }
    })
  }
}

export const pdfManager = new PDFManager()

21
src/pdf/pdf-worker.ts Normal file
View File

@@ -0,0 +1,21 @@
import rustPlugin from '../../pkg/obsidian_search_bg.wasm'
import * as plugin from '../../pkg'
const decodedPlugin = decodeBase64(rustPlugin as any)

/**
 * Pending or settled initialisation of the WASM module. Started by the first
 * message and shared by all later ones, so the byte buffer is built and the
 * module initialised only once per worker.
 */
let wasmReady: Promise<unknown> | undefined

/**
 * Handles one extraction request.
 *
 * Reads `evt.data.data` (the PDF bytes) and `evt.data.name` (used only in the
 * warning message), and always posts back `{ text }`; `text` is an empty
 * string when initialisation or extraction fails.
 */
onmessage = async evt => {
  try {
    // Initialisation lives inside the try block: if it fails, a reply is
    // still posted instead of leaving the sender waiting for its timeout.
    if (!wasmReady) {
      const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0))
      wasmReady = plugin.default(Promise.resolve(buffer))
    }
    await wasmReady
    const text = plugin.extract_pdf_text(evt.data.data as Uint8Array)
    self.postMessage({ text })
  } catch (e) {
    console.warn('Omnisearch - Could not extract text from ' + evt.data.name)
    self.postMessage({ text: '' })
  }
}
/**
 * Decodes a base64 string with `atob`.
 *
 * @param data base64-encoded input
 * @returns the string produced by `atob` for that input
 */
function decodeBase64(data: string) {
  const decoded = atob(data)
  return decoded
}