From 9455032d0ac9f17050766e021ada614eb2cc852b Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Sat, 29 Oct 2022 12:32:57 +0200 Subject: [PATCH] Exported PDF extractor --- package.json | 5 +- pnpm-lock.yaml | 2 + rollup.config.js | 4 -- src/file-loader.ts | 7 +-- src/lib.rs | 55 -------------------- src/obsidian.rs | 14 ----- src/pdf/pdf-manager.ts | 107 --------------------------------------- src/pdf/pdf-worker.ts | 21 -------- src/typings/types.d.ts | 1 - src/typings/workers.d.ts | 4 -- 10 files changed, 9 insertions(+), 211 deletions(-) delete mode 100644 src/lib.rs delete mode 100644 src/obsidian.rs delete mode 100644 src/pdf/pdf-manager.ts delete mode 100644 src/pdf/pdf-worker.ts delete mode 100644 src/typings/types.d.ts delete mode 100644 src/typings/workers.d.ts diff --git a/package.json b/package.json index 61db892..72a87af 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,8 @@ "description": "A search engine for Obsidian", "main": "dist/main.js", "scripts": { - "dev": "wasm-pack build --target web && rollup -c -w", - "build": "wasm-pack build --target web && rollup -c", + "dev": "rollup -c -w", + "build": "rollup -c", "check": "svelte-check --tsconfig ./tsconfig.json", "version": "node version-bump.mjs && git add manifest.json versions.json package.json", "test": "jest" @@ -29,6 +29,7 @@ "builtin-modules": "^3.3.0", "jest": "^27.5.1", "obsidian": "latest", + "obsidian-text-extract": "link:C:/Dev/Obsidian/obsidian-text-extract/dist", "prettier": "^2.7.1", "prettier-plugin-svelte": "^2.8.0", "rollup": "^2.79.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 77da2fc..cf12982 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,7 @@ specifiers: lodash-es: 4.17.21 minisearch: 5.0.0 obsidian: latest + obsidian-text-extract: link:C:/Dev/Obsidian/obsidian-text-extract/dist p-limit: ^4.0.0 pako: ^2.0.4 prettier: ^2.7.1 @@ -66,6 +67,7 @@ devDependencies: builtin-modules: 3.3.0 jest: 27.5.1 obsidian: 0.16.3 + obsidian-text-extract: link:../obsidian-text-extract/dist prettier: 2.7.1 prettier-plugin-svelte: 2.8.0_ibge6ami6vq2q2j5g4rcvk62hq rollup: 2.79.1 diff --git a/rollup.config.js b/rollup.config.js index e21b160..b18319f 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -1,12 +1,10 @@ import { nodeResolve } from '@rollup/plugin-node-resolve' import commonjs from '@rollup/plugin-commonjs' -import { base64 } from 'rollup-plugin-base64' import typescript from '@rollup/plugin-typescript' import svelte from 'rollup-plugin-svelte' import autoPreprocess from 'svelte-preprocess' import copy from 'rollup-plugin-copy' import { terser } from 'rollup-plugin-terser' -import webWorkerLoader from 'rollup-plugin-web-worker-loader' const banner = `/* THIS IS A GENERATED/BUNDLED FILE BY ROLLUP @@ -33,7 +31,6 @@ export default { }), typescript(), commonjs(), - base64({ include: '**/*.wasm' }), copy({ targets: [ { src: 'manifest.json', dest: 'dist' }, @@ -41,7 +38,6 @@ export default { { src: 'assets/.gitignore', dest: 'dist' }, ], }), - webWorkerLoader({ inline: true, forceInline: true, targetPlatform: "browser" }), production && terser(), ], } \ No newline at end of file diff --git a/src/file-loader.ts b/src/file-loader.ts index 3b46ace..dcf8085 100644 --- a/src/file-loader.ts +++ b/src/file-loader.ts @@ -9,8 +9,8 @@ import { import * as NotesIndex from './notes-index' import type { TFile } from 'obsidian' import type { IndexedDocument } from './globals' -import { pdfManager } from './pdf/pdf-manager' import { getNonExistingNotes } from './tools/notes' +import { getPdfText } from 'obsidian-text-extract' /** * Return all plaintext files as IndexedDocuments @@ -60,7 +60,7 @@ export async function fileToIndexedDocument( if (isFilePlaintext(file.path)) { content = removeDiacritics(await app.vault.cachedRead(file)) } else if (file.path.endsWith('.pdf')) { - content = removeDiacritics(await pdfManager.getPdfText(file)) + content = removeDiacritics(await getPdfText(file)) } else { throw new Error('Invalid file: ' + file.path) } @@ -73,7 +73,8 @@ export async function fileToIndexedDocument( if (metadata?.frontmatter?.['excalidraw-plugin']) { const comments = metadata.sections?.filter(s => s.type === 'comment') ?? [] for (const { start, end } of comments.map(c => c.position)) { - content = content.substring(0, start.offset-1) + content.substring(end.offset) + content = + content.substring(0, start.offset - 1) + content.substring(end.offset) } } diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index e1d6abf..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,55 +0,0 @@ -use js_sys::Uint8Array; -use pdf_extract::extract_text_from_mem; -use wasm_bindgen::prelude::*; - -// mod obsidian; - -#[wasm_bindgen] -pub fn extract_pdf_text(arr: Uint8Array) -> Result { - match extract_text_from_mem(&arr.to_vec()) { - Ok(txt) => return Ok(txt), - Err(e) => return Err(JsError::new(&e.to_string())), - }; -} - -// #[wasm_bindgen] -// pub struct ExampleCommand { -// id: JsString, -// name: JsString, -// } - -// #[wasm_bindgen] -// impl ExampleCommand { -// #[wasm_bindgen(getter)] -// pub fn id(&self) -> JsString { -// self.id.clone() -// } - -// #[wasm_bindgen(setter)] -// pub fn set_id(&mut self, id: &str) { -// self.id = JsString::from(id) -// } - -// #[wasm_bindgen(getter)] -// pub fn name(&self) -> JsString { -// self.name.clone() -// } - -// #[wasm_bindgen(setter)] -// pub fn set_name(&mut self, name: &str) { -// self.name = JsString::from(name) -// } - -// pub fn callback(&self) { -// obsidian::Notice::new("hello from rust"); -// } -// } - -// #[wasm_bindgen] -// pub fn onload(plugin: &obsidian::Plugin) { -// let cmd = ExampleCommand { -// id: JsString::from("example"), -// name: JsString::from("Example"), -// }; -// plugin.addCommand(JsValue::from(cmd)) -// } diff --git a/src/obsidian.rs b/src/obsidian.rs deleted file mode 100644 index 40268df..0000000 --- a/src/obsidian.rs +++ /dev/null @@ -1,14 +0,0 @@ -use wasm_bindgen::prelude::*; - -#[wasm_bindgen(module = "obsidian")] -extern "C" { - pub type Plugin; - - #[wasm_bindgen(structural, method)] - pub fn addCommand(this: &Plugin, command: JsValue); - - pub type Notice; - - #[wasm_bindgen(constructor)] - pub fn new(message: &str) -> Notice; -} diff --git a/src/pdf/pdf-manager.ts b/src/pdf/pdf-manager.ts deleted file mode 100644 index aebd726..0000000 --- a/src/pdf/pdf-manager.ts +++ /dev/null @@ -1,107 +0,0 @@ -import type { TFile } from 'obsidian' -import WebWorker from 'web-worker:./pdf-worker.ts' -import { makeMD5 } from '../tools/utils' -import { database } from '../database' - -const workerTimeout = 120_000 - -class PDFWorker { - private static pool: PDFWorker[] = [] - static getWorker(): PDFWorker { - const free = PDFWorker.pool.find(w => !w.running) - if (free) { - return free - } - const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' })) - PDFWorker.pool.push(worker) - return worker - } - - private running = false - - private constructor(private worker: Worker) {} - - public async run(msg: any): Promise { - return new Promise((resolve, reject) => { - this.running = true - - const timeout = setTimeout(() => { - this.worker.terminate() - console.warn('Omnisearch - Worker timeout') - reject('timeout') - this.running = false - }, workerTimeout) - - this.worker.postMessage(msg) - this.worker.onmessage = evt => { - clearTimeout(timeout) - resolve(evt) - this.running = false - } - }) - } -} - -class PDFManager { - public async getPdfText(file: TFile): Promise { - // 1) Check if we can find by path & size - const docByPath = await database.pdf.get({ - path: file.path, - size: file.stat.size, - }) - - if (docByPath) { - return docByPath.text - } - - // 2) Check by hash - const data = new Uint8Array(await app.vault.readBinary(file)) - const hash = makeMD5(data) - const docByHash = await database.pdf.get(hash) - if (docByHash) { - return docByHash.text - } - - // 3) The PDF is not cached, extract it - const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' }) - return new Promise(async (resolve, reject) => { - try { - const res = await worker.run({ data, name: file.basename }) - const text = (res.data.text as string) - // Replace \n with spaces - .replace(/\n/g, ' ') - // Trim multiple spaces - .replace(/ +/g, ' ') - .trim() - - // Add it to the cache - database.pdf - .add({ hash, text, path: file.path, size: file.stat.size }) - .then(() => { - resolve(text) - }) - } catch (e) { - // In case of error (unreadable PDF or timeout) just add - // an empty string to the cache - database.pdf - .add({ hash, text: '', path: file.path, size: file.stat.size }) - .then(() => { - resolve('') - }) - } - }) - } - - /** - * Removes the outdated cache entries - */ - public async cleanCache(): Promise { - database.pdf.each(async item => { - if (!(await app.vault.adapter.exists(item.path))) { - console.log(item.path + ' does not exist') - } - }) - } -} - -export const pdfManager = new PDFManager() diff --git a/src/pdf/pdf-worker.ts b/src/pdf/pdf-worker.ts deleted file mode 100644 index 828d521..0000000 --- a/src/pdf/pdf-worker.ts +++ /dev/null @@ -1,21 +0,0 @@ -import rustPlugin from '../../pkg/obsidian_search_bg.wasm' -import * as plugin from '../../pkg' - -const decodedPlugin = decodeBase64(rustPlugin as any) - -onmessage = async evt => { - const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0)) - await plugin.default(Promise.resolve(buffer)) - try { - const text = plugin.extract_pdf_text(evt.data.data as Uint8Array) - self.postMessage({ text }) - } catch (e) { - console.warn('Omnisearch - Could not extract text from ' + evt.data.name) - self.postMessage({ text: '' }) - } -} - -function decodeBase64(data: string) { - return atob(data) - // return Buffer.from(data, 'base64').toString() -} diff --git a/src/typings/types.d.ts b/src/typings/types.d.ts deleted file mode 100644 index e1c9a55..0000000 --- a/src/typings/types.d.ts +++ /dev/null @@ -1 +0,0 @@ -declare module 'pdfjs-dist/build/pdf.worker.entry'; \ No newline at end of file diff --git a/src/typings/workers.d.ts b/src/typings/workers.d.ts deleted file mode 100644 index 3edba2b..0000000 --- a/src/typings/workers.d.ts +++ /dev/null @@ -1,4 +0,0 @@ -declare module "web-worker:*" { - const WorkerFactory: new (options: any) => Worker; - export default WorkerFactory; -} \ No newline at end of file