Exported PDF extractor

2022-10-29 12:32:57 +02:00
parent 0350e62a47
commit 9455032d0a
10 changed files with 9 additions and 211 deletions
--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -9,8 +9,8 @@ import {
 import * as NotesIndex from './notes-index'
 import type { TFile } from 'obsidian'
 import type { IndexedDocument } from './globals'
-import { pdfManager } from './pdf/pdf-manager'
 import { getNonExistingNotes } from './tools/notes'
+import { getPdfText } from 'obsidian-text-extract'

 /**
 * Return all plaintext files as IndexedDocuments
@@ -60,7 +60,7 @@ export async function fileToIndexedDocument(
  if (isFilePlaintext(file.path)) {
    content = removeDiacritics(await app.vault.cachedRead(file))
  } else if (file.path.endsWith('.pdf')) {
-    content = removeDiacritics(await pdfManager.getPdfText(file))
+    content = removeDiacritics(await getPdfText(file))
  } else {
    throw new Error('Invalid file: ' + file.path)
  }
@@ -73,7 +73,8 @@ export async function fileToIndexedDocument(
  if (metadata?.frontmatter?.['excalidraw-plugin']) {
    const comments = metadata.sections?.filter(s => s.type === 'comment') ?? []
    for (const { start, end } of comments.map(c => c.position)) {
-      content = content.substring(0, start.offset-1) + content.substring(end.offset)
+      content =
+        content.substring(0, start.offset - 1) + content.substring(end.offset)
    }
  }

--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,55 +0,0 @@
-use js_sys::Uint8Array;
-use pdf_extract::extract_text_from_mem;
-use wasm_bindgen::prelude::*;
-
-// mod obsidian;
-
-#[wasm_bindgen]
-pub fn extract_pdf_text(arr: Uint8Array) -> Result<String, JsError> {
-    match extract_text_from_mem(&arr.to_vec()) {
-        Ok(txt) => return Ok(txt),
-        Err(e) => return Err(JsError::new(&e.to_string())),
-    };
-}
-
-// #[wasm_bindgen]
-// pub struct ExampleCommand {
-//     id: JsString,
-//     name: JsString,
-// }
-
-// #[wasm_bindgen]
-// impl ExampleCommand {
-//     #[wasm_bindgen(getter)]
-//     pub fn id(&self) -> JsString {
-//         self.id.clone()
-//     }
-
-//     #[wasm_bindgen(setter)]
-//     pub fn set_id(&mut self, id: &str) {
-//         self.id = JsString::from(id)
-//     }
-
-//     #[wasm_bindgen(getter)]
-//     pub fn name(&self) -> JsString {
-//         self.name.clone()
-//     }
-
-//     #[wasm_bindgen(setter)]
-//     pub fn set_name(&mut self, name: &str) {
-//         self.name = JsString::from(name)
-//     }
-
-//     pub fn callback(&self) {
-//         obsidian::Notice::new("hello from rust");
-//     }
-// }
-
-// #[wasm_bindgen]
-// pub fn onload(plugin: &obsidian::Plugin) {
-//     let cmd = ExampleCommand {
-//         id: JsString::from("example"),
-//         name: JsString::from("Example"),
-//     };
-//     plugin.addCommand(JsValue::from(cmd))
-// }
--- a/src/obsidian.rs
+++ b/src/obsidian.rs
@@ -1,14 +0,0 @@
-use wasm_bindgen::prelude::*;
-
-#[wasm_bindgen(module = "obsidian")]
-extern "C" {
-    pub type Plugin;
-
-    #[wasm_bindgen(structural, method)]
-    pub fn addCommand(this: &Plugin, command: JsValue);
-
-    pub type Notice;
-
-    #[wasm_bindgen(constructor)]
-    pub fn new(message: &str) -> Notice;
-}
--- a/src/pdf/pdf-manager.ts
+++ b/src/pdf/pdf-manager.ts
@@ -1,107 +0,0 @@
-import type { TFile } from 'obsidian'
-import WebWorker from 'web-worker:./pdf-worker.ts'
-import { makeMD5 } from '../tools/utils'
-import { database } from '../database'
-
-const workerTimeout = 120_000
-
-class PDFWorker {
-  private static pool: PDFWorker[] = []
-  static getWorker(): PDFWorker {
-    const free = PDFWorker.pool.find(w => !w.running)
-    if (free) {
-      return free
-    }
-    const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
-    PDFWorker.pool.push(worker)
-    return worker
-  }
-
-  private running = false
-
-  private constructor(private worker: Worker) {}
-
-  public async run(msg: any): Promise<any> {
-    return new Promise((resolve, reject) => {
-      this.running = true
-
-      const timeout = setTimeout(() => {
-        this.worker.terminate()
-        console.warn('Omnisearch - Worker timeout')
-        reject('timeout')
-        this.running = false
-      }, workerTimeout)
-
-      this.worker.postMessage(msg)
-      this.worker.onmessage = evt => {
-        clearTimeout(timeout)
-        resolve(evt)
-        this.running = false
-      }
-    })
-  }
-}
-
-class PDFManager {
-  public async getPdfText(file: TFile): Promise<string> {
-    // 1) Check if we can find by path & size
-    const docByPath = await database.pdf.get({
-      path: file.path,
-      size: file.stat.size,
-    })
-
-    if (docByPath) {
-      return docByPath.text
-    }
-
-    // 2) Check by hash
-    const data = new Uint8Array(await app.vault.readBinary(file))
-    const hash = makeMD5(data)
-    const docByHash = await database.pdf.get(hash)
-    if (docByHash) {
-      return docByHash.text
-    }
-
-    // 3) The PDF is not cached, extract it
-    const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' })
-    return new Promise(async (resolve, reject) => {
-      try {
-        const res = await worker.run({ data, name: file.basename })
-        const text = (res.data.text as string)
-          // Replace \n with spaces
-          .replace(/\n/g, ' ')
-          // Trim multiple spaces
-          .replace(/ +/g, ' ')
-          .trim()
-
-        // Add it to the cache
-        database.pdf
-          .add({ hash, text, path: file.path, size: file.stat.size })
-          .then(() => {
-            resolve(text)
-          })
-      } catch (e) {
-        // In case of error (unreadable PDF or timeout) just add
-        // an empty string to the cache
-        database.pdf
-          .add({ hash, text: '', path: file.path, size: file.stat.size })
-          .then(() => {
-            resolve('')
-          })
-      }
-    })
-  }
-
-  /**
-   * Removes the outdated cache entries
-   */
-  public async cleanCache(): Promise<void> {
-    database.pdf.each(async item => {
-      if (!(await app.vault.adapter.exists(item.path))) {
-        console.log(item.path + ' does not exist')
-      }
-    })
-  }
-}
-
-export const pdfManager = new PDFManager()
--- a/src/pdf/pdf-worker.ts
+++ b/src/pdf/pdf-worker.ts
@@ -1,21 +0,0 @@
-import rustPlugin from '../../pkg/obsidian_search_bg.wasm'
-import * as plugin from '../../pkg'
-
-const decodedPlugin = decodeBase64(rustPlugin as any)
-
-onmessage = async evt => {
-  const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0))
-  await plugin.default(Promise.resolve(buffer))
-  try {
-    const text = plugin.extract_pdf_text(evt.data.data as Uint8Array)
-    self.postMessage({ text })
-  } catch (e) {
-    console.warn('Omnisearch - Could not extract text from ' + evt.data.name)
-    self.postMessage({ text: '' })
-  }
-}
-
-function decodeBase64(data: string) {
-  return atob(data)
-  // return Buffer.from(data, 'base64').toString()
-}
--- a/src/typings/types.d.ts
+++ b/src/typings/types.d.ts
@@ -1 +0,0 @@
-declare module 'pdfjs-dist/build/pdf.worker.entry';
--- a/src/typings/workers.d.ts
+++ b/src/typings/workers.d.ts
@@ -1,4 +0,0 @@
-declare module "web-worker:*" {
-  const WorkerFactory: new (options: any) => Worker;
-  export default WorkerFactory;
-}
				`@@ -1 +0,0 @@`
				`declare module 'pdfjs-dist/build/pdf.worker.entry';`