Exported PDF extractor
This commit is contained in:
@@ -4,8 +4,8 @@
|
|||||||
"description": "A search engine for Obsidian",
|
"description": "A search engine for Obsidian",
|
||||||
"main": "dist/main.js",
|
"main": "dist/main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "wasm-pack build --target web && rollup -c -w",
|
"dev": "rollup -c -w",
|
||||||
"build": "wasm-pack build --target web && rollup -c",
|
"build": "rollup -c",
|
||||||
"check": "svelte-check --tsconfig ./tsconfig.json",
|
"check": "svelte-check --tsconfig ./tsconfig.json",
|
||||||
"version": "node version-bump.mjs && git add manifest.json versions.json package.json",
|
"version": "node version-bump.mjs && git add manifest.json versions.json package.json",
|
||||||
"test": "jest"
|
"test": "jest"
|
||||||
@@ -29,6 +29,7 @@
|
|||||||
"builtin-modules": "^3.3.0",
|
"builtin-modules": "^3.3.0",
|
||||||
"jest": "^27.5.1",
|
"jest": "^27.5.1",
|
||||||
"obsidian": "latest",
|
"obsidian": "latest",
|
||||||
|
"obsidian-text-extract": "link:C:/Dev/Obsidian/obsidian-text-extract/dist",
|
||||||
"prettier": "^2.7.1",
|
"prettier": "^2.7.1",
|
||||||
"prettier-plugin-svelte": "^2.8.0",
|
"prettier-plugin-svelte": "^2.8.0",
|
||||||
"rollup": "^2.79.1",
|
"rollup": "^2.79.1",
|
||||||
|
|||||||
2
pnpm-lock.yaml
generated
2
pnpm-lock.yaml
generated
@@ -23,6 +23,7 @@ specifiers:
|
|||||||
lodash-es: 4.17.21
|
lodash-es: 4.17.21
|
||||||
minisearch: 5.0.0
|
minisearch: 5.0.0
|
||||||
obsidian: latest
|
obsidian: latest
|
||||||
|
obsidian-text-extract: link:C:/Dev/Obsidian/obsidian-text-extract/dist
|
||||||
p-limit: ^4.0.0
|
p-limit: ^4.0.0
|
||||||
pako: ^2.0.4
|
pako: ^2.0.4
|
||||||
prettier: ^2.7.1
|
prettier: ^2.7.1
|
||||||
@@ -66,6 +67,7 @@ devDependencies:
|
|||||||
builtin-modules: 3.3.0
|
builtin-modules: 3.3.0
|
||||||
jest: 27.5.1
|
jest: 27.5.1
|
||||||
obsidian: 0.16.3
|
obsidian: 0.16.3
|
||||||
|
obsidian-text-extract: link:../obsidian-text-extract/dist
|
||||||
prettier: 2.7.1
|
prettier: 2.7.1
|
||||||
prettier-plugin-svelte: 2.8.0_ibge6ami6vq2q2j5g4rcvk62hq
|
prettier-plugin-svelte: 2.8.0_ibge6ami6vq2q2j5g4rcvk62hq
|
||||||
rollup: 2.79.1
|
rollup: 2.79.1
|
||||||
|
|||||||
@@ -1,12 +1,10 @@
|
|||||||
import { nodeResolve } from '@rollup/plugin-node-resolve'
|
import { nodeResolve } from '@rollup/plugin-node-resolve'
|
||||||
import commonjs from '@rollup/plugin-commonjs'
|
import commonjs from '@rollup/plugin-commonjs'
|
||||||
import { base64 } from 'rollup-plugin-base64'
|
|
||||||
import typescript from '@rollup/plugin-typescript'
|
import typescript from '@rollup/plugin-typescript'
|
||||||
import svelte from 'rollup-plugin-svelte'
|
import svelte from 'rollup-plugin-svelte'
|
||||||
import autoPreprocess from 'svelte-preprocess'
|
import autoPreprocess from 'svelte-preprocess'
|
||||||
import copy from 'rollup-plugin-copy'
|
import copy from 'rollup-plugin-copy'
|
||||||
import { terser } from 'rollup-plugin-terser'
|
import { terser } from 'rollup-plugin-terser'
|
||||||
import webWorkerLoader from 'rollup-plugin-web-worker-loader'
|
|
||||||
|
|
||||||
const banner = `/*
|
const banner = `/*
|
||||||
THIS IS A GENERATED/BUNDLED FILE BY ROLLUP
|
THIS IS A GENERATED/BUNDLED FILE BY ROLLUP
|
||||||
@@ -33,7 +31,6 @@ export default {
|
|||||||
}),
|
}),
|
||||||
typescript(),
|
typescript(),
|
||||||
commonjs(),
|
commonjs(),
|
||||||
base64({ include: '**/*.wasm' }),
|
|
||||||
copy({
|
copy({
|
||||||
targets: [
|
targets: [
|
||||||
{ src: 'manifest.json', dest: 'dist' },
|
{ src: 'manifest.json', dest: 'dist' },
|
||||||
@@ -41,7 +38,6 @@ export default {
|
|||||||
{ src: 'assets/.gitignore', dest: 'dist' },
|
{ src: 'assets/.gitignore', dest: 'dist' },
|
||||||
],
|
],
|
||||||
}),
|
}),
|
||||||
webWorkerLoader({ inline: true, forceInline: true, targetPlatform: "browser" }),
|
|
||||||
production && terser(),
|
production && terser(),
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@@ -9,8 +9,8 @@ import {
|
|||||||
import * as NotesIndex from './notes-index'
|
import * as NotesIndex from './notes-index'
|
||||||
import type { TFile } from 'obsidian'
|
import type { TFile } from 'obsidian'
|
||||||
import type { IndexedDocument } from './globals'
|
import type { IndexedDocument } from './globals'
|
||||||
import { pdfManager } from './pdf/pdf-manager'
|
|
||||||
import { getNonExistingNotes } from './tools/notes'
|
import { getNonExistingNotes } from './tools/notes'
|
||||||
|
import { getPdfText } from 'obsidian-text-extract'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return all plaintext files as IndexedDocuments
|
* Return all plaintext files as IndexedDocuments
|
||||||
@@ -60,7 +60,7 @@ export async function fileToIndexedDocument(
|
|||||||
if (isFilePlaintext(file.path)) {
|
if (isFilePlaintext(file.path)) {
|
||||||
content = removeDiacritics(await app.vault.cachedRead(file))
|
content = removeDiacritics(await app.vault.cachedRead(file))
|
||||||
} else if (file.path.endsWith('.pdf')) {
|
} else if (file.path.endsWith('.pdf')) {
|
||||||
content = removeDiacritics(await pdfManager.getPdfText(file))
|
content = removeDiacritics(await getPdfText(file))
|
||||||
} else {
|
} else {
|
||||||
throw new Error('Invalid file: ' + file.path)
|
throw new Error('Invalid file: ' + file.path)
|
||||||
}
|
}
|
||||||
@@ -73,7 +73,8 @@ export async function fileToIndexedDocument(
|
|||||||
if (metadata?.frontmatter?.['excalidraw-plugin']) {
|
if (metadata?.frontmatter?.['excalidraw-plugin']) {
|
||||||
const comments = metadata.sections?.filter(s => s.type === 'comment') ?? []
|
const comments = metadata.sections?.filter(s => s.type === 'comment') ?? []
|
||||||
for (const { start, end } of comments.map(c => c.position)) {
|
for (const { start, end } of comments.map(c => c.position)) {
|
||||||
content = content.substring(0, start.offset-1) + content.substring(end.offset)
|
content =
|
||||||
|
content.substring(0, start.offset - 1) + content.substring(end.offset)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
55
src/lib.rs
55
src/lib.rs
@@ -1,55 +0,0 @@
|
|||||||
use js_sys::Uint8Array;
|
|
||||||
use pdf_extract::extract_text_from_mem;
|
|
||||||
use wasm_bindgen::prelude::*;
|
|
||||||
|
|
||||||
// mod obsidian;
|
|
||||||
|
|
||||||
#[wasm_bindgen]
|
|
||||||
pub fn extract_pdf_text(arr: Uint8Array) -> Result<String, JsError> {
|
|
||||||
match extract_text_from_mem(&arr.to_vec()) {
|
|
||||||
Ok(txt) => return Ok(txt),
|
|
||||||
Err(e) => return Err(JsError::new(&e.to_string())),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// #[wasm_bindgen]
|
|
||||||
// pub struct ExampleCommand {
|
|
||||||
// id: JsString,
|
|
||||||
// name: JsString,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// #[wasm_bindgen]
|
|
||||||
// impl ExampleCommand {
|
|
||||||
// #[wasm_bindgen(getter)]
|
|
||||||
// pub fn id(&self) -> JsString {
|
|
||||||
// self.id.clone()
|
|
||||||
// }
|
|
||||||
|
|
||||||
// #[wasm_bindgen(setter)]
|
|
||||||
// pub fn set_id(&mut self, id: &str) {
|
|
||||||
// self.id = JsString::from(id)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// #[wasm_bindgen(getter)]
|
|
||||||
// pub fn name(&self) -> JsString {
|
|
||||||
// self.name.clone()
|
|
||||||
// }
|
|
||||||
|
|
||||||
// #[wasm_bindgen(setter)]
|
|
||||||
// pub fn set_name(&mut self, name: &str) {
|
|
||||||
// self.name = JsString::from(name)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub fn callback(&self) {
|
|
||||||
// obsidian::Notice::new("hello from rust");
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// #[wasm_bindgen]
|
|
||||||
// pub fn onload(plugin: &obsidian::Plugin) {
|
|
||||||
// let cmd = ExampleCommand {
|
|
||||||
// id: JsString::from("example"),
|
|
||||||
// name: JsString::from("Example"),
|
|
||||||
// };
|
|
||||||
// plugin.addCommand(JsValue::from(cmd))
|
|
||||||
// }
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
use wasm_bindgen::prelude::*;
|
|
||||||
|
|
||||||
#[wasm_bindgen(module = "obsidian")]
|
|
||||||
extern "C" {
|
|
||||||
pub type Plugin;
|
|
||||||
|
|
||||||
#[wasm_bindgen(structural, method)]
|
|
||||||
pub fn addCommand(this: &Plugin, command: JsValue);
|
|
||||||
|
|
||||||
pub type Notice;
|
|
||||||
|
|
||||||
#[wasm_bindgen(constructor)]
|
|
||||||
pub fn new(message: &str) -> Notice;
|
|
||||||
}
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
import type { TFile } from 'obsidian'
|
|
||||||
import WebWorker from 'web-worker:./pdf-worker.ts'
|
|
||||||
import { makeMD5 } from '../tools/utils'
|
|
||||||
import { database } from '../database'
|
|
||||||
|
|
||||||
const workerTimeout = 120_000
|
|
||||||
|
|
||||||
class PDFWorker {
|
|
||||||
private static pool: PDFWorker[] = []
|
|
||||||
static getWorker(): PDFWorker {
|
|
||||||
const free = PDFWorker.pool.find(w => !w.running)
|
|
||||||
if (free) {
|
|
||||||
return free
|
|
||||||
}
|
|
||||||
const worker = new PDFWorker(new WebWorker({ name: 'PDF Text Extractor' }))
|
|
||||||
PDFWorker.pool.push(worker)
|
|
||||||
return worker
|
|
||||||
}
|
|
||||||
|
|
||||||
private running = false
|
|
||||||
|
|
||||||
private constructor(private worker: Worker) {}
|
|
||||||
|
|
||||||
public async run(msg: any): Promise<any> {
|
|
||||||
return new Promise((resolve, reject) => {
|
|
||||||
this.running = true
|
|
||||||
|
|
||||||
const timeout = setTimeout(() => {
|
|
||||||
this.worker.terminate()
|
|
||||||
console.warn('Omnisearch - Worker timeout')
|
|
||||||
reject('timeout')
|
|
||||||
this.running = false
|
|
||||||
}, workerTimeout)
|
|
||||||
|
|
||||||
this.worker.postMessage(msg)
|
|
||||||
this.worker.onmessage = evt => {
|
|
||||||
clearTimeout(timeout)
|
|
||||||
resolve(evt)
|
|
||||||
this.running = false
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class PDFManager {
|
|
||||||
public async getPdfText(file: TFile): Promise<string> {
|
|
||||||
// 1) Check if we can find by path & size
|
|
||||||
const docByPath = await database.pdf.get({
|
|
||||||
path: file.path,
|
|
||||||
size: file.stat.size,
|
|
||||||
})
|
|
||||||
|
|
||||||
if (docByPath) {
|
|
||||||
return docByPath.text
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) Check by hash
|
|
||||||
const data = new Uint8Array(await app.vault.readBinary(file))
|
|
||||||
const hash = makeMD5(data)
|
|
||||||
const docByHash = await database.pdf.get(hash)
|
|
||||||
if (docByHash) {
|
|
||||||
return docByHash.text
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3) The PDF is not cached, extract it
|
|
||||||
const worker = PDFWorker.getWorker() // new PDFWorker({ name: 'PDF Text Extractor' })
|
|
||||||
return new Promise(async (resolve, reject) => {
|
|
||||||
try {
|
|
||||||
const res = await worker.run({ data, name: file.basename })
|
|
||||||
const text = (res.data.text as string)
|
|
||||||
// Replace \n with spaces
|
|
||||||
.replace(/\n/g, ' ')
|
|
||||||
// Trim multiple spaces
|
|
||||||
.replace(/ +/g, ' ')
|
|
||||||
.trim()
|
|
||||||
|
|
||||||
// Add it to the cache
|
|
||||||
database.pdf
|
|
||||||
.add({ hash, text, path: file.path, size: file.stat.size })
|
|
||||||
.then(() => {
|
|
||||||
resolve(text)
|
|
||||||
})
|
|
||||||
} catch (e) {
|
|
||||||
// In case of error (unreadable PDF or timeout) just add
|
|
||||||
// an empty string to the cache
|
|
||||||
database.pdf
|
|
||||||
.add({ hash, text: '', path: file.path, size: file.stat.size })
|
|
||||||
.then(() => {
|
|
||||||
resolve('')
|
|
||||||
})
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Removes the outdated cache entries
|
|
||||||
*/
|
|
||||||
public async cleanCache(): Promise<void> {
|
|
||||||
database.pdf.each(async item => {
|
|
||||||
if (!(await app.vault.adapter.exists(item.path))) {
|
|
||||||
console.log(item.path + ' does not exist')
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export const pdfManager = new PDFManager()
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
import rustPlugin from '../../pkg/obsidian_search_bg.wasm'
|
|
||||||
import * as plugin from '../../pkg'
|
|
||||||
|
|
||||||
const decodedPlugin = decodeBase64(rustPlugin as any)
|
|
||||||
|
|
||||||
onmessage = async evt => {
|
|
||||||
const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0))
|
|
||||||
await plugin.default(Promise.resolve(buffer))
|
|
||||||
try {
|
|
||||||
const text = plugin.extract_pdf_text(evt.data.data as Uint8Array)
|
|
||||||
self.postMessage({ text })
|
|
||||||
} catch (e) {
|
|
||||||
console.warn('Omnisearch - Could not extract text from ' + evt.data.name)
|
|
||||||
self.postMessage({ text: '' })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function decodeBase64(data: string) {
|
|
||||||
return atob(data)
|
|
||||||
// return Buffer.from(data, 'base64').toString()
|
|
||||||
}
|
|
||||||
1
src/typings/types.d.ts
vendored
1
src/typings/types.d.ts
vendored
@@ -1 +0,0 @@
|
|||||||
declare module 'pdfjs-dist/build/pdf.worker.entry';
|
|
||||||
4
src/typings/workers.d.ts
vendored
4
src/typings/workers.d.ts
vendored
@@ -1,4 +0,0 @@
|
|||||||
declare module "web-worker:*" {
|
|
||||||
const WorkerFactory: new (options: any) => Worker;
|
|
||||||
export default WorkerFactory;
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user