From 5266ee76b32e84ede1acea95d68ebfc4df6f39f2 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Thu, 29 Sep 2022 22:10:19 +0200 Subject: [PATCH] #58 - Experimental PDF support --- package.json | 8 +++++- pnpm-lock.yaml | 42 ++++++++++++++++++++++++++----- src/components/ModalInFile.svelte | 2 +- src/pdf-parser.ts | 18 +++++++++++++ src/search.ts | 12 ++++++--- src/types-obsidian.d.ts | 24 ++++++++++++++++++ src/types.d.ts | 23 +---------------- src/utils.ts | 2 +- 8 files changed, 97 insertions(+), 34 deletions(-) create mode 100644 src/pdf-parser.ts create mode 100644 src/types-obsidian.d.ts diff --git a/package.json b/package.json index f5c3851..ceab56b 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,12 @@ }, "dependencies": { "@vanakat/plugin-api": "^0.1.0", - "minisearch": "^5.0.0" + "minisearch": "^5.0.0", + "pdfjs-dist": "^2.16.105" + }, + "pnpm": { + "overrides": { + "moment@>=2.18.0 <2.29.4": ">=2.29.4" + } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0263238..b364f17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1,5 +1,8 @@ lockfileVersion: 5.3 +overrides: + moment@>=2.18.0 <2.29.4: '>=2.29.4' + specifiers: '@babel/preset-env': ^7.19.0 '@babel/preset-typescript': ^7.18.6 @@ -12,10 +15,13 @@ specifiers: builtin-modules: ^3.3.0 esbuild: 0.13.12 esbuild-plugin-copy: ^1.3.0 + esbuild-plugin-wasm: ^1.0.0 esbuild-svelte: ^0.7.1 + extract-pdf: C:\Dev\rust\extract-pdf\pkg jest: ^27.5.1 minisearch: ^5.0.0 obsidian: latest + pdfjs-dist: ^2.16.105 prettier: ^2.7.1 prettier-plugin-svelte: ^2.7.0 svelte: ^3.50.1 @@ -26,7 +32,9 @@ specifiers: dependencies: '@vanakat/plugin-api': 0.1.0 + extract-pdf: link:../../rust/extract-pdf/pkg minisearch: 5.0.0 + pdfjs-dist: 2.16.105 devDependencies: '@babel/preset-env': 7.19.0 @@ -39,6 +47,7 @@ devDependencies: builtin-modules: 3.3.0 esbuild: 0.13.12 esbuild-plugin-copy: 1.3.0_esbuild@0.13.12 + esbuild-plugin-wasm: 1.0.0 esbuild-svelte: 0.7.1_esbuild@0.13.12+svelte@3.50.1 jest: 27.5.1 obsidian: 0.16.3 @@ -2433,6 +2442,10 @@ packages: webidl-conversions: 5.0.0 dev: true + /dommatrix/1.0.3: + resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==} + dev: false + /electron-to-chromium/1.4.247: resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==} dev: true @@ -2571,6 +2584,11 @@ packages: globby: 11.1.0 dev: true + /esbuild-plugin-wasm/1.0.0: + resolution: {integrity: sha512-iXIf3hwfqorExG66/eNr3U8JakIZuge70nMNQtinvxbzdljQ/RjvwaBiGPqF/DvuIumUApbe3zj2kqHLVyc7uQ==} + engines: {node: '>=0.10.0'} + dev: true + /esbuild-sunos-64/0.13.12: resolution: {integrity: sha512-jBsF+e0woK3miKI8ufGWKG3o3rY9DpHvCVRn5eburMIIE+2c+y3IZ1srsthKyKI6kkXLvV4Cf/E7w56kLipMXw==} cpu: [x64] @@ -3768,13 +3786,8 @@ packages: minimist: 1.2.6 dev: true - /moment/2.29.2: - resolution: {integrity: sha512-UgzG4rvxYpN15jgCmVJwac49h9ly9NurikMWGPdVxm8GZD6XjkKPxDTjQQ43gtGgnV3X0cAyWDdP2Wexoquifg==} - dev: false - /moment/2.29.4: resolution: {integrity: sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==} - dev: true /ms/2.1.2: resolution: {integrity: sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==} @@ -3828,7 +3841,7 @@ packages: '@codemirror/state': 0.19.9 '@codemirror/view': 0.19.48 '@types/codemirror': 0.0.108 - moment: 2.29.2 + moment: 2.29.4 dev: false /obsidian/0.16.3: @@ -3922,6 +3935,18 @@ packages: resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} engines: {node: '>=8'} + /pdfjs-dist/2.16.105: + resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==} + peerDependencies: + worker-loader: ^3.0.8 + peerDependenciesMeta: + worker-loader: + optional: true + dependencies: + dommatrix: 1.0.3 + web-streams-polyfill: 3.2.1 + dev: false + /picocolors/1.0.0: resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} dev: true @@ -4561,6 +4586,11 @@ packages: makeerror: 1.0.12 dev: true + /web-streams-polyfill/3.2.1: + resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==} + engines: {node: '>= 8'} + dev: false + /webidl-conversions/5.0.0: resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==} engines: {node: '>=8'} diff --git a/src/components/ModalInFile.svelte b/src/components/ModalInFile.svelte index 2355c81..5341c37 100644 --- a/src/components/ModalInFile.svelte +++ b/src/components/ModalInFile.svelte @@ -27,7 +27,7 @@ let groupedOffsets: number[] = [] let selectedIndex = 0 - let note: ResultNote | null = null + let note: ResultNote | undefined let query: Query onMount(() => { diff --git a/src/pdf-parser.ts b/src/pdf-parser.ts new file mode 100644 index 0000000..59e6748 --- /dev/null +++ b/src/pdf-parser.ts @@ -0,0 +1,18 @@ +import PDFJs from 'pdfjs-dist' +import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry' +import type { TextItem } from 'pdfjs-dist/types/src/display/api' +import type { TFile } from 'obsidian' + +PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker + +// https://stackoverflow.com/a/59929946 +export async function getPdfText(file: TFile): Promise { + const data = await app.vault.readBinary(file) + const doc = await PDFJs.getDocument(data).promise + const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => { + const page = await doc.getPage(i + 1) + const content = await page.getTextContent() + return (content.items as TextItem[]).map(token => token.str).join('') + }) + return (await Promise.all(pageTexts)).join('') +} diff --git a/src/search.ts b/src/search.ts index c73c4d8..ff1e690 100644 --- a/src/search.ts +++ b/src/search.ts @@ -19,7 +19,7 @@ import { wait, } from './utils' import type { Query } from './query' -import { settings } from './settings' +import { settings } from './settings' import { removeNoteFromCache, getNoteFromCache, @@ -32,6 +32,7 @@ import { saveNotesCacheToFile, isCacheOutdated, } from './notes' +import { getPdfText } from './pdf-parser' let minisearchInstance: MiniSearch let isIndexChanged: boolean @@ -326,8 +327,13 @@ export async function addToIndex(file: TAbstractFile): Promise { throw new Error(`${file.basename} is already indexed`) } - // Fetch content from the cache to index it as-is - const content = removeDiacritics(await app.vault.cachedRead(file)) + let content + if (file.path.endsWith('.pdf')) { + content = removeDiacritics(await getPdfText(file as TFile)) + } else { + // Fetch content from the cache to index it as-is + content = removeDiacritics(await app.vault.cachedRead(file)) + } // Make the document and index it const note: IndexedNote = { diff --git a/src/types-obsidian.d.ts b/src/types-obsidian.d.ts new file mode 100644 index 0000000..5cc7d8f --- /dev/null +++ b/src/types-obsidian.d.ts @@ -0,0 +1,24 @@ +import type { MetadataCache, ViewState, Vault } from 'obsidian' + +declare module 'obsidian' { + interface MetadataCache { + isUserIgnored?(path: string): boolean + } + + interface FrontMatterCache { + aliases?: string[] | string + tags?: string[] | string + } + + interface ViewState { + state?: { + file?: string + } + } + + interface Vault { + getConfig(string): unknown + } +} + + diff --git a/src/types.d.ts b/src/types.d.ts index c29e77a..e1c9a55 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -1,22 +1 @@ -import type { MetadataCache, ViewState, Vault } from 'obsidian' - -declare module 'obsidian' { - interface MetadataCache { - isUserIgnored?(path: string): boolean - } - - interface FrontMatterCache { - aliases?: string[] | string - tags?: string[] | string - } - - interface ViewState { - state?: { - file?: string - } - } - - interface Vault { - getConfig(string): unknown - } -} +declare module 'pdfjs-dist/build/pdf.worker.entry'; \ No newline at end of file diff --git a/src/utils.ts b/src/utils.ts index b09cfb6..430d445 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -174,7 +174,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' { export function isFileIndexable(path: string): boolean { return ( - path.endsWith('.md') || + path.endsWith('.md') || path.endsWith('.pdf') || settings.indexedFileTypes.some(t => path.endsWith(`.${t}`)) ) }