#58 - Experimental PDF support

This commit is contained in:
Simon Cambier
2022-09-29 22:10:19 +02:00
parent 1314fc6269
commit 5266ee76b3
8 changed files with 97 additions and 34 deletions

View File

@@ -37,6 +37,12 @@
}, },
"dependencies": { "dependencies": {
"@vanakat/plugin-api": "^0.1.0", "@vanakat/plugin-api": "^0.1.0",
"minisearch": "^5.0.0" "minisearch": "^5.0.0",
"pdfjs-dist": "^2.16.105"
},
"pnpm": {
"overrides": {
"moment@>=2.18.0 <2.29.4": ">=2.29.4"
}
} }
} }

42
pnpm-lock.yaml generated
View File

@@ -1,5 +1,8 @@
lockfileVersion: 5.3 lockfileVersion: 5.3
overrides:
moment@>=2.18.0 <2.29.4: '>=2.29.4'
specifiers: specifiers:
'@babel/preset-env': ^7.19.0 '@babel/preset-env': ^7.19.0
'@babel/preset-typescript': ^7.18.6 '@babel/preset-typescript': ^7.18.6
@@ -12,10 +15,13 @@ specifiers:
builtin-modules: ^3.3.0 builtin-modules: ^3.3.0
esbuild: 0.13.12 esbuild: 0.13.12
esbuild-plugin-copy: ^1.3.0 esbuild-plugin-copy: ^1.3.0
esbuild-plugin-wasm: ^1.0.0
esbuild-svelte: ^0.7.1 esbuild-svelte: ^0.7.1
extract-pdf: C:\Dev\rust\extract-pdf\pkg
jest: ^27.5.1 jest: ^27.5.1
minisearch: ^5.0.0 minisearch: ^5.0.0
obsidian: latest obsidian: latest
pdfjs-dist: ^2.16.105
prettier: ^2.7.1 prettier: ^2.7.1
prettier-plugin-svelte: ^2.7.0 prettier-plugin-svelte: ^2.7.0
svelte: ^3.50.1 svelte: ^3.50.1
@@ -26,7 +32,9 @@ specifiers:
dependencies: dependencies:
'@vanakat/plugin-api': 0.1.0 '@vanakat/plugin-api': 0.1.0
extract-pdf: link:../../rust/extract-pdf/pkg
minisearch: 5.0.0 minisearch: 5.0.0
pdfjs-dist: 2.16.105
devDependencies: devDependencies:
'@babel/preset-env': 7.19.0 '@babel/preset-env': 7.19.0
@@ -39,6 +47,7 @@ devDependencies:
builtin-modules: 3.3.0 builtin-modules: 3.3.0
esbuild: 0.13.12 esbuild: 0.13.12
esbuild-plugin-copy: 1.3.0_esbuild@0.13.12 esbuild-plugin-copy: 1.3.0_esbuild@0.13.12
esbuild-plugin-wasm: 1.0.0
esbuild-svelte: 0.7.1_esbuild@0.13.12+svelte@3.50.1 esbuild-svelte: 0.7.1_esbuild@0.13.12+svelte@3.50.1
jest: 27.5.1 jest: 27.5.1
obsidian: 0.16.3 obsidian: 0.16.3
@@ -2433,6 +2442,10 @@ packages:
webidl-conversions: 5.0.0 webidl-conversions: 5.0.0
dev: true dev: true
/dommatrix/1.0.3:
resolution: {integrity: sha512-l32Xp/TLgWb8ReqbVJAFIvXmY7go4nTxxlWiAFyhoQw9RKEOHBZNnyGvJWqDVSPmq3Y9HlM4npqF/T6VMOXhww==}
dev: false
/electron-to-chromium/1.4.247: /electron-to-chromium/1.4.247:
resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==} resolution: {integrity: sha512-FLs6R4FQE+1JHM0hh3sfdxnYjKvJpHZyhQDjc2qFq/xFvmmRt/TATNToZhrcGUFzpF2XjeiuozrA8lI0PZmYYw==}
dev: true dev: true
@@ -2571,6 +2584,11 @@ packages:
globby: 11.1.0 globby: 11.1.0
dev: true dev: true
/esbuild-plugin-wasm/1.0.0:
resolution: {integrity: sha512-iXIf3hwfqorExG66/eNr3U8JakIZuge70nMNQtinvxbzdljQ/RjvwaBiGPqF/DvuIumUApbe3zj2kqHLVyc7uQ==}
engines: {node: '>=0.10.0'}
dev: true
/esbuild-sunos-64/0.13.12: /esbuild-sunos-64/0.13.12:
resolution: {integrity: sha512-jBsF+e0woK3miKI8ufGWKG3o3rY9DpHvCVRn5eburMIIE+2c+y3IZ1srsthKyKI6kkXLvV4Cf/E7w56kLipMXw==} resolution: {integrity: sha512-jBsF+e0woK3miKI8ufGWKG3o3rY9DpHvCVRn5eburMIIE+2c+y3IZ1srsthKyKI6kkXLvV4Cf/E7w56kLipMXw==}
cpu: [x64] cpu: [x64]
@@ -3768,13 +3786,8 @@ packages:
minimist: 1.2.6 minimist: 1.2.6
dev: true dev: true
/moment/2.29.2:
resolution: {integrity: sha512-UgzG4rvxYpN15jgCmVJwac49h9ly9NurikMWGPdVxm8GZD6XjkKPxDTjQQ43gtGgnV3X0cAyWDdP2Wexoquifg==}
dev: false
/moment/2.29.4: /moment/2.29.4:
resolution: {integrity: sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==} resolution: {integrity: sha512-5LC9SOxjSc2HF6vO2CyuTDNivEdoz2IvyJJGj6X8DJ0eFyfszE0QiEd+iXmBvUP3WHxSjFH/vIsA0EN00cgr8w==}
dev: true
/ms/2.1.2: /ms/2.1.2:
resolution: {integrity: sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==} resolution: {integrity: sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==}
@@ -3828,7 +3841,7 @@ packages:
'@codemirror/state': 0.19.9 '@codemirror/state': 0.19.9
'@codemirror/view': 0.19.48 '@codemirror/view': 0.19.48
'@types/codemirror': 0.0.108 '@types/codemirror': 0.0.108
moment: 2.29.2 moment: 2.29.4
dev: false dev: false
/obsidian/0.16.3: /obsidian/0.16.3:
@@ -3922,6 +3935,18 @@ packages:
resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==}
engines: {node: '>=8'} engines: {node: '>=8'}
/pdfjs-dist/2.16.105:
resolution: {integrity: sha512-J4dn41spsAwUxCpEoVf6GVoz908IAA3mYiLmNxg8J9kfRXc2jxpbUepcP0ocp0alVNLFthTAM8DZ1RaHh8sU0A==}
peerDependencies:
worker-loader: ^3.0.8
peerDependenciesMeta:
worker-loader:
optional: true
dependencies:
dommatrix: 1.0.3
web-streams-polyfill: 3.2.1
dev: false
/picocolors/1.0.0: /picocolors/1.0.0:
resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
dev: true dev: true
@@ -4561,6 +4586,11 @@ packages:
makeerror: 1.0.12 makeerror: 1.0.12
dev: true dev: true
/web-streams-polyfill/3.2.1:
resolution: {integrity: sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==}
engines: {node: '>= 8'}
dev: false
/webidl-conversions/5.0.0: /webidl-conversions/5.0.0:
resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==} resolution: {integrity: sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==}
engines: {node: '>=8'} engines: {node: '>=8'}

View File

@@ -27,7 +27,7 @@
let groupedOffsets: number[] = [] let groupedOffsets: number[] = []
let selectedIndex = 0 let selectedIndex = 0
let note: ResultNote | null = null let note: ResultNote | undefined
let query: Query let query: Query
onMount(() => { onMount(() => {

18
src/pdf-parser.ts Normal file
View File

@@ -0,0 +1,18 @@
import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
import type { TextItem } from 'pdfjs-dist/types/src/display/api'
import type { TFile } from 'obsidian'
PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946
/**
 * Extracts the plain text of a PDF file stored in the vault.
 *
 * Reads the file as binary, loads it with pdf.js and concatenates the text
 * items of every page. Line ends reported by pdf.js (`hasEOL`) and page
 * boundaries are emitted as newlines, so the last word of a line or page is
 * not glued to the first word of the next one.
 *
 * @param file - The PDF file to read.
 * @returns The text of all pages, in page order, separated by newlines.
 */
export async function getPdfText(file: TFile): Promise<string> {
  const data = await app.vault.readBinary(file)
  const doc = await PDFJs.getDocument(data).promise
  try {
    const pageTexts = Array.from({ length: doc.numPages }, async (_, i) => {
      // Pages are requested as 1..numPages
      const page = await doc.getPage(i + 1)
      const content = await page.getTextContent()
      return (content.items as TextItem[])
        .map(item => (item.hasEOL ? `${item.str}\n` : item.str))
        .join('')
    })
    return (await Promise.all(pageTexts)).join('\n')
  } finally {
    // Release the document (and the worker resources pdf.js holds for it),
    // whether extraction succeeded or failed.
    await doc.destroy()
  }
}

View File

@@ -19,7 +19,7 @@ import {
wait, wait,
} from './utils' } from './utils'
import type { Query } from './query' import type { Query } from './query'
import { settings } from './settings' import { settings } from './settings'
import { import {
removeNoteFromCache, removeNoteFromCache,
getNoteFromCache, getNoteFromCache,
@@ -32,6 +32,7 @@ import {
saveNotesCacheToFile, saveNotesCacheToFile,
isCacheOutdated, isCacheOutdated,
} from './notes' } from './notes'
import { getPdfText } from './pdf-parser'
let minisearchInstance: MiniSearch<IndexedNote> let minisearchInstance: MiniSearch<IndexedNote>
let isIndexChanged: boolean let isIndexChanged: boolean
@@ -326,8 +327,13 @@ export async function addToIndex(file: TAbstractFile): Promise<void> {
throw new Error(`${file.basename} is already indexed`) throw new Error(`${file.basename} is already indexed`)
} }
// Fetch content from the cache to index it as-is let content
const content = removeDiacritics(await app.vault.cachedRead(file)) if (file.path.endsWith('.pdf')) {
content = removeDiacritics(await getPdfText(file as TFile))
} else {
// Fetch content from the cache to index it as-is
content = removeDiacritics(await app.vault.cachedRead(file))
}
// Make the document and index it // Make the document and index it
const note: IndexedNote = { const note: IndexedNote = {

24
src/types-obsidian.d.ts vendored Normal file
View File

@@ -0,0 +1,24 @@
import type { MetadataCache, ViewState, Vault } from 'obsidian'
/**
 * Module augmentation adding members to the 'obsidian' typings.
 * NOTE(review): presumably these members exist at runtime but are missing from
 * the published typings — confirm against the Obsidian version in use.
 */
declare module 'obsidian' {
  interface MetadataCache {
    /** Declared optional: check that it exists before calling it. */
    isUserIgnored?(path: string): boolean
  }

  interface FrontMatterCache {
    /** Either a list of values or a single string. */
    aliases?: string[] | string
    /** Either a list of values or a single string. */
    tags?: string[] | string
  }

  interface ViewState {
    state?: {
      file?: string
    }
  }

  interface Vault {
    /**
     * The parameter needs an explicit name and type: a bare `string` in
     * parameter position is parsed as a parameter *named* `string` with an
     * implicit `any` type.
     *
     * @returns `unknown` — narrow the value before using it.
     */
    getConfig(key: string): unknown
  }
}

23
src/types.d.ts vendored
View File

@@ -1,22 +1 @@
import type { MetadataCache, ViewState, Vault } from 'obsidian' declare module 'pdfjs-dist/build/pdf.worker.entry';
declare module 'obsidian' {
interface MetadataCache {
isUserIgnored?(path: string): boolean
}
interface FrontMatterCache {
aliases?: string[] | string
tags?: string[] | string
}
interface ViewState {
state?: {
file?: string
}
}
interface Vault {
getConfig(string): unknown
}
}

View File

@@ -174,7 +174,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
export function isFileIndexable(path: string): boolean { export function isFileIndexable(path: string): boolean {
return ( return (
path.endsWith('.md') || path.endsWith('.md') || path.endsWith('.pdf') ||
settings.indexedFileTypes.some(t => path.endsWith(`.${t}`)) settings.indexedFileTypes.some(t => path.endsWith(`.${t}`))
) )
} }