#58 - Experimental PDF support

This commit is contained in:
Simon Cambier
2022-09-29 22:10:19 +02:00
parent 1314fc6269
commit 5266ee76b3
8 changed files with 97 additions and 34 deletions

View File

@@ -27,7 +27,7 @@
let groupedOffsets: number[] = []
let selectedIndex = 0
let note: ResultNote | null = null
let note: ResultNote | undefined
let query: Query
onMount(() => {

18
src/pdf-parser.ts Normal file
View File

@@ -0,0 +1,18 @@
import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
import type { TextItem } from 'pdfjs-dist/types/src/display/api'
import type { TFile } from 'obsidian'
PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946
/**
 * Extracts the plain text of a PDF file stored in the vault.
 *
 * The file is read as binary, opened with pdf.js, and the text content of
 * every page is fetched concurrently. Page texts are returned in document
 * order.
 *
 * @param file - The vault file to read; expected to be a PDF.
 * @returns The text of all pages, with a newline between consecutive pages.
 */
export async function getPdfText(file: TFile): Promise<string> {
  const data = await app.vault.readBinary(file)
  const doc = await PDFJs.getDocument(data).promise
  try {
    const pageTexts = Array.from({ length: doc.numPages }, async (_, i) => {
      // Array indices start at 0, pdf.js page numbers start at 1
      const page = await doc.getPage(i + 1)
      const content = await page.getTextContent()
      // Text items of a page are concatenated as-is, without a separator
      return (content.items as TextItem[]).map(token => token.str).join('')
    })
    // Separate pages so that the last word of a page is not fused with the
    // first word of the next one in the indexed text
    return (await Promise.all(pageTexts)).join('\n')
  } finally {
    // Release the resources pdf.js holds for this document, whether or not
    // the extraction succeeded
    await doc.destroy()
  }
}

View File

@@ -19,7 +19,7 @@ import {
wait,
} from './utils'
import type { Query } from './query'
import { settings } from './settings'
import { settings } from './settings'
import {
removeNoteFromCache,
getNoteFromCache,
@@ -32,6 +32,7 @@ import {
saveNotesCacheToFile,
isCacheOutdated,
} from './notes'
import { getPdfText } from './pdf-parser'
let minisearchInstance: MiniSearch<IndexedNote>
let isIndexChanged: boolean
@@ -326,8 +327,13 @@ export async function addToIndex(file: TAbstractFile): Promise<void> {
throw new Error(`${file.basename} is already indexed`)
}
// Fetch content from the cache to index it as-is
const content = removeDiacritics(await app.vault.cachedRead(file))
let content
if (file.path.endsWith('.pdf')) {
content = removeDiacritics(await getPdfText(file as TFile))
} else {
// Fetch content from the cache to index it as-is
content = removeDiacritics(await app.vault.cachedRead(file))
}
// Make the document and index it
const note: IndexedNote = {

24
src/types-obsidian.d.ts vendored Normal file
View File

@@ -0,0 +1,24 @@
import type { MetadataCache, ViewState, Vault } from 'obsidian'
/**
 * Module augmentation for 'obsidian': extends the typings of a few
 * interfaces with additional members.
 * NOTE(review): presumably these members are missing from (or typed
 * differently in) the published Obsidian typings — confirm against the
 * installed version.
 */
declare module 'obsidian' {
  interface MetadataCache {
    // Optional: callers must check the method exists before calling it
    isUserIgnored?(path: string): boolean
  }

  interface FrontMatterCache {
    // Each field may hold either a list of values or a single string
    aliases?: string[] | string
    tags?: string[] | string
  }

  interface ViewState {
    state?: {
      file?: string
    }
  }

  interface Vault {
    // The parameter needs an explicit name: `getConfig(string)` would declare
    // a parameter called `string` with an implicit `any` type
    getConfig(key: string): unknown
  }
}

23
src/types.d.ts vendored
View File

@@ -1,22 +1 @@
import type { MetadataCache, ViewState, Vault } from 'obsidian'
/**
 * Module augmentation for 'obsidian': extends the typings of a few
 * interfaces with additional members.
 * NOTE(review): presumably these members are missing from (or typed
 * differently in) the published Obsidian typings — confirm against the
 * installed version.
 */
declare module 'obsidian' {
  interface MetadataCache {
    // Optional: callers must check the method exists before calling it
    isUserIgnored?(path: string): boolean
  }

  interface FrontMatterCache {
    // Each field may hold either a list of values or a single string
    aliases?: string[] | string
    tags?: string[] | string
  }

  interface ViewState {
    state?: {
      file?: string
    }
  }

  interface Vault {
    // The parameter needs an explicit name: `getConfig(string)` would declare
    // a parameter called `string` with an implicit `any` type
    getConfig(key: string): unknown
  }
}
declare module 'pdfjs-dist/build/pdf.worker.entry';

View File

@@ -174,7 +174,7 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
/**
 * Tells whether the file at `path` should be indexed.
 *
 * A path qualifies when it ends with `.md` or `.pdf`, or with `.<type>` for
 * any entry of `settings.indexedFileTypes`. The comparison is case-sensitive.
 *
 * @param path - Path of the file to check.
 * @returns `true` if the path has an indexable extension.
 */
export function isFileIndexable(path: string): boolean {
  return (
    path.endsWith('.md') ||
    path.endsWith('.pdf') ||
    settings.indexedFileTypes.some(t => path.endsWith(`.${t}`))
  )
}