From 1cd151b1fdc7968846b17bcceff746367d3e969e Mon Sep 17 00:00:00 2001
From: Simon Cambier
Date: Sun, 30 Oct 2022 20:45:55 +0100
Subject: [PATCH] OCR bases

---
Review notes (this section is not part of the commit message):
- getImageFiles() repeats the extension test that isFileImage() also
  implements; the two lists have to be kept in sync by hand.
- The extension tests use a case-sensitive endsWith(), so files named
  e.g. `photo.PNG` are not matched.

 src/file-loader.ts | 34 +++++++++++++++++++++++++++++++++-
 src/main.ts        | 12 +++++++++++-
 src/tools/utils.ts |  6 ++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/src/file-loader.ts b/src/file-loader.ts
index dcf8085..f6409d8 100644
--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -3,6 +3,7 @@ import {
   extractHeadingsFromCache,
   getAliasesFromMetadata,
   getTagsFromMetadata,
+  isFileImage,
   isFilePlaintext,
   removeDiacritics,
 } from './tools/utils'
@@ -10,7 +11,7 @@ import * as NotesIndex from './notes-index'
 import type { TFile } from 'obsidian'
 import type { IndexedDocument } from './globals'
 import { getNonExistingNotes } from './tools/notes'
-import { getPdfText } from 'obsidian-text-extract'
+import { getImageText, getPdfText } from 'obsidian-text-extract'
 
 /**
  * Return all plaintext files as IndexedDocuments
@@ -48,6 +49,35 @@ export async function getPDFFiles(): Promise<IndexedDocument[]> {
   return data
 }
 
+/**
+ * Return all Image files as IndexedDocuments.
+ * If an image isn't cached, it will be read from the disk and added to the IndexedDB
+ */
+export async function getImageFiles(): Promise<IndexedDocument[]> {
+  const allFiles = app.vault
+    .getFiles()
+    .filter(
+      f =>
+        f.path.endsWith('.png') ||
+        f.path.endsWith('.jpg') ||
+        f.path.endsWith('.jpeg')
+    )
+  const data: IndexedDocument[] = []
+
+  const input = []
+  for (const file of allFiles) {
+    input.push(
+      NotesIndex.processQueue(async () => {
+        const doc = await fileToIndexedDocument(file)
+        await cacheManager.updateDocument(file.path, doc)
+        data.push(doc)
+      })
+    )
+  }
+  await Promise.all(input)
+  return data
+}
+
 /**
  * Convert a file into an IndexedDocument.
  * Will use the cache if possible.
@@ -61,6 +91,8 @@ export async function fileToIndexedDocument(
     content = removeDiacritics(await app.vault.cachedRead(file))
   } else if (file.path.endsWith('.pdf')) {
     content = removeDiacritics(await getPdfText(file))
+  } else if (isFileImage(file.path)) {
+    content = removeDiacritics(await getImageText(file))
   } else {
     throw new Error('Invalid file: ' + file.path)
   }
diff --git a/src/main.ts b/src/main.ts
index d14427a..1d07815 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,5 +1,5 @@
 import { Notice, Plugin, TFile } from 'obsidian'
-import {SearchEngine} from './search/search-engine'
+import { SearchEngine } from './search/search-engine'
 import {
   OmnisearchInFileModal,
   OmnisearchVaultModal,
@@ -128,6 +128,16 @@ async function populateIndex(): Promise<void> {
     console.timeEnd('Omnisearch - Timing')
   }
 
+  // Load Images
+  // if (settings.PDFIndexing) {
+  console.time('Omnisearch - Timing')
+  const images = await FileLoader.getImageFiles()
+  // Index them
+  await tmpEngine.addAllToMinisearch(images)
+  console.log(`Omnisearch - Indexed ${images.length} Images`)
+  console.timeEnd('Omnisearch - Timing')
+  // }
+
   // Load PDFs into the main search engine, and write cache
   SearchEngine.loadTmpDataIntoMain()
   SearchEngine.isIndexing.set(false)
diff --git a/src/tools/utils.ts b/src/tools/utils.ts
index 8e13bf5..a5f2c90 100644
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -188,6 +188,12 @@ export function isFileIndexable(path: string): boolean {
   )
 }
 
+export function isFileImage(path: string): boolean {
+  return (
+    path.endsWith('.png') || path.endsWith('.jpg') || path.endsWith('.jpeg')
+  )
+}
+
 export function isFilePlaintext(path: string): boolean {
   return getPlaintextExtensions().some(t => path.endsWith(`.${t}`))
 }