From f0b2de4316e96cdaa9ecc2601827cee343e7e4a8 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Sun, 6 Nov 2022 17:04:46 +0100 Subject: [PATCH] #106 - Basic, English-only OCR --- README.md | 15 ++++---- package.json | 2 +- pnpm-lock.yaml | 8 ++--- src/components/ResultItemVault.svelte | 1 - src/file-loader.ts | 38 ++++++++++++++------ src/main.ts | 50 ++++++++++++++++----------- src/settings.ts | 25 +++++++++++++- src/tools/utils.ts | 4 ++- 8 files changed, 95 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 826a593..4e45232 100644 --- a/README.md +++ b/README.md @@ -124,15 +124,12 @@ See [styles.css](./assets/styles.css) for more information. **Omnisearch makes Obsidian sluggish at startup.** -- You may have _big_ documents. Huge notes (like novels) can freeze the interface for a short time when being indexed. While Omnisearch uses a cache between sessions, it's still rebuilt at startup to keep it up-to-date. +- While Omnisearch does its best to work smoothly in the background, bigger vaults can cause some hiccups at startup because of the search index size. -**I have thousands of notes, and at startup I have to wait a few seconds before Omnisearch gives me the context of a result.** +**Omnisearch gives inconsistent/invalid results, there are errors in the developer console** -- Omnisearch refreshes its index at startup. During this time, you can still find notes, but Omnisearch is not able to show you the excerpts. - -**Omnisearch gives inconsistent/invalid results, or there are errors in the developer console.** - -- Restart Obsidian to force a reindex of Omnisearch +- Restart Obsidian to force a reindex of Omnisearch. +- The cache can be corrupted; you can clear it at the bottom of the settings page, then restart Obsidian. 
**A query should return a result that does not appear.** @@ -153,6 +150,8 @@ You can write your issue [here](https://github.com/scambier/obsidian-omnisearch/ Omnisearch is licensed under [GPL-3](https://tldrlegal.com/license/gnu-general-public-license-v3-(gpl-3)). -## Sponsors +## Thanks + +To all people who donate through [Ko-Fi](https://ko-fi.com/scambier) or [Github Sponsors](https://github.com/sponsors/scambier) ❤ ![JetBrains Logo (Main) logo](https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.svg) diff --git a/package.json b/package.json index feabbf3..65b1723 100644 --- a/package.json +++ b/package.json @@ -44,7 +44,7 @@ "dexie": "^3.2.2", "lodash-es": "4.17.21", "minisearch": "github:scambier/minisearch#callback_desync", - "obsidian-text-extract": "1.0.1", + "obsidian-text-extract": "1.0.2", "pure-md5": "^0.1.14" }, "pnpm": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3775cc6..e0485b7 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,7 +23,7 @@ specifiers: lodash-es: 4.17.21 minisearch: github:scambier/minisearch#callback_desync obsidian: latest - obsidian-text-extract: 1.0.1 + obsidian-text-extract: 1.0.2 prettier: ^2.7.1 prettier-plugin-svelte: ^2.8.0 pure-md5: ^0.1.14 @@ -40,7 +40,7 @@ dependencies: dexie: 3.2.2 lodash-es: 4.17.21 minisearch: github.com/scambier/minisearch/adf11cab46d851220a41c9ad95ed986b630f0f3c - obsidian-text-extract: 1.0.1 + obsidian-text-extract: 1.0.2 pure-md5: 0.1.14 devDependencies: @@ -4326,8 +4326,8 @@ packages: object-keys: 1.1.1 dev: true - /obsidian-text-extract/1.0.1: - resolution: {integrity: sha512-IJlxbZi/WxzWKwnyruFP/7KUTed6MZW+74OGI5ovZVfikkWx7lfBe1rn23La2kq/BrwETIAugO5Ke4ppvp53gA==} + /obsidian-text-extract/1.0.2: + resolution: {integrity: sha512-OOnV1B0kTED46vxPRLOHdHeev6CqcXs6A39DE3IJaEV9PNZKPF3/f6d3t7/zFLgOVMNiVD9Uj+YziAhXeKt4lw==} dependencies: dexie: 3.2.2 p-limit: 4.0.0 diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte index 
43c91dc..75bf665 100644 --- a/src/components/ResultItemVault.svelte +++ b/src/components/ResultItemVault.svelte @@ -18,7 +18,6 @@ $: { imagePath = null if (isFileImage(note.path)) { - console.log(note.path) // @ts-ignore const file = app.vault.getFiles().find(f => f.path === note.path) if (file) { diff --git a/src/file-loader.ts b/src/file-loader.ts index 2e44fd4..1bec451 100644 --- a/src/file-loader.ts +++ b/src/file-loader.ts @@ -3,6 +3,7 @@ import { extractHeadingsFromCache, getAliasesFromMetadata, getTagsFromMetadata, + isFileImage, isFilePlaintext, removeDiacritics, } from './tools/utils' @@ -10,7 +11,7 @@ import * as NotesIndex from './notes-index' import type { TFile } from 'obsidian' import type { IndexedDocument } from './globals' import { getNonExistingNotes } from './tools/notes' -import { getPdfText } from 'obsidian-text-extract' +import { getPdfText, getImageText } from 'obsidian-text-extract' /** * Return all plaintext files as IndexedDocuments @@ -27,20 +28,33 @@ export async function getPlainTextFiles(): Promise { } /** - * Return all PDF files as IndexedDocuments. - * If a PDF isn't cached, it will be read from the disk and added to the IndexedDB + * Return all PDFs as IndexedDocuments. */ -export async function getPDFFiles(): Promise { +export async function getPDFAsDocuments(): Promise { const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf')) + return await getBinaryFiles(files) +} + +/** + * Return all images as IndexedDocuments. 
+ */ +export async function getImagesAsDocuments(): Promise { + const files = app.vault.getFiles().filter(f => isFileImage(f.path)) + return await getBinaryFiles(files) +} + +async function getBinaryFiles(files: TFile[]): Promise { const data: IndexedDocument[] = [] const input = [] for (const file of files) { - input.push(new Promise(async (resolve, reject) => { - const doc = await fileToIndexedDocument(file) - await cacheManager.updateLiveDocument(file.path, doc) - data.push(doc) - return resolve(null) - })) + input.push( + new Promise(async (resolve, reject) => { + const doc = await fileToIndexedDocument(file) + await cacheManager.updateLiveDocument(file.path, doc) + data.push(doc) + return resolve(null) + }) + ) } await Promise.all(input) return data @@ -51,13 +65,15 @@ export async function getPDFFiles(): Promise { * Will use the cache if possible. */ export async function fileToIndexedDocument( - file: TFile, + file: TFile ): Promise { let content: string if (isFilePlaintext(file.path)) { content = await app.vault.cachedRead(file) } else if (file.path.endsWith('.pdf')) { content = await getPdfText(file) + } else if (isFileImage(file.path)) { + content = await getImageText(file) } else { throw new Error('Invalid file: ' + file.path) } diff --git a/src/main.ts b/src/main.ts index 9891503..bd741a2 100644 --- a/src/main.ts +++ b/src/main.ts @@ -84,9 +84,7 @@ export default class OmnisearchPlugin extends Plugin { showWelcomeNotice(this) } - onunload(): void { - - } + onunload(): void {} addRibbonButton(): void { this.ribbonButton = this.addRibbonIcon('search', 'Omnisearch', _evt => { @@ -110,13 +108,13 @@ async function populateIndex(): Promise { // Initialize minisearch let engine = SearchEngine.getEngine() - // No cache for iOS + // if not iOS, load data from cache if (!Platform.isIosApp) { engine = await SearchEngine.initFromCache() } // Load plaintext files - console.log('Omnisearch - Fetching notes') + console.log('Omnisearch - Reading notes') const 
plainTextFiles = await FileLoader.getPlainTextFiles() let allFiles = [...plainTextFiles] // iOS: since there's no cache, directly index the documents @@ -127,15 +125,28 @@ async function populateIndex(): Promise { // Load PDFs if (settings.PDFIndexing) { - console.log('Omnisearch - Fetching PDFs') - const pdfs = await FileLoader.getPDFFiles() + console.log('Omnisearch - Reading PDFs') + const pdfDocuments = await FileLoader.getPDFAsDocuments() // iOS: since there's no cache, just index the documents if (Platform.isIosApp) { await wait(1000) - await engine.addAllToMinisearch(pdfs) + await engine.addAllToMinisearch(pdfDocuments) } // Add PDFs to the files list - allFiles = [...allFiles, ...pdfs] + allFiles = [...allFiles, ...pdfDocuments] + } + + // Load Images + if (settings.imagesIndexing) { + console.log('Omnisearch - Reading Images') + const imagesDocuments = await FileLoader.getImagesAsDocuments() + // iOS: since there's no cache, just index the documents + if (Platform.isIosApp) { + await wait(1000) + await engine.addAllToMinisearch(imagesDocuments) + } + // Add Images to the files list + allFiles = [...allFiles, ...imagesDocuments] } console.log('Omnisearch - Total number of files: ' + allFiles.length) @@ -146,6 +157,7 @@ async function populateIndex(): Promise { console.log('Omnisearch - Checking index cache diff...') // Check which documents need to be removed/added/updated const diffDocs = await cacheManager.getDiffDocuments(allFiles) + console.log(`Omnisearch - Files to add/remove/update: ${diffDocs.toAdd.length}/${diffDocs.toDelete.length}/${diffDocs.toUpdate.length}`) needToUpdateCache = !!( diffDocs.toAdd.length || diffDocs.toDelete.length || @@ -154,23 +166,19 @@ async function populateIndex(): Promise { // Add await engine.addAllToMinisearch(diffDocs.toAdd) - console.log(`Omnisearch - ${diffDocs.toAdd.length} files to add`) diffDocs.toAdd.forEach(doc => cacheManager.updateLiveDocument(doc.path, doc) ) // Delete - console.log(`Omnisearch - 
${diffDocs.toDelete.length} files to remove`) diffDocs.toDelete.forEach(d => engine.removeFromMinisearch(d)) diffDocs.toDelete.forEach(doc => cacheManager.deleteLiveDocument(doc.path)) // Update (delete + add) - console.log(`Omnisearch - ${diffDocs.toUpdate.length} files to update`) - diffDocs.toUpdate - .forEach(({ oldDoc, newDoc }) => { - engine.removeFromMinisearch(oldDoc) - cacheManager.updateLiveDocument(oldDoc.path, newDoc) - }) + diffDocs.toUpdate.forEach(({ oldDoc, newDoc }) => { + engine.removeFromMinisearch(oldDoc) + cacheManager.updateLiveDocument(oldDoc.path, newDoc) + }) await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc)) } @@ -205,13 +213,13 @@ async function cleanOldCacheFiles() { } function showWelcomeNotice(plugin: Plugin) { - const code = '1.7.6' + const code = '1.8.0-beta.3' if (settings.welcomeMessage !== code) { const welcome = new DocumentFragment() welcome.createSpan({}, span => { - span.innerHTML = `Omnisearch has been updated -New beta feature: PDF search 🔎📄 -Toggle "BETA - Index PDFs" in Omnisearch settings page.` + span.innerHTML = `Omnisearch BETA has been updated +You can now enable "Images Indexing" to use Optical Character Recognition on your scanned documents +🔎🖼` }) new Notice(welcome, 30000) } diff --git a/src/settings.ts b/src/settings.ts index 968bcf9..44d2a83 100644 --- a/src/settings.ts +++ b/src/settings.ts @@ -25,6 +25,8 @@ export interface OmnisearchSettings extends WeightingSettings { indexedFileTypes: string[] /** Enable PDF indexing */ PDFIndexing: boolean + /** Enable images indexing */ + imagesIndexing: boolean /** Display Omnisearch popup notices over Obsidian */ showIndexingNotices: boolean /** Activate the small 🔍 button on Obsidian's ribbon */ @@ -147,7 +149,7 @@ export class SettingsTab extends PluginSettingTab { indexPDFsDesc.createSpan({}, span => { span.innerHTML = `Omnisearch will include PDFs in search results.
    -
  • ⚠️ Depending on their size, PDFs can take anywhere from a few seconds to 2 minutes to be processed.
  • +
  • ⚠️ Each PDF can take anywhere from a few seconds to 2 minutes to be processed.
  • ⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.
  • ⚠️ Some PDFs can't be processed correctly and will return an empty text.
  • This feature is currently a work-in-progress, please report issues that you might experience.
  • @@ -164,6 +166,26 @@ export class SettingsTab extends PluginSettingTab { }) ) + // Images Indexing + const indexImagesDesc = new DocumentFragment() + indexImagesDesc.createSpan({}, span => { + span.innerHTML = `Omnisearch will use Tesseract to index images from their text. 
      +
    • Only English is supported at the moment.
    • +
    • Not all images can be correctly read by the OCR, this feature works best with scanned documents.
    • +
    + Needs a restart to fully take effect.` + }) + new Setting(containerEl) + .setName('BETA - Images Indexing') + .setDesc(indexImagesDesc) + .addToggle(toggle => + toggle.setValue(settings.imagesIndexing).onChange(async v => { + settings.imagesIndexing = v + await saveSettings(this.plugin) + }) + ) + // #endregion Behavior // #region User Interface @@ -315,6 +337,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = { ignoreDiacritics: true, indexedFileTypes: [] as string[], PDFIndexing: false, + imagesIndexing: false, showIndexingNotices: false, showShortName: false, diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 2d5cbf0..602b640 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -184,7 +184,9 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' { export function isFileIndexable(path: string): boolean { return ( - (settings.PDFIndexing && path.endsWith('.pdf')) || isFilePlaintext(path) + (settings.PDFIndexing && path.endsWith('.pdf')) || + isFilePlaintext(path) || + (settings.imagesIndexing && isFileImage(path)) ) }