Squashed commit of the following:

commit 739f9c349031510e8ef010ba2445a2a1fdbec247
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 16:57:03 2022 +0200

    Code cleaning + README

commit 85762bae592f3eafd34ba22b0cf1841bfbd91ca6
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 14:59:01 2022 +0200

    Cleaning deleted PDFs from cache

commit 1a37bf38d3f64870d4b40df1b67d8106c893ab64
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 13:18:06 2022 +0200

    PDF cache saved to IndexedDB
2022-10-16 16:58:10 +02:00
parent ad820cb2c9
commit 1c3cc728f6
13 changed files with 184 additions and 124 deletions
--- a/src/cache-manager.ts
+++ b/src/cache-manager.ts
@@ -5,12 +5,12 @@ import { deflate, inflate } from 'pako'
 import {
  notesCacheFilePath,
  minisearchCacheFilePath,
-  type IndexedNote,
+  type IndexedDocument,
 } from './globals'
 import { settings } from './settings'

 class CacheManager {
-  notesCache: Record<string, IndexedNote> = {}
+  notesCache: Record<string, IndexedDocument> = {}
  compress = true
  writeInterval = 5_000 // In milliseconds

@@ -94,7 +94,7 @@ class CacheManager {
    console.log('Omnisearch - Notes cache saved on disk')
  }

-  public addNoteToCache(path: string, note: IndexedNote) {
+  public addNoteToCache(path: string, note: IndexedDocument) {
    this.notesCache[path] = note
    this.saveNotesCache()
  }
@@ -103,11 +103,11 @@ class CacheManager {
    delete this.notesCache[key]
  }

-  public getNoteFromCache(key: string): IndexedNote | undefined {
+  public getNoteFromCache(key: string): IndexedDocument | undefined {
    return this.notesCache[key]
  }

-  public getNonExistingNotesFromCache(): IndexedNote[] {
+  public getNonExistingNotesFromCache(): IndexedDocument[] {
    return Object.values(this.notesCache).filter(note => note.doesNotExist)
  }

--- a/src/components/InputSearch.svelte
+++ b/src/components/InputSearch.svelte
@@ -20,7 +20,7 @@

  const debouncedOnInput = debounce(() => {
    dispatch('input', value)
-  }, 100)
+  }, 250)
 </script>

 <div class="omnisearch-input-container">
--- a/src/database.ts
+++ b/src/database.ts
@@ -0,0 +1,17 @@
+import Dexie from 'dexie'
+
+class OmnisearchCache extends Dexie {
+  pdf!: Dexie.Table<
+    { path: string; hash: string; size: number; text: string },
+    string
+  >
+
+  constructor() {
+    super(app.appId + '_omnisearch')
+    this.version(1).stores({
+      pdf: 'path, hash, size, text',
+    })
+  }
+}
+
+export const database = new OmnisearchCache()
--- a/src/globals.ts
+++ b/src/globals.ts
@@ -14,14 +14,13 @@ export const eventBus = new EventBus()

 export const minisearchCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.data`
 export const notesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.data`
-export const pdfCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
 export const historyFilePath = `${app.vault.configDir}/plugins/omnisearch/historyCache.json`

 export const EventNames = {
  ToggleExcerpts: 'toggle-excerpts',
 } as const

-export type IndexedNote = {
+export type IndexedDocument = {
  path: string
  basename: string
  mtime: number
--- a/src/main.ts
+++ b/src/main.ts
@@ -9,7 +9,6 @@ import { loadSearchHistory } from './search-history'
 import { isFilePlaintext } from './utils'
 import * as NotesIndex from './notes-index'
 import { cacheManager } from './cache-manager'
-import { pdfManager } from './pdf-manager'

 function _registerAPI(plugin: OmnisearchPlugin): void {
  registerAPI('omnisearch', api, plugin as any)
@@ -25,7 +24,6 @@ export default class OmnisearchPlugin extends Plugin {
    await loadSettings(this)
    await loadSearchHistory()
    await cacheManager.loadNotesCache()
-    await pdfManager.loadPDFCache()

    _registerAPI(this)

@@ -91,7 +89,7 @@ export default class OmnisearchPlugin extends Plugin {

  onunload(): void {
    console.log('Omnisearch - Interrupting PDF indexing')
-    NotesIndex.pdfQueue.pause()
+    NotesIndex.pdfQueue.clearQueue()
  }

  addRibbonButton(): void {
@@ -102,17 +100,17 @@ export default class OmnisearchPlugin extends Plugin {
 }

 async function cleanOldCacheFiles() {
-  const oldSearchIndexFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.json`
-  if (await app.vault.adapter.exists(oldSearchIndexFilePath)) {
-    try {
-      await app.vault.adapter.remove(oldSearchIndexFilePath)
-    } catch (e) {}
-  }
-  const oldNnotesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.json`
-  if (await app.vault.adapter.exists(oldNnotesCacheFilePath)) {
-    try {
-      await app.vault.adapter.remove(oldNnotesCacheFilePath)
-    } catch (e) {}
+  const toDelete = [
+    `${app.vault.configDir}/plugins/omnisearch/searchIndex.json`,
+    `${app.vault.configDir}/plugins/omnisearch/notesCache.json`,
+    `${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
+  ]
+  for (const item of toDelete) {
+    if (await app.vault.adapter.exists(item)) {
+      try {
+        await app.vault.adapter.remove(item)
+      } catch (e) {}
+    }
  }
 }

--- a/src/notes-index.ts
+++ b/src/notes-index.ts
@@ -4,23 +4,19 @@ import {
  getAliasesFromMetadata,
  getTagsFromMetadata,
  isFileIndexable,
-  isFilePlaintext,
  removeDiacritics,
  wait,
 } from './utils'
 import { getNonExistingNotes, removeAnchors } from './notes'
-import * as PDF from './pdf-manager'
-import type { IndexedNote } from './globals'
+import { pdfManager } from './pdf-manager'
+import type { IndexedDocument } from './globals'
 import { settings } from './settings'
 import * as Search from './search'
-import PQueue from 'p-queue-compat'
+// import PQueue from 'p-queue-compat'
+import pLimit from 'p-limit'
 import { cacheManager } from './cache-manager'

-let isIndexChanged: boolean
-
-export const pdfQueue = new PQueue({
-  concurrency: settings.backgroundProcesses,
-})
+export const pdfQueue = pLimit(settings.backgroundProcesses)

 /**
 * Adds a file to the index
@@ -59,14 +55,14 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {

    let content
    if (file.path.endsWith('.pdf')) {
-      content = removeDiacritics(await PDF.pdfManager.getPdfText(file as TFile))
+      content = removeDiacritics(await pdfManager.getPdfText(file as TFile))
    } else {
      // Fetch content from the cache to index it as-is
      content = removeDiacritics(await app.vault.cachedRead(file))
    }

    // Make the document and index it
-    const note: IndexedNote = {
+    const note: IndexedDocument = {
      basename: removeDiacritics(file.basename),
      content,
      path: file.path,
@@ -86,7 +82,6 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
    }

    Search.minisearchInstance.add(note)
-    isIndexChanged = true
    cacheManager.addNoteToCache(note.path, note)
  } catch (e) {
    console.trace('Error while indexing ' + file.basename)
@@ -105,12 +100,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
  const filename = name + (name.endsWith('.md') ? '' : '.md')
  if (cacheManager.getNoteFromCache(filename)) return

-  const note = {
+  const note: IndexedDocument = {
    path: filename,
    basename: name,
    mtime: 0,

    content: '',
+    tags: [],
    aliases: '',
    headings1: '',
    headings2: '',
@@ -118,9 +114,8 @@ export function addNonExistingToIndex(name: string, parent: string): void {

    doesNotExist: true,
    parent,
-  } as IndexedNote
+  }
  Search.minisearchInstance.add(note)
-  isIndexChanged = true
  cacheManager.addNoteToCache(filename, note)
 }

@@ -129,14 +124,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
 * @param path
 */
 export function removeFromIndex(path: string): void {
-  if (!isFilePlaintext(path)) {
+  if (!isFileIndexable(path)) {
    console.info(`"${path}" is not an indexable file`)
    return
  }
  const note = cacheManager.getNoteFromCache(path)
  if (note) {
    Search.minisearchInstance.remove(note)
-    isIndexChanged = true
    cacheManager.removeNoteFromCache(path)
    cacheManager
      .getNonExistingNotesFromCache()
@@ -175,21 +169,30 @@ export async function indexPDFs() {
    const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
    console.time('PDF Indexing')
    console.log(`Omnisearch - Indexing ${files.length} PDFs`)
+    const input = []
    for (const file of files) {
      if (cacheManager.getNoteFromCache(file.path)) {
        removeFromIndex(file.path)
      }
-      pdfQueue.add(async () => {
-        await addToIndexAndCache(file)
-        await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
-      })
+      input.push(
+        pdfQueue(async () => {
+          await addToIndexAndCache(file)
+          await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
+        })
+      )
+      // pdfQueue.add(async () => {
+      //   await addToIndexAndCache(file)
+      //   await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
+      // })
    }
-
-    await pdfQueue.onEmpty()
+    await Promise.all(input)
+    // await pdfQueue.onEmpty()
    console.timeEnd('PDF Indexing')

    if (settings.showIndexingNotices) {
      new Notice(`Omnisearch - Indexed ${files.length} PDFs`)
    }
+
+    await pdfManager.cleanCache()
  }
 }
--- a/src/pdf-manager.ts
+++ b/src/pdf-manager.ts
@@ -1,51 +1,58 @@
 import type { TFile } from 'obsidian'
-import PQueue from 'p-queue-compat'
 import PDFWorker from 'web-worker:./pdf-worker.ts'
-import { pdfCacheFilePath } from './globals'
-import { deflate, inflate } from 'pako'
 import { makeMD5 } from './utils'
+import { database } from './database'

 class PDFManager {
-  private cache: Map<string, { content: string }> = new Map()
-  private serializeQueue = new PQueue({ concurrency: 1 })
-
-  public async loadPDFCache(): Promise<void> {
-    if (await app.vault.adapter.exists(pdfCacheFilePath)) {
-      try {
-        const data = await app.vault.adapter.readBinary(pdfCacheFilePath)
-        const json = new TextDecoder('utf8').decode(inflate(data))
-        this.cache = new Map(JSON.parse(json))
-      } catch (e) {
-        console.error(e)
-        this.cache = new Map()
-      }
-    }
-  }
-
  public async getPdfText(file: TFile): Promise<string> {
+    // 1) Check if we can find by path & size
+    const docByPath = await database.pdf.get({
+      path: file.path,
+      size: file.stat.size,
+    })
+
+    if (docByPath) {
+      return docByPath.text
+    }
+
+    // 2) Check by hash
    const data = new Uint8Array(await app.vault.readBinary(file))
    const hash = makeMD5(data)
-    if (this.cache.has(hash)) {
-      return this.cache.get(hash)!.content
+    const docByHash = await database.pdf.get(hash)
+    if (docByHash) {
+      return docByHash.text
    }

+    // 3) The PDF is not cached, extract it
    const worker = new PDFWorker({ name: 'PDF Text Extractor' })
    return new Promise(async (resolve, reject) => {
      // @ts-ignore
+      file.stat.size
      worker.postMessage({ data, name: file.basename })
      worker.onmessage = (evt: any) => {
-        const txt = evt.data.text
-        this.updatePDFCache(hash, txt)
-        resolve(txt)
+        const text = (evt.data.text as string)
+          // Replace \n with spaces
+          .replace(/\n/g, ' ')
+          // Trim multiple spaces
+          .replace(/ +/g, ' ')
+          .trim()
+        database.pdf
+          .add({ hash, text, path: file.path, size: file.stat.size })
+          .then(() => {
+            resolve(text)
+          })
      }
    })
  }

-  private async updatePDFCache(hash: string, content: string): Promise<void> {
-    this.serializeQueue.add(() => {
-      this.cache.set(hash, { content })
-      const data = deflate(JSON.stringify(Array.from(this.cache), null, 1))
-      app.vault.adapter.writeBinary(pdfCacheFilePath, data as any)
+  /**
+   * Removes the outdated cache entries
+   */
+  public async cleanCache(): Promise<void> {
+    database.pdf.each(async item => {
+      if (!(await app.vault.adapter.exists(item.path))) {
+        console.log(item.path + ' does not exist')
+      }
    })
  }
 }
--- a/src/search.ts
+++ b/src/search.ts
@@ -2,33 +2,25 @@ import { Notice } from 'obsidian'
 import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import {
  chsRegex,
-  type IndexedNote,
+  type IndexedDocument,
  type ResultNote,
  minisearchCacheFilePath,
  type SearchMatch,
  SPACE_OR_PUNCTUATION,
 } from './globals'
 import {
-  isFileIndexable,
  isFilePlaintext,
  removeDiacritics,
  stringsToRegex,
  stripMarkdownCharacters,
-  wait,
 } from './utils'
 import type { Query } from './query'
 import { settings } from './settings'
-// import {
-//   getNoteFromCache,
-//   isCacheOutdated,
-//   loadNotesCache,
-//   resetNotesCache,
-// } from './notes'
 import * as NotesIndex from './notes-index'
-import PQueue from 'p-queue-compat'
+import pLimit from 'p-limit'
 import { cacheManager } from './cache-manager'

-export let minisearchInstance: MiniSearch<IndexedNote>
+export let minisearchInstance: MiniSearch<IndexedDocument>

 const tokenize = (text: string): string[] => {
  const tokens = text.split(SPACE_OR_PUNCTUATION)
@@ -46,7 +38,7 @@ const tokenize = (text: string): string[] => {
 * and adds all the notes to the index
 */
 export async function initGlobalSearchIndex(): Promise<void> {
-  const options: Options<IndexedNote> = {
+  const options: Options<IndexedDocument> = {
    tokenize,
    processTerm: (term: string) =>
      (settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
@@ -106,15 +98,16 @@ export async function initGlobalSearchIndex(): Promise<void> {
  }

  // Read and index all the files into the search engine
-  const queue = new PQueue({ concurrency: 10 })
+  const queue = pLimit(10)
+  const input = []
  for (const file of files) {
    if (cacheManager.getNoteFromCache(file.path)) {
      NotesIndex.removeFromIndex(file.path)
    }
-    queue.add(() => NotesIndex.addToIndexAndCache(file))
+    input.push(queue(() => NotesIndex.addToIndexAndCache(file)))
  }

-  await queue.onEmpty()
+  await Promise.all(input)

  if (files.length > 0) {
    const message = `Omnisearch - Indexed ${files.length} ${notesSuffix} in ${
--- a/src/settings.ts
+++ b/src/settings.ts
@@ -173,12 +173,15 @@ export class SettingsTab extends PluginSettingTab {
        })
      )

-    // PDF Indexing - not available on mobile
+    // PDF Indexing
    const indexPDFsDesc = new DocumentFragment()
    indexPDFsDesc.createSpan({}, span => {
      span.innerHTML = `Omnisearch will include PDFs in search results.
-       This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
-       Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.<br>
+      <ul>
+        <li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
+        <li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
+        <li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
+      </ul>
       <strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
    })
    new Setting(containerEl)
--- a/src/typings/types-obsidian.d.ts
+++ b/src/typings/types-obsidian.d.ts
@@ -19,6 +19,10 @@ declare module 'obsidian' {
  interface Vault {
    getConfig(string): unknown
  }
+
+  interface App {
+    appId: string
+  }
 }