From dcef2d3719cf416cfad4777fc179b42e8d0ba401 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Fri, 25 Nov 2022 08:28:50 +0100 Subject: [PATCH] WIP refactor to take advantage of minisearch 6.0 --- src/cache-manager.ts | 32 ++++---- src/components/ModalVault.svelte | 4 +- src/components/ResultItemVault.svelte | 1 + src/database.ts | 18 ++++- src/file-loader.ts | 104 +++++++++++++++++++------- src/globals.ts | 1 + src/main.ts | 15 ++-- src/notes-index.ts | 26 +++---- src/search/search-engine.ts | 58 ++++++++------ src/tools/utils.ts | 10 ++- 10 files changed, 172 insertions(+), 97 deletions(-) diff --git a/src/cache-manager.ts b/src/cache-manager.ts index a71eac1..8f6fdb4 100644 --- a/src/cache-manager.ts +++ b/src/cache-manager.ts @@ -6,6 +6,10 @@ import { minisearchOptions } from './search/search-engine' import { makeMD5 } from './tools/utils' class CacheManager { + /** + * @deprecated + * @private + */ private liveDocuments: Map = new Map() /** * Show an empty input field next time the user opens Omnisearch modal @@ -39,6 +43,7 @@ class CacheManager { /** * Important: keep this method async for the day it _really_ becomes async. * This will avoid a refactor. + * @deprecated * @param path * @param note */ @@ -49,19 +54,22 @@ class CacheManager { this.liveDocuments.set(path, note) } + /** + * @deprecated + * @param key + */ public deleteLiveDocument(key: string): void { this.liveDocuments.delete(key) } + /** + * @deprecated + * @param key + */ public getLiveDocument(key: string): IndexedDocument | undefined { return this.liveDocuments.get(key) } - public isDocumentOutdated(file: TFile): boolean { - const indexedNote = this.getLiveDocument(file.path) - return !indexedNote || indexedNote.mtime !== file.stat.mtime - } - //#region Minisearch public getDocumentsChecksum(documents: IndexedDocument[]): string { @@ -82,7 +90,6 @@ class CacheManager { public async getMinisearchCache(): Promise { // Retrieve documents and make their checksum const cachedDocs = await database.documents.toArray() - const checksum = this.getDocumentsChecksum(cachedDocs.map(d => d.document)) // Add those documents in the live cache cachedDocs.forEach(doc => @@ -91,13 +98,6 @@ class CacheManager { // Retrieve the search cache, and verify the checksum const cachedIndex = (await database.minisearch.toArray())[0] - if (cachedIndex?.checksum !== checksum) { - console.warn("Omnisearch - Cache - Checksums don't match, clearing cache") - // Invalid (or null) cache, clear everything - await database.minisearch.clear() - await database.documents.clear() - return null - } try { return MiniSearch.loadJS(cachedIndex.data, minisearchOptions) @@ -116,7 +116,7 @@ class CacheManager { * @param documents */ public async getDiffDocuments(documents: IndexedDocument[]): Promise<{ - toDelete: IndexedDocument[] + toDelete: string[] toAdd: IndexedDocument[] toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[] }> { @@ -128,7 +128,7 @@ class CacheManager { // present in `cachedDocs` but not in `documents` const toDelete = cachedDocs .filter(c => !documents.find(d => d.path === c.path)) - .map(d => d.document) + .map(d => d.path) // toUpdate: same path, but different mtime const toUpdate = cachedDocs @@ -158,7 +158,7 @@ class CacheManager { // Delete // console.log(`Omnisearch - Cache - Will delete ${toDelete.length} documents`) - await database.documents.bulkDelete(toDelete.map(o => o.path)) + await database.documents.bulkDelete(toDelete) // Add // console.log(`Omnisearch - Cache - Will add ${toAdd.length} documents`) diff --git a/src/components/ModalVault.svelte b/src/components/ModalVault.svelte index 1805204..dfe2ce1 100644 --- a/src/components/ModalVault.svelte +++ b/src/components/ModalVault.svelte @@ -6,7 +6,7 @@ import { eventBus, IndexingStep, type ResultNote } from 'src/globals' import { createNote, openNote } from 'src/tools/notes' import { SearchEngine } from 'src/search/search-engine' - import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/tools/utils' + import { getCtrlKeyLabel, getExtension, isFilePDF, loopIndex } from 'src/tools/utils' import { OmnisearchInFileModal, type OmnisearchVaultModal, @@ -189,7 +189,7 @@ function switchToInFileModal(): void { // Do nothing if the selectedNote is a PDF, // or if there is 0 match (e.g indexing in progress) - if (selectedNote?.path.endsWith('.pdf') || !selectedNote?.matches.length) { + if (isFilePDF(selectedNote?.path) || !selectedNote?.matches.length) { return } diff --git a/src/components/ResultItemVault.svelte b/src/components/ResultItemVault.svelte index 75bf665..d6b3708 100644 --- a/src/components/ResultItemVault.svelte +++ b/src/components/ResultItemVault.svelte @@ -21,6 +21,7 @@ // @ts-ignore const file = app.vault.getFiles().find(f => f.path === note.path) if (file) { + // @ts-ignore imagePath = app.vault.getResourcePath(file) } } diff --git a/src/database.ts b/src/database.ts index eb94ea4..635a322 100644 --- a/src/database.ts +++ b/src/database.ts @@ -32,12 +32,26 @@ export class OmnisearchCache extends Dexie { //#region Table declarations documents!: Dexie.Table< - { path: string; mtime: number; document: IndexedDocument }, + { + path: string + mtime: number + /** + * @deprecated + */ + document: IndexedDocument + }, string > searchHistory!: Dexie.Table<{ id?: number; query: string }, number> minisearch!: Dexie.Table< - { date: string; checksum: string; data: AsPlainObject }, + { + date: string + /** + * @deprecated + */ + checksum: string + data: AsPlainObject + }, string > diff --git a/src/file-loader.ts b/src/file-loader.ts index f8cb0f3..fa03097 100644 --- a/src/file-loader.ts +++ b/src/file-loader.ts @@ -4,14 +4,13 @@ import { getAliasesFromMetadata, getTagsFromMetadata, isFileImage, + isFilePDF, isFilePlaintext, removeDiacritics, } from './tools/utils' -import * as NotesIndex from './notes-index' import type { TFile } from 'obsidian' import type { IndexedDocument } from './globals' -import { getNonExistingNotes } from './tools/notes' -import { getPdfText, getImageText } from 'obsidian-text-extract' +import { getImageText, getPdfText } from 'obsidian-text-extract' /** * Return all plaintext files as IndexedDocuments @@ -20,9 +19,9 @@ export async function getPlainTextFiles(): Promise { const allFiles = app.vault.getFiles().filter(f => isFilePlaintext(f.path)) const data: IndexedDocument[] = [] for (const file of allFiles) { - const doc = await fileToIndexedDocument(file) + const doc = await getIndexedDocument(file.path) data.push(doc) - await cacheManager.updateLiveDocument(file.path, doc) + // await cacheManager.updateLiveDocument(file.path, doc) } return data } @@ -31,7 +30,7 @@ export async function getPlainTextFiles(): Promise { * Return all PDFs as IndexedDocuments. */ export async function getPDFAsDocuments(): Promise { - const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf')) + const files = app.vault.getFiles().filter(f => isFilePDF(f.path)) return await getBinaryFiles(files) } @@ -49,8 +48,8 @@ async function getBinaryFiles(files: TFile[]): Promise { for (const file of files) { input.push( new Promise(async (resolve, reject) => { - const doc = await fileToIndexedDocument(file) - await cacheManager.updateLiveDocument(file.path, doc) + const doc = await getIndexedDocument(file.path) + // await cacheManager.updateLiveDocument(file.path, doc) data.push(doc) return resolve(null) }) @@ -60,37 +59,34 @@ async function getBinaryFiles(files: TFile[]): Promise { return data } -/** - * Convert a file into an IndexedDocument. - * Will use the cache if possible. - */ -export async function fileToIndexedDocument( - file: TFile +export async function getIndexedDocument( + path: string ): Promise { + const file = app.vault.getFiles().find(f => f.path === path) + if (!file) throw new Error(`Invalid file path: "${path}"`) let content: string - if (isFilePlaintext(file.path)) { + if (isFilePlaintext(path)) { content = await app.vault.cachedRead(file) - } else if (file.path.endsWith('.pdf')) { + } else if (isFilePDF(path)) { content = await getPdfText(file) } else if (isFileImage(file.path)) { content = await getImageText(file) } else { - throw new Error('Invalid file: ' + file.path) + throw new Error('Invalid file format: ' + file.path) } - content = removeDiacritics(content) const metadata = app.metadataCache.getFileCache(file) // Look for links that lead to non-existing files, // and add them to the index. if (metadata) { - // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129 - const nonExisting = getNonExistingNotes(file, metadata) - for (const name of nonExisting.filter( - o => !cacheManager.getLiveDocument(o) - )) { - NotesIndex.addNonExistingToIndex(name, file.path) - } + // // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129 + // const nonExisting = getNonExistingNotes(file, metadata) + // for (const name of nonExisting.filter( + // o => !cacheManager.getLiveDocument(o) + // )) { + // NotesIndex.addNonExistingToIndex(name, file.path) + // } // EXCALIDRAW // Remove the json code @@ -117,3 +113,61 @@ export async function fileToIndexedDocument( headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '', } } + +/** + * Convert a file into an IndexedDocument. + * Will use the cache if possible. + */ +// async function fileToIndexedDocument( +// file: TFile +// ): Promise { +// let content: string +// if (isFilePlaintext(file.path)) { +// content = await app.vault.cachedRead(file) +// } else if (isFilePDF(file.path)) { +// content = await getPdfText(file) +// } else if (isFileImage(file.path)) { +// content = await getImageText(file) +// } else { +// throw new Error('Invalid file: ' + file.path) +// } +// +// content = removeDiacritics(content) +// const metadata = app.metadataCache.getFileCache(file) +// +// // Look for links that lead to non-existing files, +// // and add them to the index. +// if (metadata) { +// // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129 +// const nonExisting = getNonExistingNotes(file, metadata) +// for (const name of nonExisting.filter( +// o => !cacheManager.getLiveDocument(o) +// )) { +// NotesIndex.addNonExistingToIndex(name, file.path) +// } +// +// // EXCALIDRAW +// // Remove the json code +// if (metadata.frontmatter?.['excalidraw-plugin']) { +// const comments = +// metadata.sections?.filter(s => s.type === 'comment') ?? [] +// for (const { start, end } of comments.map(c => c.position)) { +// content = +// content.substring(0, start.offset - 1) + content.substring(end.offset) +// } +// } +// } +// +// return { +// basename: removeDiacritics(file.basename), +// content, +// path: file.path, +// mtime: file.stat.mtime, +// +// tags: getTagsFromMetadata(metadata), +// aliases: getAliasesFromMetadata(metadata).join(''), +// headings1: metadata ? extractHeadingsFromCache(metadata, 1).join(' ') : '', +// headings2: metadata ? extractHeadingsFromCache(metadata, 2).join(' ') : '', +// headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '', +// } +// } diff --git a/src/globals.ts b/src/globals.ts index 8bee954..64562d7 100644 --- a/src/globals.ts +++ b/src/globals.ts @@ -37,6 +37,7 @@ export type IndexedDocument = { headings2: string headings3: string + // TODO: reimplement this doesNotExist?: boolean parent?: string } diff --git a/src/main.ts b/src/main.ts index 7e1e405..113f6f2 100644 --- a/src/main.ts +++ b/src/main.ts @@ -8,10 +8,11 @@ import { loadSettings, settings, SettingsTab, showExcerpt } from './settings' import { eventBus, EventNames, IndexingStep } from './globals' import api from './tools/api' import { isFilePlaintext, wait } from './tools/utils' -import * as NotesIndex from './notes-index' import * as FileLoader from './file-loader' import { OmnisearchCache } from './database' import { cacheManager } from './cache-manager' +import * as NotesIndex from './notes-index' +import { addToIndexAndMemCache } from "./notes-index"; export default class OmnisearchPlugin extends Plugin { private ribbonButton?: HTMLElement @@ -186,21 +187,15 @@ async function populateIndex(): Promise { // Add await engine.addAllToMinisearch(diffDocs.toAdd) - diffDocs.toAdd.forEach(doc => - cacheManager.updateLiveDocument(doc.path, doc) - ) // Delete - for (const [i, doc] of diffDocs.toDelete.entries()) { - await wait(0) - engine.removeFromMinisearch(doc) - cacheManager.deleteLiveDocument(doc.path) + for (const pathToDel of diffDocs.toDelete) { + NotesIndex.removeFromIndex(pathToDel) } // Update (delete + add) diffDocs.toUpdate.forEach(({ oldDoc, newDoc }) => { - engine.removeFromMinisearch(oldDoc) - cacheManager.updateLiveDocument(oldDoc.path, newDoc) + NotesIndex.removeFromIndex(oldDoc.path) }) await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc)) } diff --git a/src/notes-index.ts b/src/notes-index.ts index 4b022f8..4c2ecd7 100644 --- a/src/notes-index.ts +++ b/src/notes-index.ts @@ -4,7 +4,9 @@ import { removeAnchors } from './tools/notes' import { SearchEngine } from './search/search-engine' import { cacheManager } from './cache-manager' import type { IndexedDocument } from './globals' -import { fileToIndexedDocument } from './file-loader' +import { getIndexedDocument } from "./file-loader"; + +const indexedList: Set = new Set() /** * Adds a file to the search index @@ -18,21 +20,14 @@ export async function addToIndexAndMemCache( return } - // Check if the file was already indexed as non-existent. - // If so, remove it from the index, and add it again as a real note. - if (cacheManager.getLiveDocument(file.path)?.doesNotExist) { - removeFromIndex(file.path) - } - try { - if (cacheManager.getLiveDocument(file.path)) { + if (indexedList.has(file.path)) { throw new Error(`${file.basename} is already indexed`) } // Make the document and index it - const note = await fileToIndexedDocument(file) - SearchEngine.getEngine().addSingleToMinisearch(note) - await cacheManager.updateLiveDocument(note.path, note) + SearchEngine.getEngine().addSingleToMinisearch(file.path) + indexedList.add(file.path) } catch (e) { // console.trace('Error while indexing ' + file.basename) console.error(e) @@ -65,8 +60,7 @@ export function addNonExistingToIndex(name: string, parent: string): void { doesNotExist: true, parent, } - SearchEngine.getEngine().addSingleToMinisearch(note) - cacheManager.updateLiveDocument(filename, note) + SearchEngine.getEngine().addSingleToMinisearch(note.path) } /** @@ -77,10 +71,8 @@ export function removeFromIndex(path: string): void { console.info(`"${path}" is not an indexable file`) return } - const note = cacheManager.getLiveDocument(path) - if (note) { - SearchEngine.getEngine().removeFromMinisearch(note) - cacheManager.deleteLiveDocument(path) + if (indexedList.has(path)) { + SearchEngine.getEngine().removeFromMinisearch(path) // FIXME: only remove non-existing notes if they don't have another parent // cacheManager diff --git a/src/search/search-engine.ts b/src/search/search-engine.ts index 324197e..c35688b 100644 --- a/src/search/search-engine.ts +++ b/src/search/search-engine.ts @@ -2,10 +2,10 @@ import MiniSearch, { type Options, type SearchResult } from 'minisearch' import { chsRegex, type IndexedDocument, + IndexingStep, type ResultNote, type SearchMatch, SPACE_OR_PUNCTUATION, - IndexingStep, } from '../globals' import { removeDiacritics, @@ -17,8 +17,9 @@ import { settings } from '../settings' import { cacheManager } from '../cache-manager' import { writable } from 'svelte/store' import { Notice } from 'obsidian' +import { getIndexedDocument } from '../file-loader' -let previousResults: ResultNote[] = [] +let previousResults: SearchResult[] = [] const tokenize = (text: string): string[] => { const tokens = text.split(SPACE_OR_PUNCTUATION) @@ -102,7 +103,10 @@ export class SearchEngine { query: Query, options: { prefixLength: number } ): Promise { - if (!query.segmentsToStr()) return [] + if (query.isEmpty()) { + previousResults = [] + return [] + } let results = this.minisearch.search(query.segmentsToStr(), { prefix: term => term.length >= options.prefixLength, @@ -116,6 +120,7 @@ export class SearchEngine { headings3: settings.weightH3, }, }) + if (!results.length) return previousResults // Downrank files that are in Obsidian's excluded list if (settings.respectExcluded) { @@ -124,19 +129,23 @@ export class SearchEngine { app.metadataCache.isUserIgnored && app.metadataCache.isUserIgnored(result.id) ) { - result.score /= 10 // TODO: make this value configurable or toggleable? + result.score /= 10 } }) } + const documents = await Promise.all( + results.map(async result => await getIndexedDocument(result.id)) + ) + // If the search query contains quotes, filter out results that don't have the exact match const exactTerms = query.getExactTerms() if (exactTerms.length) { results = results.filter(r => { - const title = - cacheManager.getLiveDocument(r.id)?.path.toLowerCase() ?? '' + const document = documents.find(d => d.path === r.id) + const title = document?.path.toLowerCase() ?? '' const content = stripMarkdownCharacters( - cacheManager.getLiveDocument(r.id)?.content ?? '' + document?.content ?? '' ).toLowerCase() return exactTerms.every(q => content.includes(q) || title.includes(q)) }) @@ -147,16 +156,22 @@ export class SearchEngine { if (exclusions.length) { results = results.filter(r => { const content = stripMarkdownCharacters( - cacheManager.getLiveDocument(r.id)?.content ?? '' + documents.find(d => d.path === r.id)?.content ?? '' ).toLowerCase() return exclusions.every(q => !content.includes(q.value)) }) } // FIXME: // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129 - return results.filter( - (result, index, arr) => arr.findIndex(t => t.id === result.id) === index - ) + results = results + .filter( + (result, index, arr) => arr.findIndex(t => t.id === result.id) === index + ) + .slice(0, 50) + + previousResults = results + + return results } /** @@ -196,10 +211,6 @@ export class SearchEngine { query: Query, options?: Partial<{ singleFilePath: string | null }> ): Promise { - if (query.isEmpty()) { - previousResults = [] - return [] - } // Get the raw results let results: SearchResult[] if (settings.simpleSearch) { @@ -207,7 +218,6 @@ export class SearchEngine { } else { results = await this.search(query, { prefixLength: 3 }) } - if (!results.length) return previousResults // Extract tags from the query const tags = query.segments @@ -233,9 +243,14 @@ export class SearchEngine { } } + // TODO: this already called in search(), pass each document in its SearchResult instead? + const documents = await Promise.all( + results.map(async result => await getIndexedDocument(result.id)) + ) + // Map the raw results to get usable suggestions const resultNotes = results.map(result => { - let note = cacheManager.getLiveDocument(result.id) + let note = documents.find(d => d.path === result.id) if (!note) { // throw new Error(`Omnisearch - Note "${result.id}" not indexed`) console.warn(`Omnisearch - Note "${result.id}" not in the live cache`) @@ -278,7 +293,6 @@ export class SearchEngine { } return resultNote }) - previousResults = resultNotes return resultNotes } @@ -291,12 +305,12 @@ export class SearchEngine { await this.minisearch.addAllAsync(documents, { chunkSize }) } - public addSingleToMinisearch(document: IndexedDocument): void { - this.minisearch.add(document) + public addSingleToMinisearch(path: string): void { + getIndexedDocument(path).then(doc => this.minisearch.add(doc)) } - public removeFromMinisearch(document: IndexedDocument): void { - this.minisearch.remove(document) + public removeFromMinisearch(path: string): void { + this.minisearch.discard(path) } // #endregion diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 1339d3e..81dec10 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -1,9 +1,9 @@ import { type CachedMetadata, - Notice, - Platform, getAllTags, + Notice, parseFrontMatterAliases, + Platform, } from 'obsidian' import type { SearchMatch } from '../globals' import { @@ -207,8 +207,8 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' { export function isFileIndexable(path: string): boolean { return ( - (settings.PDFIndexing && path.endsWith('.pdf')) || isFilePlaintext(path) || + (settings.PDFIndexing && isFilePDF(path)) || (settings.imagesIndexing && isFileImage(path)) ) } @@ -219,6 +219,10 @@ export function isFileImage(path: string): boolean { ) } +export function isFilePDF(path: string): boolean { + return path.endsWith('.pdf') +} + export function isFilePlaintext(path: string): boolean { return getPlaintextExtensions().some(t => path.endsWith(`.${t}`)) }