diff --git a/src/cache-manager.ts b/src/cache-manager.ts index 7eaad86..378679a 100644 --- a/src/cache-manager.ts +++ b/src/cache-manager.ts @@ -1,6 +1,8 @@ import type { TFile } from 'obsidian' import type { IndexedDocument } from './globals' import { database } from './database' +import MiniSearch from 'minisearch' +import { minisearchOptions } from './search/search-engine' class CacheManager { private documentsCache: Map<string, IndexedDocument> = new Map() @@ -54,6 +56,32 @@ class CacheManager { return !indexedNote || indexedNote.mtime !== file.stat.mtime } + //#region Minisearch + + public async getMinisearchCache(): Promise<MiniSearch | null> { + const cache = (await database.minisearch.toArray())[0] + if (!cache) { + return null + } + try { + return MiniSearch.loadJSON(cache.data, minisearchOptions) + } catch (e) { + console.error('Omnisearch - Error while loading Minisearch cache') + console.error(e) + return null + } + } + + public async writeMinisearchCache(minisearch: MiniSearch): Promise<void> { + await database.minisearch.clear() + await database.minisearch.add({ + date: new Date().toISOString(), + data: JSON.stringify(minisearch.toJSON()), + }) + console.log('Omnisearch - Search cache written') + } + + //#endregion Minisearch } export const cacheManager = new CacheManager() diff --git a/src/components/ModalInFile.svelte b/src/components/ModalInFile.svelte index 09ad24e..0b0e4e8 100644 --- a/src/components/ModalInFile.svelte +++ b/src/components/ModalInFile.svelte @@ -13,9 +13,12 @@ import { loopIndex } from 'src/tools/utils' import { onDestroy, onMount, tick } from 'svelte' import { MarkdownView } from 'obsidian' - import * as Search from 'src/search/search' + import { SearchEngine } from 'src/search/search-engine' import ModalContainer from './ModalContainer.svelte' - import { OmnisearchInFileModal, OmnisearchVaultModal } from 'src/components/modals' + import { + OmnisearchInFileModal, + OmnisearchVaultModal, + } from 'src/components/modals' 
import ResultItemInFile from './ResultItemInFile.svelte' import { Query } from 'src/search/query' import { openNote } from 'src/tools/notes' @@ -49,7 +52,7 @@ $: (async () => { if (searchQuery) { query = new Query(searchQuery) - note = (await Search.getSuggestions(query, {singleFilePath}))[0] ?? null + note = (await SearchEngine.getEngine().getSuggestions(query, { singleFilePath }))[0] ?? null lastSearch = searchQuery } selectedIndex = 0 @@ -102,7 +105,7 @@ async function scrollIntoView(): Promise { await tick() const elem = document.querySelector(`[data-result-id="${selectedIndex}"]`) - elem?.scrollIntoView({behavior: 'auto', block: 'nearest'}) + elem?.scrollIntoView({ behavior: 'auto', block: 'nearest' }) } async function openSelection( @@ -128,8 +131,8 @@ pos.ch = 0 view.editor.setCursor(pos) view.editor.scrollIntoView({ - from: {line: pos.line - 10, ch: 0}, - to: {line: pos.line + 10, ch: 0}, + from: { line: pos.line - 10, ch: 0 }, + to: { line: pos.line + 10, ch: 0 }, }) } } @@ -143,7 +146,7 @@ + initialValue="{searchQuery}" /> {#if groupedOffsets.length && note} @@ -154,7 +157,7 @@ index="{i}" selected="{i === selectedIndex}" on:mousemove="{_e => (selectedIndex = i)}" - on:click="{openSelection}"/> + on:click="{openSelection}" /> {/each} {:else}
diff --git a/src/components/ModalVault.svelte b/src/components/ModalVault.svelte index 4bf8ed7..c8af625 100644 --- a/src/components/ModalVault.svelte +++ b/src/components/ModalVault.svelte @@ -5,9 +5,12 @@ import ModalContainer from './ModalContainer.svelte' import { eventBus, type ResultNote } from 'src/globals' import { createNote, openNote } from 'src/tools/notes' - import * as Search from 'src/search/search' + import { SearchEngine } from 'src/search/search-engine' import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/tools/utils' - import { OmnisearchInFileModal, type OmnisearchVaultModal } from 'src/components/modals' + import { + OmnisearchInFileModal, + type OmnisearchVaultModal, + } from 'src/components/modals' import ResultItemVault from './ResultItemVault.svelte' import { Query } from 'src/search/query' import { settings } from '../settings' @@ -70,7 +73,7 @@ async function updateResults() { query = new Query(searchQuery) - resultNotes = (await Search.getSuggestions(query)).sort( + resultNotes = (await SearchEngine.getEngine().getSuggestions(query)).sort( (a, b) => b.score - a.score ) selectedIndex = 0 @@ -158,8 +161,9 @@ } function switchToInFileModal(): void { - // Do nothing if the selectedNote is a PDF - if (selectedNote?.path.endsWith('.pdf')) { + // Do nothing if the selectedNote is a PDF, + // or if there is 0 match (e.g indexing in progress) + if (selectedNote?.path.endsWith('.pdf') || !selectedNote?.matches.length) { return } @@ -192,7 +196,7 @@ const elem = document.querySelector( `[data-result-id="${selectedNote.path}"]` ) - elem?.scrollIntoView({behavior: 'auto', block: 'nearest'}) + elem?.scrollIntoView({ behavior: 'auto', block: 'nearest' }) } } @@ -206,13 +210,19 @@ {/if} +{#if SearchEngine.isIndexing} +
+ ⏳ Omnisearch indexing is currently in progress +
+{/if} + {#each resultNotes as result, i} + on:click="{onClick}" /> {/each} {#if !resultNotes.length && searchQuery}
@@ -237,7 +247,7 @@ to switch to In-File Search
-
+
{getCtrlKeyLabel()} ↵ @@ -252,7 +262,7 @@ to create in a new pane
-
+
alt ↵ diff --git a/src/database.ts b/src/database.ts index 29e1963..590cb38 100644 --- a/src/database.ts +++ b/src/database.ts @@ -6,12 +6,14 @@ class OmnisearchCache extends Dexie { string > searchHistory!: Dexie.Table<{ id?: number; query: string }, number> + minisearch!: Dexie.Table<{date: string; data: string}, string> constructor() { super(app.appId + '_omnisearch') - this.version(3).stores({ + this.version(4).stores({ pdf: 'path, hash, size, text', searchHistory: '++id, query', + minisearch: 'date' }) } } diff --git a/src/main.ts b/src/main.ts index 07adda6..e2754f4 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1,5 +1,5 @@ import { Notice, Plugin, TFile } from 'obsidian' -import * as Search from './search/search' +import {SearchEngine} from './search/search-engine' import { OmnisearchInFileModal, OmnisearchVaultModal, @@ -8,7 +8,7 @@ import { loadSettings, settings, SettingsTab, showExcerpt } from './settings' import { eventBus, EventNames } from './globals' import { registerAPI } from '@vanakat/plugin-api' import api from './tools/api' -import { isFilePlaintext } from './tools/utils' +import { isFilePlaintext, wait } from './tools/utils' import * as NotesIndex from './notes-index' import * as FileLoader from './file-loader' @@ -20,7 +20,7 @@ export default class OmnisearchPlugin extends Plugin { await loadSettings(this) // Initialize minisearch - await Search.initSearchEngine() + await SearchEngine.initFromCache() _registerAPI(this) @@ -106,11 +106,13 @@ export default class OmnisearchPlugin extends Plugin { * Read the files and feed them to Minisearch */ async function populateIndex(): Promise<void> { + const tmpEngine = SearchEngine.getTmpEngine() + // Load plain text files console.time('Omnisearch - Timing') const files = await FileLoader.getPlainTextFiles() // Index them - await Search.addAllToMinisearch(files) + await tmpEngine.addAllToMinisearch(files) console.log(`Omnisearch - Indexed ${files.length} notes`) console.timeEnd('Omnisearch - Timing') @@ 
-119,10 +121,15 @@ async function populateIndex(): Promise { console.time('Omnisearch - Timing') const pdfs = await FileLoader.getPDFFiles() // Index them - await Search.addAllToMinisearch(pdfs) + await tmpEngine.addAllToMinisearch(pdfs) console.log(`Omnisearch - Indexed ${pdfs.length} PDFs`) console.timeEnd('Omnisearch - Timing') } + + await tmpEngine.writeToCache() + SearchEngine.swapEngines() + + // Save minisearch } async function cleanOldCacheFiles() { diff --git a/src/notes-index.ts b/src/notes-index.ts index 31ba5c1..cdfe781 100644 --- a/src/notes-index.ts +++ b/src/notes-index.ts @@ -2,7 +2,7 @@ import { Notice, TAbstractFile, TFile } from 'obsidian' import { isFileIndexable, wait } from './tools/utils' import { removeAnchors } from './tools/notes' import { settings } from './settings' -import * as Search from './search/search' +import { SearchEngine } from './search/search-engine' import { cacheManager } from './cache-manager' import pLimit from 'p-limit' import type { IndexedDocument } from './globals' @@ -38,7 +38,7 @@ export async function addToIndexAndMemCache( // Make the document and index it const note = await fileToIndexedDocument(file) - Search.addSingleToMinisearch(note) + SearchEngine.getEngine().addSingleToMinisearch(note) await cacheManager.updateDocument(note.path, note) } catch (e) { // console.trace('Error while indexing ' + file.basename) @@ -72,7 +72,7 @@ export function addNonExistingToIndex(name: string, parent: string): void { doesNotExist: true, parent, } - Search.addSingleToMinisearch(note) + SearchEngine.getEngine().addSingleToMinisearch(note) cacheManager.updateDocument(filename, note) } @@ -86,7 +86,7 @@ export function removeFromIndex(path: string): void { } const note = cacheManager.getDocument(path) if (note) { - Search.removeFromMinisearch(note) + SearchEngine.getEngine().removeFromMinisearch(note) cacheManager.deleteDocument(path) // FIXME: only remove non-existing notes if they don't have another parent diff --git 
a/src/search/search-engine.ts b/src/search/search-engine.ts new file mode 100644 index 0000000..8d57981 --- /dev/null +++ b/src/search/search-engine.ts @@ -0,0 +1,299 @@ +import MiniSearch, { + type AsPlainObject, + type Options, + type SearchResult, +} from 'minisearch' +import { + chsRegex, + type IndexedDocument, + type ResultNote, + type SearchMatch, + SPACE_OR_PUNCTUATION, +} from '../globals' +import { + removeDiacritics, + stringsToRegex, + stripMarkdownCharacters, +} from '../tools/utils' +import type { Query } from './query' +import { settings } from '../settings' +import { cacheManager } from '../cache-manager' + +const tokenize = (text: string): string[] => { + const tokens = text.split(SPACE_OR_PUNCTUATION) + const chsSegmenter = (app as any).plugins.plugins['cm-chs-patch'] + + if (chsSegmenter) { + return tokens.flatMap(word => + chsRegex.test(word) ? chsSegmenter.cut(word) : [word] + ) + } else return tokens +} + +export const minisearchOptions: Options = { + tokenize, + processTerm: (term: string) => + (settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(), + idField: 'path', + fields: [ + 'basename', + 'aliases', + 'content', + 'headings1', + 'headings2', + 'headings3', + ], + storeFields: ['tags'], +} + +export class SearchEngine { + private static engine: SearchEngine + private static tmpEngine: SearchEngine + public static isIndexing = true + + /** + * The main singleton SearchEngine instance. + * Should be used for all queries + */ + public static getEngine(): SearchEngine { + if (!this.engine) { + this.engine = new SearchEngine() + } + return this.engine + } + + /** + * The secondary instance. 
This one is indexed in the background, + * while the main instance is quickly filled with cache data + */ + public static getTmpEngine(): SearchEngine { + if (!this.tmpEngine) { + this.tmpEngine = new SearchEngine() + } + return this.tmpEngine + } + + /** + * Instantiates the main instance with cache data (if it exists) + */ + public static async initFromCache(): Promise { + try { + const cache = await cacheManager.getMinisearchCache() + if (cache) { + this.getEngine().minisearch = cache + } + } catch (e) { + console.error(e) + } + } + + /** + * Must be called when the background indexing is done, + * to load the freshest data into the main instance + */ + public static swapEngines(): void { + ;[this.engine, this.tmpEngine] = [this.tmpEngine, this.engine] + this.isIndexing = false + } + + private minisearch: MiniSearch + + private constructor() { + this.minisearch = new MiniSearch(minisearchOptions) + } + + /** + * Searches the index for the given query, + * and returns an array of raw results + */ + public async search( + query: Query, + options = { fuzzy: 0.1 } + ): Promise { + if (!query.segmentsToStr()) return [] + + let results = this.minisearch.search(query.segmentsToStr(), { + prefix: true, + // fuzzy: term => (term.length > 4 ? 0.2 : false), + fuzzy: options.fuzzy, + combineWith: 'AND', + boost: { + basename: settings.weightBasename, + aliases: settings.weightBasename, + headings1: settings.weightH1, + headings2: settings.weightH2, + headings3: settings.weightH3, + }, + }) + + // Downrank files that are in Obsidian's excluded list + if (settings.respectExcluded) { + results.forEach(result => { + if ( + app.metadataCache.isUserIgnored && + app.metadataCache.isUserIgnored(result.id) + ) { + result.score /= 10 // TODO: make this value configurable or toggleable? 
+ } + }) + } + + // If the search query contains quotes, filter out results that don't have the exact match + const exactTerms = query.getExactTerms() + if (exactTerms.length) { + results = results.filter(r => { + const title = cacheManager.getDocument(r.id)?.path.toLowerCase() ?? '' + const content = stripMarkdownCharacters( + cacheManager.getDocument(r.id)?.content ?? '' + ).toLowerCase() + return exactTerms.every(q => content.includes(q) || title.includes(q)) + }) + } + + // If the search query contains exclude terms, filter out results that have them + const exclusions = query.exclusions + if (exclusions.length) { + results = results.filter(r => { + const content = stripMarkdownCharacters( + cacheManager.getDocument(r.id)?.content ?? '' + ).toLowerCase() + return exclusions.every(q => !content.includes(q.value)) + }) + } + return results + } + + /** + * Parses a text against a regex, and returns the { string, offset } matches + */ + public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] { + let match: RegExpExecArray | null = null + const matches: SearchMatch[] = [] + let count = 0 + while ((match = reg.exec(text)) !== null) { + if (++count >= 100) break // Avoid infinite loops, stop looking after 100 matches + const m = match[0] + if (m) matches.push({ match: m, offset: match.index }) + } + + // If the query can be found "as is" in the text, put this match first + const best = text.toLowerCase().indexOf(query.segmentsToStr()) + if (best > -1) { + matches.unshift({ + offset: best, + match: query.segmentsToStr(), + }) + } + + return matches + } + + /** + * Searches the index, and returns an array of ResultNote objects. 
+ * If we have the singleFile option set, + * the array contains a single result from that file + * @param query + * @param options + * @returns + */ + public async getSuggestions( + query: Query, + options?: Partial<{ singleFilePath: string | null }> + ): Promise { + // Get the raw results + let results = await this.search(query) + if (results.length == 0) { + results = await this.search(query, { fuzzy: 0.2 }) + } + if (!results.length) return [] + + // Extract tags from the query + const tags = query.segments + .filter(s => s.value.startsWith('#')) + .map(s => s.value) + + // Either keep the 50 first results, + // or the one corresponding to `singleFile` + if (options?.singleFilePath) { + const result = results.find(r => r.id === options.singleFilePath) + if (result) results = [result] + else results = [] + } else { + results = results.slice(0, 50) + + // Put the results with tags on top + for (const tag of tags) { + for (const result of results) { + if ((result.tags ?? []).includes(tag)) { + result.score *= 100 + } + } + } + } + + // Map the raw results to get usable suggestions + return results.map(result => { + let note = cacheManager.getDocument(result.id) + if (!note) { + // throw new Error(`Omnisearch - Note "${result.id}" not indexed`) + note = { + content: '', + basename: result.id, + path: result.id, + } as IndexedDocument + } + + // Remove '#' from tags, for highlighting + query.segments.forEach(s => { + s.value = s.value.replace(/^#/, '') + }) + // Clean search matches that match quoted expressions, + // and inject those expressions instead + const foundWords = [ + // Matching terms from the result, + // do not necessarily match the query + ...Object.keys(result.match), + + // Quoted expressions + ...query.segments.filter(s => s.exact).map(s => s.value), + + // Tags, starting with # + ...tags, + ].filter(w => w.length > 1) + + // console.log(foundWords) + const matches = this.getMatches( + note.content, + stringsToRegex(foundWords), + query + ) + const 
resultNote: ResultNote = { + score: result.score, + foundWords, + matches, + ...note, + } + return resultNote + }) + } + + // #region Read/write minisearch index + + public async addAllToMinisearch(documents: IndexedDocument[]): Promise { + await this.minisearch.addAllAsync(documents) + } + + public addSingleToMinisearch(document: IndexedDocument): void { + this.minisearch.add(document) + } + + public removeFromMinisearch(document: IndexedDocument): void { + this.minisearch.remove(document) + } + + // #endregion + + public async writeToCache(): Promise { + await cacheManager.writeMinisearchCache(this.minisearch) + } +} diff --git a/src/search/search.ts b/src/search/search.ts deleted file mode 100644 index 3731c6d..0000000 --- a/src/search/search.ts +++ /dev/null @@ -1,260 +0,0 @@ -import MiniSearch, { - type AsPlainObject, - type Options, - type SearchResult, -} from 'minisearch' -import { - chsRegex, - type IndexedDocument, - type ResultNote, - type SearchMatch, - SPACE_OR_PUNCTUATION, -} from '../globals' -import { - removeDiacritics, - stringsToRegex, - stripMarkdownCharacters, -} from '../tools/utils' -import type { Query } from './query' -import { settings } from '../settings' -import { cacheManager } from '../cache-manager' - -let minisearchInstance: MiniSearch - -const tokenize = (text: string): string[] => { - const tokens = text.split(SPACE_OR_PUNCTUATION) - const chsSegmenter = (app as any).plugins.plugins['cm-chs-patch'] - - if (chsSegmenter) { - return tokens.flatMap(word => - chsRegex.test(word) ? chsSegmenter.cut(word) : [word] - ) - } else return tokens -} - -const minisearchOptions: Options = { - tokenize, - processTerm: (term: string) => - (settings.ignoreDiacritics ? 
removeDiacritics(term) : term).toLowerCase(), - idField: 'path', - fields: [ - 'basename', - 'aliases', - 'content', - 'headings1', - 'headings2', - 'headings3', - ], - storeFields: ['tags'], -} - -/** - * Initializes the MiniSearch instance, - * and adds all the notes to the index - */ -export async function initSearchEngine(): Promise { - // Default instance - minisearchInstance = new MiniSearch(minisearchOptions) -} - -export async function initSearchEngineFromData(json: string): Promise { - try { - minisearchInstance = MiniSearch.loadJSON(json, minisearchOptions) - console.log('Omnisearch - MiniSearch index loaded from the file') - } catch (e) { - console.error('Omnisearch - Could not load MiniSearch index from json') - console.error(e) - } -} - -/** - * Searches the index for the given query, - * and returns an array of raw results - */ -async function search( - query: Query, - options = { fuzzy: 0.1 } -): Promise { - if (!query.segmentsToStr()) return [] - - let results = minisearchInstance.search(query.segmentsToStr(), { - prefix: true, - // fuzzy: term => (term.length > 4 ? 0.2 : false), - fuzzy: options.fuzzy, - combineWith: 'AND', - boost: { - basename: settings.weightBasename, - aliases: settings.weightBasename, - headings1: settings.weightH1, - headings2: settings.weightH2, - headings3: settings.weightH3, - }, - }) - - // Downrank files that are in Obsidian's excluded list - if (settings.respectExcluded) { - results.forEach(result => { - if ( - app.metadataCache.isUserIgnored && - app.metadataCache.isUserIgnored(result.id) - ) { - result.score /= 10 // TODO: make this value configurable or toggleable? - } - }) - } - - // If the search query contains quotes, filter out results that don't have the exact match - const exactTerms = query.getExactTerms() - if (exactTerms.length) { - results = results.filter(r => { - const title = cacheManager.getDocument(r.id)?.path.toLowerCase() ?? 
'' - const content = stripMarkdownCharacters( - cacheManager.getDocument(r.id)?.content ?? '' - ).toLowerCase() - return exactTerms.every(q => content.includes(q) || title.includes(q)) - }) - } - - // If the search query contains exclude terms, filter out results that have them - const exclusions = query.exclusions - if (exclusions.length) { - results = results.filter(r => { - const content = stripMarkdownCharacters( - cacheManager.getDocument(r.id)?.content ?? '' - ).toLowerCase() - return exclusions.every(q => !content.includes(q.value)) - }) - } - return results -} - -/** - * Parses a text against a regex, and returns the { string, offset } matches - */ -export function getMatches( - text: string, - reg: RegExp, - query: Query -): SearchMatch[] { - let match: RegExpExecArray | null = null - const matches: SearchMatch[] = [] - let count = 0 - while ((match = reg.exec(text)) !== null) { - if (++count >= 100) break // Avoid infinite loops, stop looking after 100 matches - const m = match[0] - if (m) matches.push({ match: m, offset: match.index }) - } - - // If the query can be found "as is" in the text, put this match first - const best = text.toLowerCase().indexOf(query.segmentsToStr()) - if (best > -1) { - matches.unshift({ - offset: best, - match: query.segmentsToStr(), - }) - } - - return matches -} - -/** - * Searches the index, and returns an array of ResultNote objects. 
- * If we have the singleFile option set, - * the array contains a single result from that file - * @param query - * @param options - * @returns - */ -export async function getSuggestions( - query: Query, - options?: Partial<{ singleFilePath: string | null }> -): Promise { - // Get the raw results - let results = await search(query) - if (results.length == 0) { - results = await search(query, { fuzzy: 0.2 }) - } - if (!results.length) return [] - - // Extract tags from the query - const tags = query.segments - .filter(s => s.value.startsWith('#')) - .map(s => s.value) - - // Either keep the 50 first results, - // or the one corresponding to `singleFile` - if (options?.singleFilePath) { - const result = results.find(r => r.id === options.singleFilePath) - if (result) results = [result] - else results = [] - } else { - results = results.slice(0, 50) - - // Put the results with tags on top - for (const tag of tags) { - for (const result of results) { - if ((result.tags ?? []).includes(tag)) { - result.score *= 100 - } - } - } - } - - // Map the raw results to get usable suggestions - return results.map(result => { - const note = cacheManager.getDocument(result.id) - if (!note) { - throw new Error(`Omnisearch - Note "${result.id}" not indexed`) - } - - // Remove '#' from tags, for highlighting - query.segments.forEach(s => { - s.value = s.value.replace(/^#/, '') - }) - // Clean search matches that match quoted expressions, - // and inject those expressions instead - const foundWords = [ - // Matching terms from the result, - // do not necessarily match the query - ...Object.keys(result.match), - - // Quoted expressions - ...query.segments.filter(s => s.exact).map(s => s.value), - - // Tags, starting with # - ...tags, - ].filter(w => w.length > 1) - - // console.log(foundWords) - const matches = getMatches(note.content, stringsToRegex(foundWords), query) - const resultNote: ResultNote = { - score: result.score, - foundWords, - matches, - ...note, - } - return resultNote 
- }) -} - -// #region Read/write minisearch index - -export function getMinisearchIndexJSON(): AsPlainObject { - return minisearchInstance.toJSON() -} - -export async function addAllToMinisearch( - documents: IndexedDocument[] -): Promise { - await minisearchInstance.addAllAsync(documents) -} - -export function addSingleToMinisearch(document: IndexedDocument): void { - minisearchInstance.add(document) -} - -export function removeFromMinisearch(document: IndexedDocument): void { - minisearchInstance.remove(document) -} - -// #endregion diff --git a/src/tools/api.ts b/src/tools/api.ts index 2e5523d..ac4429e 100644 --- a/src/tools/api.ts +++ b/src/tools/api.ts @@ -1,6 +1,6 @@ import type { ResultNote, SearchMatch } from '../globals' import { Query } from '../search/query' -import * as Search from '../search/search' +import { SearchEngine } from '../search/search-engine' type ResultNoteApi = { score: number @@ -30,7 +30,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] { async function search(q: string): Promise { const query = new Query(q) - const raw = await Search.getSuggestions(query) + const raw = await SearchEngine.getEngine().getSuggestions(query) return mapResults(raw) }