Files
obsidian-tannersearch/src/search/search-engine.ts
Simon Cambier 087ec5cc99 Squashed commit of the following:
commit 3b229cad538ad88ef2d366964c4261bc0e02fb7c
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Nov 5 14:30:08 2022 +0100

    1.8.0-beta.1

commit f43c369b2dd0a1083b171724e3f7466429505629
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Nov 5 13:39:45 2022 +0100

    Squashed commit of the following:

    commit 93508ee95046385baf62475e5bd835ed9fafe6d3
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Sat Nov 5 13:35:56 2022 +0100

        Cleaning

    commit 205e6a7cce4c1939338820f366f7ae8a067ec7fb
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Fri Nov 4 08:53:46 2022 +0100

        Added logs

    commit ea19b94e164581829908ac71d09a60e230925a7f
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Thu Nov 3 22:27:24 2022 +0100

        Notices

    commit 53ff4e822b3c292a56da150b94a1cfe43e199d44
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Thu Nov 3 22:27:09 2022 +0100

        Custom minisearch build + Notice when the cache could be corrupted

    commit 498408afd1c350dd68969318c3533fff8aa6c172
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Thu Nov 3 22:26:22 2022 +0100

        Added a button to manually clear the cache

    commit 90afe5d3868989626ba4613b064e24ac7efa88be
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Thu Nov 3 22:03:41 2022 +0100

        Optimized loading minisearch from cache

    commit 719dcb9c82f09f56dabb828ac13c9c1db7f795bb
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Thu Nov 3 21:43:49 2022 +0100

        #92 - Refactored cache to make it behave like pre-indexedDb

    commit 2164ccfa39d83eef23231d01e8aa35ac30e0d31c
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Wed Nov 2 23:13:59 2022 +0100

        Removed cache & tmp engine

    commit 50eb33bbd4d074be9a9952eaf871cd8f58b327e6
    Author: Simon Cambier <simon.cambier@protonmail.com>
    Date:   Wed Nov 2 22:56:04 2022 +0100

        More efficient loading of PDFs

commit a6342a675f
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Wed Nov 2 10:34:02 2022 +0100

    #120 - Cleaning of old cache databases

commit b6890567f3
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Mon Oct 31 17:28:17 2022 +0100

    Updated Readme
2022-11-05 14:58:25 +01:00

295 lines
8.2 KiB
TypeScript

import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import {
chsRegex,
type IndexedDocument,
type ResultNote,
type SearchMatch,
SPACE_OR_PUNCTUATION,
} from '../globals'
import {
removeDiacritics,
stringsToRegex,
stripMarkdownCharacters,
} from '../tools/utils'
import type { Query } from './query'
import { settings } from '../settings'
import { cacheManager } from '../cache-manager'
import { writable } from 'svelte/store'
import { Notice } from 'obsidian'
/**
 * Splits a text into search tokens.
 *
 * The text is first split on `SPACE_OR_PUNCTUATION`. When the
 * `cm-chs-patch` plugin is registered on the global `app`, every word
 * matching `chsRegex` is additionally run through the plugin's `cut()`
 * function; all other words are kept untouched.
 */
const tokenize = (text: string): string[] => {
  const words = text.split(SPACE_OR_PUNCTUATION)
  const segmenter = (app as any).plugins.plugins['cm-chs-patch']

  // Without the segmenter plugin, the plain split is all we can do
  if (!segmenter) {
    return words
  }

  return words.flatMap(w => (chsRegex.test(w) ? segmenter.cut(w) : [w]))
}
/**
 * Options handed to every MiniSearch instance created by this module.
 *
 * Documents are identified by their `path`; the basename, aliases, content
 * and the three heading levels are indexed, and `tags` are stored alongside
 * each indexed document.
 */
export const minisearchOptions: Options<IndexedDocument> = {
  tokenize,
  // Terms are lowercased, and stripped of diacritics when the setting is on
  processTerm: (term: string) => {
    const normalized = settings.ignoreDiacritics ? removeDiacritics(term) : term
    return normalized.toLowerCase()
  },
  idField: 'path',
  fields: ['basename', 'aliases', 'content', 'headings1', 'headings2', 'headings3'],
  storeFields: ['tags'],
  // NOTE(review): not a stock MiniSearch option - presumably provided by the
  // custom minisearch build used by this project; confirm when it is invoked.
  callbackWhenDesync() {
    new Notice(
      'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.'
    )
  },
}
/**
 * Singleton wrapper around a MiniSearch index.
 *
 * It runs queries against the index, post-filters the raw results
 * (exact terms, exclusions, ignored files) and turns them into
 * `ResultNote` suggestions.
 */
export class SearchEngine {
  private static engine?: SearchEngine
  public static isIndexing = writable(true)

  /**
   * The main singleton SearchEngine instance.
   * Should be used for all queries
   */
  public static getEngine(): SearchEngine {
    if (!this.engine) {
      this.engine = new SearchEngine()
    }
    return this.engine
  }

  /**
   * Instantiates the main instance with cache data (if it exists).
   *
   * If reading the cache throws, a Notice is shown, the error is logged,
   * and the engine keeps the empty index created by its constructor.
   *
   * @returns the singleton engine, whether or not the cache was loaded
   */
  public static async initFromCache(): Promise<SearchEngine> {
    try {
      const cache = await cacheManager.getMinisearchCache()
      if (cache) {
        this.getEngine().minisearch = cache
      }
    } catch (e) {
      new Notice(
        'Omnisearch - Cache missing or invalid. Some freezes may occur while Omnisearch indexes your vault.'
      )
      console.error('Omnisearch - Could not init engine from cache')
      console.error(e)
    }
    return this.getEngine()
  }

  private minisearch: MiniSearch

  private constructor() {
    this.minisearch = new MiniSearch(minisearchOptions)
  }

  /**
   * Searches the index for the given query,
   * and returns an array of raw results, best score first.
   *
   * @param query the parsed query
   * @param options `fuzzy` is forwarded to MiniSearch; `prefix` forces
   *   prefix matching for every term (otherwise only terms longer than
   *   4 characters are prefix-matched)
   * @returns the results that passed the exact-term and exclusion filters,
   *   or an empty array when the query string is empty
   */
  public async search(
    query: Query,
    options = { fuzzy: 0.1, prefix: false }
  ): Promise<SearchResult[]> {
    const queryStr = query.segmentsToStr()
    if (!queryStr) return []

    let results = this.minisearch.search(queryStr, {
      prefix: term => {
        return options.prefix || term.length > 4
      },
      fuzzy: options.fuzzy,
      combineWith: 'AND',
      boost: {
        basename: settings.weightBasename,
        aliases: settings.weightBasename,
        headings1: settings.weightH1,
        headings2: settings.weightH2,
        headings3: settings.weightH3,
      },
    })

    // Downrank files that are in Obsidian's excluded list
    if (settings.respectExcluded) {
      results.forEach(result => {
        if (
          app.metadataCache.isUserIgnored &&
          app.metadataCache.isUserIgnored(result.id)
        ) {
          result.score /= 10 // TODO: make this value configurable or toggleable?
        }
      })
      // Scores were changed in place: re-sort, otherwise the downranked
      // files keep their original position in the list
      results.sort((a, b) => b.score - a.score)
    }

    // If the search query contains quotes, filter out results that don't have the exact match
    const exactTerms = query.getExactTerms()
    if (exactTerms.length) {
      results = results.filter(r => {
        const doc = cacheManager.getLiveDocument(r.id)
        const title = doc?.path.toLowerCase() ?? ''
        const content = stripMarkdownCharacters(
          doc?.content ?? ''
        ).toLowerCase()
        return exactTerms.every(q => content.includes(q) || title.includes(q))
      })
    }

    // If the search query contains exclude terms, filter out results that have them
    const exclusions = query.exclusions
    if (exclusions.length) {
      results = results.filter(r => {
        const content = stripMarkdownCharacters(
          cacheManager.getLiveDocument(r.id)?.content ?? ''
        ).toLowerCase()
        return exclusions.every(q => !content.includes(q.value))
      })
    }

    return results
  }

  /**
   * Parses a text against a regex, and returns the { string, offset } matches.
   *
   * If the whole query string is found in the lowercased text, that match
   * is placed first in the returned array.
   *
   * @param text the text to scan
   * @param reg the regex to execute repeatedly against `text`
   * @param query the query whose full string is looked up "as is"
   */
  public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] {
    let match: RegExpExecArray | null = null
    const matches: SearchMatch[] = []
    let count = 0
    while ((match = reg.exec(text)) !== null) {
      if (++count >= 100) break // Avoid infinite loops, give up on the 100th iteration
      const m = match[0]
      if (m) matches.push({ match: m, offset: match.index })
      // Without the global/sticky flag, exec() never advances and would
      // return this same match again on every iteration
      if (!reg.global && !reg.sticky) break
    }

    // If the query can be found "as is" in the text, put this match first.
    // An empty query string is skipped: indexOf('') is always 0 and would
    // insert a meaningless empty match.
    const queryStr = query.segmentsToStr()
    if (queryStr) {
      const best = text.toLowerCase().indexOf(queryStr)
      if (best > -1) {
        matches.unshift({
          offset: best,
          match: queryStr,
        })
      }
    }
    return matches
  }

  /**
   * Searches the index, and returns an array of ResultNote objects.
   * If we have the singleFilePath option set,
   * the array contains at most a single result, from that file.
   *
   * Side effect: when there is at least one result, the leading '#' is
   * removed from the values of `query.segments`.
   *
   * @param query the parsed query
   * @param options `singleFilePath` restricts the output to that file
   * @returns the suggestions, best score first
   */
  public async getSuggestions(
    query: Query,
    options?: Partial<{ singleFilePath: string | null }>
  ): Promise<ResultNote[]> {
    // Get the raw results, retrying with looser settings if allowed
    let results = await this.search(query)
    if (results.length === 0 && settings.retryWhenZeroResult) {
      results = await this.search(query, { fuzzy: 0.2, prefix: true })
    }
    if (!results.length) return []

    // Extract tags from the query
    const tags = query.segments
      .filter(s => s.value.startsWith('#'))
      .map(s => s.value)

    // Either keep the 50 best results,
    // or the one corresponding to `singleFilePath`
    const singleFilePath = options?.singleFilePath
    if (singleFilePath) {
      const result = results.find(r => r.id === singleFilePath)
      results = result ? [result] : []
    } else {
      // Put the results with tags on top: boost every result, re-sort,
      // and only then truncate, so that a tagged note is not dropped or
      // left in place just because its raw score was low
      for (const result of results) {
        const resultTags: string[] = result.tags ?? []
        for (const tag of tags) {
          if (resultTags.includes(tag)) {
            result.score *= 100
          }
        }
      }
      results = results.sort((a, b) => b.score - a.score).slice(0, 50)
    }

    if (results.length) {
      // Remove '#' from tags, for highlighting.
      // Done once here rather than once per result.
      query.segments.forEach(s => {
        s.value = s.value.replace(/^#/, '')
      })
    }

    // Quoted expressions are the same for every result
    const exactExpressions = query.segments
      .filter(s => s.exact)
      .map(s => s.value)

    // Map the raw results to get usable suggestions
    return results.map(result => {
      let note = cacheManager.getLiveDocument(result.id)
      if (!note) {
        // Fall back to a placeholder instead of failing the whole search
        console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
        note = {
          content: '',
          basename: result.id,
          path: result.id,
        } as IndexedDocument
      }

      // Clean search matches that match quoted expressions,
      // and inject those expressions instead
      const foundWords = [
        // Matching terms from the result,
        // do not necessarily match the query
        ...Object.keys(result.match),
        // Quoted expressions
        ...exactExpressions,
        // Tags, starting with #
        ...tags,
      ].filter(w => w.length > 1)

      const matches = this.getMatches(
        note.content,
        stringsToRegex(foundWords),
        query
      )
      const resultNote: ResultNote = {
        score: result.score,
        foundWords,
        matches,
        ...note,
      }
      return resultNote
    })
  }

  // #region Read/write minisearch index

  /**
   * Adds all the documents to the index, through MiniSearch's `addAllAsync`.
   *
   * @param documents the documents to index
   * @param chunkSize forwarded as MiniSearch's `chunkSize` option
   */
  public async addAllToMinisearch(
    documents: IndexedDocument[],
    chunkSize = 10
  ): Promise<void> {
    await this.minisearch.addAllAsync(documents, { chunkSize })
  }

  /** Adds a single document to the index. */
  public addSingleToMinisearch(document: IndexedDocument): void {
    this.minisearch.add(document)
  }

  /** Removes a single document from the index. */
  public removeFromMinisearch(document: IndexedDocument): void {
    this.minisearch.remove(document)
  }

  // #endregion

  /** Hands the current index and the given documents to the cache manager. */
  public async writeToCache(documents: IndexedDocument[]): Promise<void> {
    await cacheManager.writeMinisearchCache(this.minisearch, documents)
  }
}