Squashed commit of the following:

commit ac82511ddd17d5472ae3cfea9bbad9754f5a4d62
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Oct 22 08:23:42 2022 +0200

    Screw that cache, seriously.

commit 8ba40d1be73daaaffea09e07bc56c339266db9b6
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:36:48 2022 +0200

    Stuff

commit 27b8fd7dc809be9714a109d3a458eb1276a47e2e
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Fri Oct 21 22:22:20 2022 +0200

    Moved files

commit fb1349c914907e586e103ca54fb04b9ddd45ef5d
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 22:25:29 2022 +0200

    Removed duplicate code

commit e7371138e60cbe4155cfd4fb44e3ee1d2e3ee088
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:50:09 2022 +0200

    Moved a bunch of files

commit 2ee1b2a0e799d4b41ab3a444d8cc44dfff5b5623
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 21:32:21 2022 +0200

    Removed useless code

commit 76c530dfb9adbad1bbe9079de2330fe43a044249
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Thu Oct 20 20:44:11 2022 +0200

    Split file reading and indexing

commit c2ecdd79ad (parent 1376cea282)
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sat Oct 22 08:25:34 2022 +0200

25 changed files with 338 additions and 403 deletions

src/search/search.ts (new file, 249 lines)

@@ -0,0 +1,249 @@
import MiniSearch, {
type AsPlainObject,
type Options,
type SearchResult,
} from 'minisearch'
import {
chsRegex,
type IndexedDocument,
type ResultNote,
type SearchMatch,
SPACE_OR_PUNCTUATION,
} from '../globals'
import {
removeDiacritics,
stringsToRegex,
stripMarkdownCharacters,
} from '../tools/utils'
import type { Query } from './query'
import { settings } from '../settings'
import { cacheManager } from '../cache-manager'
let minisearchInstance: MiniSearch<IndexedDocument>
const tokenize = (text: string): string[] => {
const tokens = text.split(SPACE_OR_PUNCTUATION)
const chsSegmenter = (app as any).plugins.plugins['cm-chs-patch']
if (chsSegmenter) {
return tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
} else return tokens
}
const minisearchOptions: Options<IndexedDocument> = {
tokenize,
processTerm: (term: string) =>
(settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
idField: 'path',
fields: [
'basename',
'aliases',
'content',
'headings1',
'headings2',
'headings3',
],
storeFields: ['tags'],
}
/**
* Initializes an empty MiniSearch instance with the default options.
* Documents are added separately with addAllToMinisearch()
*/
export async function initSearchEngine(): Promise<void> {
// Default instance
minisearchInstance = new MiniSearch(minisearchOptions)
}
export async function initSearchEngineFromData(json: string): Promise<void> {
try {
minisearchInstance = MiniSearch.loadJSON(json, minisearchOptions)
console.log('Omnisearch - MiniSearch index loaded from the file')
} catch (e) {
console.error('Omnisearch - Could not load MiniSearch index from json')
console.error(e)
}
}
/**
* Searches the index for the given query,
* and returns an array of raw results
* @param query The parsed search query
* @returns The raw MiniSearch results, filtered by exact terms and exclusions
*/
async function search(query: Query): Promise<SearchResult[]> {
if (!query.segmentsToStr()) return []
let results = minisearchInstance.search(query.segmentsToStr(), {
prefix: true,
fuzzy: term => (term.length > 4 ? 0.2 : false),
combineWith: 'AND',
boost: {
basename: settings.weightBasename,
aliases: settings.weightBasename,
headings1: settings.weightH1,
headings2: settings.weightH2,
headings3: settings.weightH3,
},
})
// Downrank files that are in Obsidian's excluded list
if (settings.respectExcluded) {
results.forEach(result => {
if (
app.metadataCache.isUserIgnored &&
app.metadataCache.isUserIgnored(result.id)
) {
result.score /= 3 // TODO: make this value configurable or toggleable?
}
})
}
// If the search query contains quoted expressions, only keep results that contain them
const exactTerms = query.getExactTerms()
if (exactTerms.length) {
results = results.filter(r => {
const title = cacheManager.getDocument(r.id)?.path.toLowerCase() ?? ''
const content = stripMarkdownCharacters(
cacheManager.getDocument(r.id)?.content ?? ''
).toLowerCase()
return exactTerms.every(q => content.includes(q) || title.includes(q))
})
}
// If the search query contains exclude terms, filter out results that have them
const exclusions = query.exclusions
if (exclusions.length) {
results = results.filter(r => {
const content = stripMarkdownCharacters(
cacheManager.getDocument(r.id)?.content ?? ''
).toLowerCase()
return exclusions.every(q => !content.includes(q.value))
})
}
return results
}
/**
* Runs a regex against a text, and returns the { match, offset } pairs
* @param text The text to scan
* @param reg The regex to execute (must have the global flag)
* @returns An array of SearchMatch objects
*/
export function getMatches(text: string, reg: RegExp): SearchMatch[] {
let match: RegExpExecArray | null = null
const matches: SearchMatch[] = []
let count = 0 // TODO: safety cap; exec() can loop forever if the regex isn't global or matches an empty string
while ((match = reg.exec(text)) !== null) {
if (++count > 100) break
const m = match[0]
if (m) matches.push({ match: m, offset: match.index })
}
return matches
}
/**
* Searches the index, and returns an array of ResultNote objects.
* If the singleFilePath option is set,
* the array contains at most one result, from that file
* @param query The parsed search query
* @param options An optional singleFilePath to restrict results to a single file
* @returns An array of ResultNote objects
*/
export async function getSuggestions(
query: Query,
options?: Partial<{ singleFilePath: string | null }>
): Promise<ResultNote[]> {
// Get the raw results
let results = await search(query)
if (!results.length) return []
// Extract tags from the query
const tags = query.segments
.filter(s => s.value.startsWith('#'))
.map(s => s.value)
// Either keep the first 50 results,
// or only the one corresponding to `singleFilePath`
if (options?.singleFilePath) {
const result = results.find(r => r.id === options.singleFilePath)
if (result) results = [result]
else results = []
} else {
results = results.slice(0, 50)
// Put the results with tags on top
for (const tag of tags) {
for (const result of results) {
if ((result.tags ?? []).includes(tag)) {
result.score *= 100
}
}
}
}
// Map the raw results to get usable suggestions
return results.map(result => {
const note = cacheManager.getDocument(result.id)
if (!note) {
throw new Error(`Note "${result.id}" not indexed`)
}
// Remove '#' from tags, for highlighting
query.segments.forEach(s => {
s.value = s.value.replace(/^#/, '')
})
// Clean search matches that match quoted expressions,
// and inject those expressions instead
const foundWords = [
// Terms that matched in this result;
// they don't necessarily appear in the query
...Object.keys(result.match),
// // Matching terms from the query,
// // but only if they stem from the result's matches
// ...Object.keys(result.match).filter(w =>
// query.segments.some(s => w.startsWith(s.value)),
// ),
// Quoted expressions
...query.segments.filter(s => s.exact).map(s => s.value),
// Tags, starting with #
...tags,
]
const matches = getMatches(note.content, stringsToRegex(foundWords))
const resultNote: ResultNote = {
score: result.score,
foundWords,
matches,
...note,
}
return resultNote
})
}
// #region Read/write minisearch index
export function getMinisearchIndexJSON(): AsPlainObject {
return minisearchInstance.toJSON()
}
export async function addAllToMinisearch(
documents: IndexedDocument[]
): Promise<void> {
await minisearchInstance.addAllAsync(documents)
}
export function addSingleToMinisearch(document: IndexedDocument): void {
minisearchInstance.add(document)
}
export function removeFromMinisearch(document: IndexedDocument): void {
minisearchInstance.remove(document)
}
// #endregion
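
For reference, a minimal sketch (not part of the diff) of how a caller could wire these exports together at startup: reuse the serialized index when a cache exists, otherwise build a fresh one, then query it. The helpers loadCachedIndexJSON(), saveCachedIndexJSON() and collectIndexedDocuments() are hypothetical stand-ins for the plugin's cache layer and vault reader, and the sketch assumes Query can be constructed from a raw query string and that this file sits next to src/main.ts; only the imports from './search/search' come from the code above.

import type { IndexedDocument } from './globals'
import { Query } from './search/query'
import {
  initSearchEngine,
  initSearchEngineFromData,
  addAllToMinisearch,
  getSuggestions,
  getMinisearchIndexJSON,
} from './search/search'

// Hypothetical helpers standing in for the plugin's cache layer and vault reader
declare function loadCachedIndexJSON(): Promise<string | null>
declare function saveCachedIndexJSON(json: string): Promise<void>
declare function collectIndexedDocuments(): Promise<IndexedDocument[]>

async function startOmnisearch(): Promise<void> {
  const cachedJson = await loadCachedIndexJSON()
  if (cachedJson) {
    // Rebuild the MiniSearch instance from the serialized index
    await initSearchEngineFromData(cachedJson)
  } else {
    // Start from an empty index and feed it the vault's documents
    await initSearchEngine()
    await addAllToMinisearch(await collectIndexedDocuments())
    // Persist the index so the next startup can skip re-indexing
    await saveCachedIndexJSON(JSON.stringify(getMinisearchIndexJSON()))
  }

  // Query the index; each ResultNote carries the score, matches, and the note's indexed fields
  const results = await getSuggestions(new Query('"exact phrase" #mytag keyword'))
  console.log(results.map(r => `${r.score} ${r.foundWords.join(', ')}`))
}

The separation sketched here is what the "Split file reading and indexing" commit describes: this module only owns the MiniSearch instance and its serialization, while file reading and cache persistence live in other modules (cacheManager is only consulted for already-indexed documents).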