Renamed "Omnisearch" into "SearchEngine"
This commit is contained in:
462
src/search/search-engine.ts
Normal file
462
src/search/search-engine.ts
Normal file
@@ -0,0 +1,462 @@
|
||||
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
|
||||
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
|
||||
|
||||
import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
|
||||
import { Notice } from 'obsidian'
|
||||
import type { Query } from './query'
|
||||
import { sortBy } from 'lodash-es'
|
||||
import type OmnisearchPlugin from '../main'
|
||||
import { Tokenizer } from './tokenizer'
|
||||
|
||||
export class SearchEngine {
|
||||
/** Tokenizer used for search queries and handed to MiniSearch through getOptions() */
private tokenizer: Tokenizer
/** The MiniSearch index wrapped by this engine */
private minisearch: MiniSearch
/** Bookkeeping of what has been indexed: Map<path, mtime> */
private indexedDocuments: Map<string, number> = new Map()
// private previousResults: SearchResult[] = []
// private previousQuery: Query | null = null

/**
 * Creates an engine with an empty index.
 * The tokenizer is created first because getOptions() reads it.
 * @param plugin the plugin instance that gives access to settings, caches and the app
 */
constructor(protected plugin: OmnisearchPlugin) {
  this.tokenizer = new Tokenizer(plugin)
  const indexOptions = this.getOptions()
  this.minisearch = new MiniSearch(indexOptions)
}
|
||||
|
||||
/**
 * Restores the index and the list of indexed documents from the cache manager.
 * @returns true if a cache was found and loaded, false if there was none
 */
async loadCache(): Promise<boolean> {
  const cache = await this.plugin.cacheManager.getMinisearchCache()
  if (!cache) {
    console.log('Omnisearch - No cache found')
    return false
  }

  // Rebuild the index with the same options that are used for a fresh one
  const options = this.getOptions()
  this.minisearch = await MiniSearch.loadJSAsync(cache.data, options)

  // Rebuild the path -> mtime bookkeeping from the cached list
  const restored = new Map<string, number>()
  for (const entry of cache.paths) {
    restored.set(entry.path, entry.mtime)
  }
  this.indexedDocuments = restored
  return true
}
|
||||
|
||||
/**
 * Compares the given documents with what is currently tracked as indexed.
 * @param docs the documents that should be in the index (path + mtime)
 * @returns `toAdd`: given documents that are untracked or tracked with another mtime;
 *          `toRemove`: tracked entries that are absent from `docs` or listed with another mtime
 */
getDiff(docs: DocumentRef[]): {
  toAdd: DocumentRef[]
  toRemove: DocumentRef[]
} {
  const wanted = new Map(docs.map(d => [d.path, d.mtime]))
  const tracked = this.indexedDocuments

  // An entry is in sync only if the other side knows the path with the same mtime
  const outOfSync = (
    other: Map<string, number>,
    path: string,
    mtime: number
  ): boolean => !(other.has(path) && other.get(path) === mtime)

  const toAdd = docs.filter(doc => outOfSync(tracked, doc.path, doc.mtime))

  const toRemove: DocumentRef[] = []
  for (const [path, mtime] of tracked) {
    if (outOfSync(wanted, path, mtime)) {
      toRemove.push({ path: path, mtime: mtime })
    }
  }

  return { toAdd, toRemove }
}
|
||||
|
||||
/**
 * Add notes/PDFs/images to the search index.
 *
 * Documents are fetched from the cache manager, markdown files are indexed
 * first, and anything that is already tracked or already present in MiniSearch
 * is discarded before being added again.
 * @param paths paths of the files to (re)index; duplicates are ignored
 */
public async addFromPaths(paths: string[]): Promise<void> {
  logDebug('Adding files', paths)

  // MiniSearch rejects duplicate ids, so make sure each path is processed once
  const uniquePaths = Array.from(new Set(paths))

  let documents = (
    await Promise.all(
      uniquePaths.map(
        async path => await this.plugin.cacheManager.getDocument(path)
      )
    )
  ).filter(d => !!d?.path)
  logDebug('Sorting documents to first index markdown')
  // Index markdown files first
  documents = sortBy(documents, d => (d.path.endsWith('.md') ? 0 : 1))

  // If a document is already added, discard it
  this.removeFromPaths(
    documents.filter(d => this.indexedDocuments.has(d.path)).map(d => d.path)
  )

  // Split the documents in smaller chunks to add them to minisearch
  const chunkedDocs = chunkArray(documents, 500)
  for (const docs of chunkedDocs) {
    logDebug('Indexing into search engine', docs)

    // Discard files that may have been already added (though it shouldn't happen).
    // This must run BEFORE the bookkeeping update below: removeFromPaths() also
    // deletes the paths from indexedDocuments, so doing it afterwards would leave
    // these documents indexed but untracked.
    const alreadyAdded = docs.filter(doc => this.minisearch.has(doc.path))
    this.removeFromPaths(alreadyAdded.map(o => o.path))

    // Update the list of indexed docs
    docs.forEach(doc => this.indexedDocuments.set(doc.path, doc.mtime))

    // Add docs to minisearch
    await this.minisearch.addAllAsync(docs)
  }
}
|
||||
|
||||
/**
 * Stops tracking the given paths and discards them from minisearch.
 * @param paths paths of the documents to remove
 */
public removeFromPaths(paths: string[]): void {
  const presentInIndex: string[] = []
  for (const path of paths) {
    this.indexedDocuments.delete(path)
    // Only discard what minisearch actually holds
    if (this.minisearch.has(path)) {
      presentInIndex.push(path)
    }
  }
  this.minisearch.discardAll(presentInIndex)
}
|
||||
|
||||
/**
 * Searches the index for the given query,
 * and returns an array of raw results.
 *
 * Steps: tokenize the query and run it through MiniSearch, narrow the hits
 * with the query's extension/path filters, then adjust the scores (excluded
 * files, downranked folders, custom properties, tags), keep the 50 best and
 * apply the quoted-terms and exclusion filters.
 *
 * @param query the parsed query; an empty query yields an empty array
 * @param options `prefixLength` is the minimum term length for prefix matching.
 *   When `singleFilePath` is set, the hits for that path are returned right
 *   after the extension/path filters, without any of the later steps.
 * @returns the raw MiniSearch results that survived the filters
 */
public async search(
  query: Query,
  options: { prefixLength: number; singleFilePath?: string }
): Promise<SearchResult[]> {
  const settings = this.plugin.settings
  if (query.isEmpty()) {
    // this.previousResults = []
    // this.previousQuery = null
    return []
  }

  logDebug('=== New search ===')
  logDebug('Starting search for', query)

  // Translate the fuzziness setting into the base value given to MiniSearch
  let fuzziness: number
  switch (settings.fuzziness) {
    case '0':
      fuzziness = 0
      break
    case '1':
      fuzziness = 0.1
      break
    default:
      fuzziness = 0.2
      break
  }

  const searchTokens = this.tokenizer.tokenizeForSearch(query.segmentsToStr())
  logDebug(JSON.stringify(searchTokens, null, 1))
  let results = this.minisearch.search(searchTokens, {
    prefix: term => term.length >= options.prefixLength,
    // length <= 3: no fuzziness
    // length <= 5: half of the configured fuzziness
    // length > 5: the configured fuzziness
    fuzzy: term =>
      term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
    boost: {
      basename: settings.weightBasename,
      directory: settings.weightDirectory,
      aliases: settings.weightBasename,
      headings1: settings.weightH1,
      headings2: settings.weightH2,
      headings3: settings.weightH3,
      tags: settings.weightUnmarkedTags,
      unmarkedTags: settings.weightUnmarkedTags,
    },
    // The query is already tokenized, don't tokenize again
    tokenize: text => [text],
  })

  logDebug('Found', results.length, 'results')

  // Filter query results to only keep files that match query.query.ext (if any)
  if (query.query.ext?.length) {
    results = results.filter(r => {
      // ".can" should match ".canvas"
      const ext = '.' + r.id.split('.').pop()
      return query.query.ext?.some(e =>
        ext.startsWith(e.startsWith('.') ? e : '.' + e)
      )
    })
  }

  // Filter query results that match the path
  if (query.query.path) {
    results = results.filter(r =>
      query.query.path?.some(p =>
        (r.id as string).toLowerCase().includes(p.toLowerCase())
      )
    )
  }
  if (query.query.exclude.path) {
    results = results.filter(
      r =>
        !query.query.exclude.path?.some(p =>
          (r.id as string).toLowerCase().includes(p.toLowerCase())
        )
    )
  }

  if (!results.length) {
    return []
  }

  if (options.singleFilePath) {
    return results.filter(r => r.id === options.singleFilePath)
  }

  logDebug(
    'searching with downranked folders',
    settings.downrankedFoldersFilters
  )

  // Hide or downrank files that are in Obsidian's excluded list
  if (settings.hideExcluded) {
    // Filter the files out
    results = results.filter(
      result =>
        !(
          this.plugin.app.metadataCache.isUserIgnored &&
          this.plugin.app.metadataCache.isUserIgnored(result.id)
        )
    )
  } else {
    // Just downrank them
    results.forEach(result => {
      if (
        this.plugin.app.metadataCache.isUserIgnored &&
        this.plugin.app.metadataCache.isUserIgnored(result.id)
      ) {
        result.score /= 10
      }
    })
  }

  // Extract tags from the query
  const tags = query.getTags()

  for (const result of results) {
    const path = result.id
    if (settings.downrankedFoldersFilters.length > 0) {
      // downrank files that are in folders listed in the downrankedFoldersFilters
      let downrankingFolder = false
      settings.downrankedFoldersFilters.forEach(filter => {
        if (path.startsWith(filter)) {
          // we don't want the filter to match the folder sources, e.g.
          // it needs to match a whole folder name
          if (path === filter || path.startsWith(filter + '/')) {
            logDebug('searching with downranked folders in path: ', path)
            downrankingFolder = true
          }
        }
      })
      if (downrankingFolder) {
        result.score /= 10
      }
      // Also downrank when any single path segment equals a filter.
      // NOTE(review): a path that satisfies both checks is divided by 10 twice.
      const pathParts = path.split('/')
      const pathPartsLength = pathParts.length
      for (let i = 0; i < pathPartsLength; i++) {
        const pathPart = pathParts[i]
        if (settings.downrankedFoldersFilters.includes(pathPart)) {
          result.score /= 10
          break
        }
      }
    }

    // Boost custom properties
    const metadata = this.plugin.app.metadataCache.getCache(path)
    if (metadata) {
      for (const { name, weight } of settings.weightCustomProperties) {
        const values: unknown = metadata.frontmatter?.[name]
        // A frontmatter value is not necessarily a string or a list (it can be a
        // number, a boolean, an object...). Only strings and arrays have
        // `includes`; anything else is skipped instead of throwing.
        const matchesTerm = (term: string): boolean =>
          typeof values === 'string'
            ? values.includes(term)
            : Array.isArray(values) && values.includes(term)
        if (result.terms.some(term => matchesTerm(term))) {
          logDebug(`Boosting field "${name}" x${weight} for ${path}`)
          result.score *= weight
        }
      }
    }

    // Put the results with tags on top
    for (const tag of tags) {
      if ((result.tags ?? []).includes(tag)) {
        result.score *= 100
      }
    }
  }
  logDebug('Sorting and limiting results')

  // Sort results and keep the 50 best
  results = results.sort((a, b) => b.score - a.score).slice(0, 50)

  if (results.length) logDebug('First result:', results[0])

  const documents = await Promise.all(
    results.map(
      async result => await this.plugin.cacheManager.getDocument(result.id)
    )
  )

  // If the search query contains quotes, filter out results that don't have the exact match
  const exactTerms = query.getExactTerms()
  if (exactTerms.length) {
    logDebug('Filtering with quoted terms: ', exactTerms)
    results = results.filter(r => {
      // `d?.` keeps a missing document from throwing; it is treated as empty
      const document = documents.find(d => d?.path === r.id)
      const title = document?.path.toLowerCase() ?? ''
      const content = (document?.cleanedContent ?? '').toLowerCase()
      return exactTerms.every(
        q => content.includes(q) || removeDiacritics(title).includes(q)
      )
    })
  }

  // If the search query contains exclude terms, filter out results that have them
  const exclusions = query.query.exclude.text
  if (exclusions.length) {
    logDebug('Filtering with exclusions')
    results = results.filter(r => {
      const content = (
        documents.find(d => d?.path === r.id)?.content ?? ''
      ).toLowerCase()
      return exclusions.every(q => !content.includes(q))
    })
  }

  logDebug('Deduping')
  // FIXME:
  // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
  results = results.filter(
    (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
  )

  // this.previousQuery = query
  // this.previousResults = results

  return results
}
|
||||
|
||||
/**
 * Searches the index, and returns an array of ResultNote objects.
 * If we have the singleFile option set,
 * the array only contains results from that file.
 * @param query the parsed query
 * @param options `singleFilePath` restricts the results to that path
 * @returns one ResultNote per raw result, in the order returned by search()
 */
public async getSuggestions(
  query: Query,
  options?: Partial<{ singleFilePath?: string }>
): Promise<ResultNote[]> {
  // Get the raw results.
  // Simple search only prefix-matches terms of 3+ characters.
  const prefixLength = this.plugin.settings.simpleSearch ? 3 : 1
  const results: SearchResult[] = await this.search(query, {
    prefixLength,
    singleFilePath: options?.singleFilePath,
  })

  const documents = await Promise.all(
    results.map(
      async result => await this.plugin.cacheManager.getDocument(result.id)
    )
  )

  // Map the raw results to get usable suggestions
  const resultNotes = results.map(result => {
    logDebug('Locating matches for', result.id)
    // `d?.` so that a missing document reaches the fallback below instead of throwing
    let note = documents.find(d => d?.path === result.id)
    if (!note) {
      // throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
      console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
      note = {
        content: '',
        basename: result.id,
        path: result.id,
      } as IndexedDocument
    }

    // Clean search matches that match quoted expressions,
    // and inject those expressions instead
    const foundWords = [
      // Matching terms from the result,
      // do not necessarily match the query
      ...result.terms,

      // Quoted expressions
      ...query.getExactTerms(),

      // Tags, starting with #
      ...query.getTags(),
    ]
    logDebug('Matching tokens:', foundWords)

    logDebug('Getting matches locations...')
    const matches = this.plugin.textProcessor.getMatches(
      note.content,
      foundWords,
      query
    )
    logDebug(`Matches for ${note.basename}`, matches)
    const resultNote: ResultNote = {
      score: result.score,
      foundWords,
      matches,
      ...note,
    }
    return resultNote
  })
  return resultNotes
}
|
||||
|
||||
/**
 * Hands the current index and the path -> mtime bookkeeping
 * to the cache manager so it can write them.
 */
public async writeToCache(): Promise<void> {
  const { minisearch, indexedDocuments } = this
  await this.plugin.cacheManager.writeMinisearchCache(
    minisearch,
    indexedDocuments
  )
}
|
||||
|
||||
/**
 * Builds the MiniSearch options, used both for a fresh index
 * and for an index restored from the cache.
 * @returns the options: tokenizer, field extraction, term processing,
 *          indexed/stored fields and a logger that warns about version conflicts
 */
private getOptions(): Options<IndexedDocument> {
  return {
    // Bound to the tokenizer: MiniSearch calls this as a plain function,
    // which would otherwise lose `this` inside tokenizeForIndexing
    tokenize: this.tokenizer.tokenizeForIndexing.bind(this.tokenizer),
    extractField: (doc, fieldName) => {
      if (fieldName === 'directory') {
        // return path without the filename
        const parts = doc.path.split('/')
        parts.pop()
        return parts.join('/')
      }
      return (doc as any)[fieldName]
    },
    // Terms are lowercased, and stripped of diacritics when the setting asks for it
    processTerm: (term: string) =>
      (this.plugin.settings.ignoreDiacritics
        ? removeDiacritics(term)
        : term
      ).toLowerCase(),
    idField: 'path',
    fields: [
      'basename',
      // Different from `path`, since `path` is the unique index and needs to include the filename
      'directory',
      'aliases',
      'content',
      'headings1',
      'headings2',
      'headings3',
    ],
    storeFields: ['tags'],
    logger(_level, _message, code) {
      if (code === 'version_conflict') {
        new Notice(
          'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
          5000
        )
      }
    },
  }
}
|
||||
}
|
||||
Reference in New Issue
Block a user