A Chinese user may also have English notes, but the previous implementation could not handle hyphens and camelCase. This commit fixes that by changing the order in which tokens are generated.
459 lines
13 KiB
TypeScript
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import type {
  DocumentRef,
  IndexedDocument,
  ResultNote,
  SearchMatch,
} from '../globals'
import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
import { settings } from '../settings'
import {
  chunkArray,
  logDebug,
  removeDiacritics,
  splitCamelCase,
  splitHyphens,
  stringsToRegex,
  stripMarkdownCharacters,
  warnDebug,
} from '../tools/utils'
import { Notice } from 'obsidian'
import type { Query } from './query'
import { cacheManager } from '../cache-manager'
import { sortBy } from 'lodash-es'

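/**
 * Tokenizes text for indexing and searching.
 * The order matters: the original space/punctuation-delimited tokens are kept,
 * the hyphen- and camelCase-split parts are appended after them, and the
 * Chinese segmenter (when available) is applied last, on top of that list.
 * E.g. a surviving token "camelCase" stays indexed as-is, with "camel" and "Case"
 * appended after it (assuming splitCamelCase() returns only the split parts).
 */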
const tokenize = (text: string): string[] => {
  let tokens = text.split(SPACE_OR_PUNCTUATION)

  // Split hyphenated tokens
  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]

  // Split camelCase tokens into "camel" and "case"
  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]

  // If the Chinese segmenter is enabled, also cut Chinese tokens into words;
  // other tokens are passed through unchanged
  const chsSegmenter = getChsSegmenter()
  if (chsSegmenter) {
    tokens = tokens.flatMap(word =>
      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
    )
  }

  return tokens
}

export class Omnisearch {
  public static readonly options: Options<IndexedDocument> = {
    tokenize,
    extractField: (doc, fieldName) => {
      if (fieldName === 'directory') {
        // return path without the filename
        const parts = doc.path.split('/')
        parts.pop()
        return parts.join('/')
      }
      return (doc as any)[fieldName]
    },
    processTerm: (term: string) =>
      (settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
    idField: 'path',
    fields: [
      'basename',
      // Different from `path`, since `path` is the unique index and needs to include the filename
      'directory',
      'aliases',
      'content',
      'headings1',
      'headings2',
      'headings3',
    ],
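    // 'tags' is stored alongside each search result so that search() can boost results whose tags match the query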
    storeFields: ['tags'],
    logger(_level, _message, code) {
      if (code === 'version_conflict') {
        new Notice(
          'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
          5000
        )
      }
    },
  }
  private minisearch: MiniSearch
  private indexedDocuments: Map<string, number> = new Map()
  // private previousResults: SearchResult[] = []
  // private previousQuery: Query | null = null

  constructor() {
    this.minisearch = new MiniSearch(Omnisearch.options)
  }

  /**
   * Return true if the cache is valid
   */
  async loadCache(): Promise<boolean> {
    const cache = await cacheManager.getMinisearchCache()
    if (cache) {
      // console.log('Omnisearch - Cache', cache)
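      // loadJS() must be given the same options that were used to build the serialized index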
      this.minisearch = MiniSearch.loadJS(cache.data, Omnisearch.options)
      this.indexedDocuments = new Map(cache.paths.map(o => [o.path, o.mtime]))
      return true
    }
    console.log('Omnisearch - No cache found')
    return false
  }

  /**
   * Returns the list of documents that need to be reindexed
   * @param docs
   */
  getDiff(docs: DocumentRef[]): {
    toAdd: DocumentRef[]
    toRemove: DocumentRef[]
  } {
    const docsMap = new Map(docs.map(d => [d.path, d.mtime]))

    // console.log(this.indexedDocuments)
    const toAdd = docs.filter(
      d =>
        !this.indexedDocuments.has(d.path) ||
        this.indexedDocuments.get(d.path) !== d.mtime
    )
    // console.log(toAdd)
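    // Indexed documents that no longer exist, or whose mtime changed;
    // a modified document appears in both toAdd and toRemove, i.e. it gets reindexed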
    const toRemove = [...this.indexedDocuments]
      .filter(
        ([path, mtime]) => !docsMap.has(path) || docsMap.get(path) !== mtime
      )
      .map(o => ({ path: o[0], mtime: o[1] }))
    return { toAdd, toRemove }
  }

  /**
   * Add notes/PDFs/images to the search index
   * @param paths
   */
  public async addFromPaths(paths: string[]): Promise<void> {
    logDebug('Adding files', paths)
    let documents = (
      await Promise.all(
        paths.map(async path => await cacheManager.getDocument(path))
      )
    ).filter(d => !!d?.path)
    logDebug('Sorting documents to index markdown files first')
    // Index markdown files first
    documents = sortBy(documents, d => (d.path.endsWith('.md') ? 0 : 1))

    // If a document is already added, discard it
    this.removeFromPaths(
      documents.filter(d => this.indexedDocuments.has(d.path)).map(d => d.path)
    )

    // Split the documents into smaller chunks to add them to minisearch
    const chunkedDocs = chunkArray(documents, 500)
    for (const docs of chunkedDocs) {
      logDebug('Indexing into search engine', docs)
      // Update the list of indexed docs
      docs.forEach(doc => this.indexedDocuments.set(doc.path, doc.mtime))

      // Discard files that may have been already added (though it shouldn't happen)
      const alreadyAdded = docs.filter(doc => this.minisearch.has(doc.path))
      this.removeFromPaths(alreadyAdded.map(o => o.path))

      // Add docs to minisearch
      await this.minisearch.addAllAsync(docs)
    }
  }

  /**
   * Discard documents from minisearch
   * @param paths
   */
  public removeFromPaths(paths: string[]): void {
    paths.forEach(p => this.indexedDocuments.delete(p))
    // Make sure to not discard a file that we don't have
    const existing = paths.filter(p => this.minisearch.has(p))
    this.minisearch.discardAll(existing)
  }

  /**
   * Searches the index for the given query,
   * and returns an array of raw results
   */
  public async search(
    query: Query,
    options: { prefixLength: number; singleFilePath?: string }
  ): Promise<SearchResult[]> {
    if (query.isEmpty()) {
      // this.previousResults = []
      // this.previousQuery = null
      return []
    }

    logDebug('Starting search for', query)

    let fuzziness: number
    switch (settings.fuzziness) {
      case '0':
        fuzziness = 0
        break
      case '1':
        fuzziness = 0.1
        break
      default:
        fuzziness = 0.2
        break
    }

    let results = this.minisearch.search(query.segmentsToStr(), {
      prefix: term => term.length >= options.prefixLength,
      // length <= 3: no fuzziness
      // length <= 5: half the configured fuzziness
      // length > 5: full configured fuzziness
      fuzzy: term =>
        term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
      combineWith: 'AND',
      boost: {
        basename: settings.weightBasename,
        directory: settings.weightDirectory,
        aliases: settings.weightBasename,
        headings1: settings.weightH1,
        headings2: settings.weightH2,
        headings3: settings.weightH3,
        unmarkedTags: settings.weightUnmarkedTags,
      },
    })

    logDebug('Found', results.length, 'results')

    // Filter query results to only keep files that match query.query.ext (if any)
    if (query.query.ext?.length) {
      results = results.filter(r => {
        // ".can" should match ".canvas"
        const ext = '.' + r.id.split('.').pop()
        return query.query.ext?.some(e =>
          ext.startsWith(e.startsWith('.') ? e : '.' + e)
        )
      })
    }

    // Filter query results that match the path
    if (query.query.path) {
      results = results.filter(r =>
        query.query.path?.some(p =>
          (r.id as string).toLowerCase().includes(p.toLowerCase())
        )
      )
    }
    if (query.query.exclude.path) {
      results = results.filter(
        r =>
          !query.query.exclude.path?.some(p =>
            (r.id as string).toLowerCase().includes(p.toLowerCase())
          )
      )
    }

    if (!results.length) {
      return []
    }

    if (options.singleFilePath) {
      return results.filter(r => r.id === options.singleFilePath)
    }

    // Hide or downrank files that are in Obsidian's excluded list
    if (settings.hideExcluded) {
      // Filter the files out
      results = results.filter(
        result =>
          !(
            app.metadataCache.isUserIgnored &&
            app.metadataCache.isUserIgnored(result.id)
          )
      )
    } else {
      // Just downrank them
      results.forEach(result => {
        if (
          app.metadataCache.isUserIgnored &&
          app.metadataCache.isUserIgnored(result.id)
        ) {
          result.score /= 10
        }
      })
    }

    // Extract tags from the query
    const tags = query.getTags()

    // Put the results with tags on top
    for (const tag of tags) {
      for (const result of results) {
        if ((result.tags ?? []).includes(tag)) {
          result.score *= 100
        }
      }
    }

    logDebug('Sorting and limiting results')

    // Sort results and keep the 50 best
    results = results.sort((a, b) => b.score - a.score).slice(0, 50)

    const documents = await Promise.all(
      results.map(async result => await cacheManager.getDocument(result.id))
    )

    // If the search query contains quotes, filter out results that don't have the exact match
    const exactTerms = query.getExactTerms()
    if (exactTerms.length) {
      logDebug('Filtering with quoted terms')
      results = results.filter(r => {
        const document = documents.find(d => d.path === r.id)
        const title = document?.path.toLowerCase() ?? ''
        const content = stripMarkdownCharacters(
          document?.content ?? ''
        ).toLowerCase()
        return exactTerms.every(q => content.includes(q) || title.includes(q))
      })
    }

    // If the search query contains exclude terms, filter out results that have them
    const exclusions = query.query.exclude.text
    if (exclusions.length) {
      logDebug('Filtering with exclusions')
      results = results.filter(r => {
        const content = stripMarkdownCharacters(
          documents.find(d => d.path === r.id)?.content ?? ''
        ).toLowerCase()
        return exclusions.every(q => !content.includes(q))
      })
    }

    logDebug('Deduping')
    // FIXME:
    // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
    results = results.filter(
      (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
    )

    // this.previousQuery = query
    // this.previousResults = results

    return results
  }

  public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] {
    const startTime = new Date().getTime()
    let match: RegExpExecArray | null = null
    let matches: SearchMatch[] = []
    let count = 0
    while ((match = reg.exec(text)) !== null) {
      // Avoid infinite loops, stop looking after 100 matches or if we're taking too much time
      if (++count >= 100 || new Date().getTime() - startTime > 50) {
        warnDebug('Stopped getMatches at', count, 'results')
        break
      }
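      // stringsToRegex() presumably captures the matched word in group 2
      // (group 1 being the preceding boundary), hence match[2] and the +1 offset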
      const m = match[2]
      if (m) matches.push({ match: m, offset: match.index + 1 })
    }

    // If the query can be found "as is" in the text, put this match first
    const best = text.toLowerCase().indexOf(query.segmentsToStr())
    if (best > -1) {
      matches = matches.filter(m => m.offset !== best)
      matches.unshift({
        offset: best,
        match: query.segmentsToStr(),
      })
    }

    return matches
  }

  /**
   * Searches the index, and returns an array of ResultNote objects.
   * If the singleFilePath option is set,
   * the array contains a single result from that file
   * @param query
   * @param options
   * @returns
   */
  public async getSuggestions(
    query: Query,
    options?: Partial<{ singleFilePath?: string }>
  ): Promise<ResultNote[]> {
    // Get the raw results
    let results: SearchResult[]
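    // With "simple search" enabled, only terms of 3+ characters are prefix-matched;
    // otherwise prefix matching applies from the first character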
    if (settings.simpleSearch) {
      results = await this.search(query, {
        prefixLength: 3,
        singleFilePath: options?.singleFilePath,
      })
    } else {
      results = await this.search(query, {
        prefixLength: 1,
        singleFilePath: options?.singleFilePath,
      })
    }

    const documents = await Promise.all(
      results.map(async result => await cacheManager.getDocument(result.id))
    )

    // Map the raw results to get usable suggestions
    const resultNotes = results.map(result => {
      logDebug('Locating matches for', result.id)
      let note = documents.find(d => d.path === result.id)
      if (!note) {
        // throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
        console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
        note = {
          content: '',
          basename: result.id,
          path: result.id,
        } as IndexedDocument
      }
      // Clean search matches that match quoted expressions,
      // and inject those expressions instead
      const foundWords = [
        // Matching terms from the result,
        // do not necessarily match the query
        ...Object.keys(result.match),

        // Quoted expressions
        ...query.getExactTerms(),

        // Tags, starting with #
        ...query.getTags(),
      ].filter(w => w.length > 1 || /\p{Emoji}/u.test(w))
      logDebug('Matching tokens:', foundWords)

      logDebug('Getting matches locations...')
      const matches = this.getMatches(
        note.content,
        stringsToRegex(foundWords),
        query
      )
      logDebug('Matches:', matches)
      const resultNote: ResultNote = {
        score: result.score,
        foundWords,
        matches,
        ...note,
      }
      return resultNote
    })
    return resultNotes
  }

  public async writeToCache(): Promise<void> {
    await cacheManager.writeMinisearchCache(
      this.minisearch,
      this.indexedDocuments
    )
  }
}

export const searchEngine = new Omnisearch()