Files
obsidian-tannersearch/src/search/omnisearch.ts
YuNing Chen 095e5f841d Change the order of tokenizing text. (#267)
A Chinese user may also have english notes, but previous implementation can not handle hyphens and camel case

This commit should fix by changing the order of how tokens are generated
2023-08-02 09:59:30 +02:00

459 lines
13 KiB
TypeScript

import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import type {
DocumentRef,
IndexedDocument,
ResultNote,
SearchMatch,
} from '../globals'
import { chsRegex, getChsSegmenter, SPACE_OR_PUNCTUATION } from '../globals'
import { settings } from '../settings'
import {
chunkArray,
logDebug,
removeDiacritics,
splitCamelCase,
splitHyphens,
stringsToRegex,
stripMarkdownCharacters,
warnDebug,
} from '../tools/utils'
import { Notice } from 'obsidian'
import type { Query } from './query'
import { cacheManager } from '../cache-manager'
import { sortBy } from 'lodash-es'
const tokenize = (text: string): string[] => {
let tokens = text.split(SPACE_OR_PUNCTUATION)
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// When enabled, we only use the chsSegmenter,
// and not the other custom tokenizers
const chsSegmenter = getChsSegmenter()
if (chsSegmenter) {
tokens = tokens.flatMap(word =>
chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
)
}
return tokens
}
export class Omnisearch {
public static readonly options: Options<IndexedDocument> = {
tokenize,
extractField: (doc, fieldName) => {
if (fieldName === 'directory') {
// return path without the filename
const parts = doc.path.split('/')
parts.pop()
return parts.join('/')
}
return (doc as any)[fieldName]
},
processTerm: (term: string) =>
(settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
idField: 'path',
fields: [
'basename',
// Different from `path`, since `path` is the unique index and needs to include the filename
'directory',
'aliases',
'content',
'headings1',
'headings2',
'headings3',
],
storeFields: ['tags'],
logger(_level, _message, code) {
if (code === 'version_conflict') {
new Notice(
'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
5000
)
}
},
}
private minisearch: MiniSearch
private indexedDocuments: Map<string, number> = new Map()
// private previousResults: SearchResult[] = []
// private previousQuery: Query | null = null
constructor() {
this.minisearch = new MiniSearch(Omnisearch.options)
}
/**
* Return true if the cache is valid
*/
async loadCache(): Promise<boolean> {
const cache = await cacheManager.getMinisearchCache()
if (cache) {
// console.log('Omnisearch - Cache', cache)
this.minisearch = MiniSearch.loadJS(cache.data, Omnisearch.options)
this.indexedDocuments = new Map(cache.paths.map(o => [o.path, o.mtime]))
return true
}
console.log('Omnisearch - No cache found')
return false
}
/**
* Returns the list of documents that need to be reindexed
* @param docs
*/
getDiff(docs: DocumentRef[]): {
toAdd: DocumentRef[]
toRemove: DocumentRef[]
} {
const docsMap = new Map(docs.map(d => [d.path, d.mtime]))
// console.log(this.indexedDocuments)
const toAdd = docs.filter(
d =>
!this.indexedDocuments.has(d.path) ||
this.indexedDocuments.get(d.path) !== d.mtime
)
// console.log(toAdd)
const toRemove = [...this.indexedDocuments]
.filter(
([path, mtime]) => !docsMap.has(path) || docsMap.get(path) !== mtime
)
.map(o => ({ path: o[0], mtime: o[1] }))
return { toAdd, toRemove }
}
/**
* Add notes/PDFs/images to the search index
* @param paths
*/
public async addFromPaths(paths: string[]): Promise<void> {
logDebug('Adding files', paths)
let documents = (
await Promise.all(
paths.map(async path => await cacheManager.getDocument(path))
)
).filter(d => !!d?.path)
logDebug('Sorting documents to first index markdown')
// Index markdown files first
documents = sortBy(documents, d => (d.path.endsWith('.md') ? 0 : 1))
// If a document is already added, discard it
this.removeFromPaths(
documents.filter(d => this.indexedDocuments.has(d.path)).map(d => d.path)
)
// Split the documents in smaller chunks to add them to minisearch
const chunkedDocs = chunkArray(documents, 500)
for (const docs of chunkedDocs) {
logDebug('Indexing into search engine', docs)
// Update the list of indexed docs
docs.forEach(doc => this.indexedDocuments.set(doc.path, doc.mtime))
// Discard files that may have been already added (though it shouldn't happen)
const alreadyAdded = docs.filter(doc => this.minisearch.has(doc.path))
this.removeFromPaths(alreadyAdded.map(o => o.path))
// Add docs to minisearch
await this.minisearch.addAllAsync(docs)
}
}
/**
* Discard a document from minisearch
* @param paths
*/
public removeFromPaths(paths: string[]): void {
paths.forEach(p => this.indexedDocuments.delete(p))
// Make sure to not discard a file that we don't have
const existing = paths.filter(p => this.minisearch.has(p))
this.minisearch.discardAll(existing)
}
/**
* Searches the index for the given query,
* and returns an array of raw results
*/
public async search(
query: Query,
options: { prefixLength: number; singleFilePath?: string }
): Promise<SearchResult[]> {
if (query.isEmpty()) {
// this.previousResults = []
// this.previousQuery = null
return []
}
logDebug('Starting search for', query)
let fuzziness: number
switch (settings.fuzziness) {
case '0':
fuzziness = 0
break
case '1':
fuzziness = 0.1
break
default:
fuzziness = 0.2
break
}
let results = this.minisearch.search(query.segmentsToStr(), {
prefix: term => term.length >= options.prefixLength,
// length <= 3: no fuzziness
// length <= 5: fuzziness of 10%
// length > 5: fuzziness of 20%
fuzzy: term =>
term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
combineWith: 'AND',
boost: {
basename: settings.weightBasename,
directory: settings.weightDirectory,
aliases: settings.weightBasename,
headings1: settings.weightH1,
headings2: settings.weightH2,
headings3: settings.weightH3,
unmarkedTags: settings.weightUnmarkedTags,
},
})
logDebug('Found', results.length, 'results')
// Filter query results to only keep files that match query.query.ext (if any)
if (query.query.ext?.length) {
results = results.filter(r => {
// ".can" should match ".canvas"
const ext = '.' + r.id.split('.').pop()
return query.query.ext?.some(e =>
ext.startsWith(e.startsWith('.') ? e : '.' + e)
)
})
}
// Filter query results that match the path
if (query.query.path) {
results = results.filter(r =>
query.query.path?.some(p =>
(r.id as string).toLowerCase().includes(p.toLowerCase())
)
)
}
if (query.query.exclude.path) {
results = results.filter(
r =>
!query.query.exclude.path?.some(p =>
(r.id as string).toLowerCase().includes(p.toLowerCase())
)
)
}
if (!results.length) {
return []
}
if (options.singleFilePath) {
return results.filter(r => r.id === options.singleFilePath)
}
// Hide or downrank files that are in Obsidian's excluded list
if (settings.hideExcluded) {
// Filter the files out
results = results.filter(
result =>
!(
app.metadataCache.isUserIgnored &&
app.metadataCache.isUserIgnored(result.id)
)
)
} else {
// Just downrank them
results.forEach(result => {
if (
app.metadataCache.isUserIgnored &&
app.metadataCache.isUserIgnored(result.id)
) {
result.score /= 10
}
})
}
// Extract tags from the query
const tags = query.getTags()
// Put the results with tags on top
for (const tag of tags) {
for (const result of results) {
if ((result.tags ?? []).includes(tag)) {
result.score *= 100
}
}
}
logDebug('Sorting and limiting results')
// Sort results and keep the 50 best
results = results.sort((a, b) => b.score - a.score).slice(0, 50)
const documents = await Promise.all(
results.map(async result => await cacheManager.getDocument(result.id))
)
// If the search query contains quotes, filter out results that don't have the exact match
const exactTerms = query.getExactTerms()
if (exactTerms.length) {
logDebug('Filtering with quoted terms')
results = results.filter(r => {
const document = documents.find(d => d.path === r.id)
const title = document?.path.toLowerCase() ?? ''
const content = stripMarkdownCharacters(
document?.content ?? ''
).toLowerCase()
return exactTerms.every(q => content.includes(q) || title.includes(q))
})
}
// If the search query contains exclude terms, filter out results that have them
const exclusions = query.query.exclude.text
if (exclusions.length) {
logDebug('Filtering with exclusions')
results = results.filter(r => {
const content = stripMarkdownCharacters(
documents.find(d => d.path === r.id)?.content ?? ''
).toLowerCase()
return exclusions.every(q => !content.includes(q))
})
}
logDebug('Deduping')
// FIXME:
// Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
results = results.filter(
(result, index, arr) => arr.findIndex(t => t.id === result.id) === index
)
// this.previousQuery = query
// this.previousResults = results
return results
}
public getMatches(text: string, reg: RegExp, query: Query): SearchMatch[] {
const startTime = new Date().getTime()
let match: RegExpExecArray | null = null
let matches: SearchMatch[] = []
let count = 0
while ((match = reg.exec(text)) !== null) {
// Avoid infinite loops, stop looking after 100 matches or if we're taking too much time
if (++count >= 100 || new Date().getTime() - startTime > 50) {
warnDebug('Stopped getMatches at', count, 'results')
break
}
const m = match[2]
if (m) matches.push({ match: m, offset: match.index + 1 })
}
// If the query can be found "as is" in the text, put this match first
const best = text.toLowerCase().indexOf(query.segmentsToStr())
if (best > -1) {
matches = matches.filter(m => m.offset !== best)
matches.unshift({
offset: best,
match: query.segmentsToStr(),
})
}
return matches
}
/**
* Searches the index, and returns an array of ResultNote objects.
* If we have the singleFile option set,
* the array contains a single result from that file
* @param query
* @param options
* @returns
*/
public async getSuggestions(
query: Query,
options?: Partial<{ singleFilePath?: string }>
): Promise<ResultNote[]> {
// Get the raw results
let results: SearchResult[]
if (settings.simpleSearch) {
results = await this.search(query, {
prefixLength: 3,
singleFilePath: options?.singleFilePath,
})
} else {
results = await this.search(query, {
prefixLength: 1,
singleFilePath: options?.singleFilePath,
})
}
const documents = await Promise.all(
results.map(async result => await cacheManager.getDocument(result.id))
)
// Map the raw results to get usable suggestions
const resultNotes = results.map(result => {
logDebug('Locating matches for', result.id)
let note = documents.find(d => d.path === result.id)
if (!note) {
// throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
note = {
content: '',
basename: result.id,
path: result.id,
} as IndexedDocument
}
// Clean search matches that match quoted expressions,
// and inject those expressions instead
const foundWords = [
// Matching terms from the result,
// do not necessarily match the query
...Object.keys(result.match),
// Quoted expressions
...query.getExactTerms(),
// Tags, starting with #
...query.getTags(),
].filter(w => w.length > 1 || /\p{Emoji}/u.test(w))
logDebug('Matching tokens:', foundWords)
logDebug('Getting matches locations...')
const matches = this.getMatches(
note.content,
stringsToRegex(foundWords),
query
)
logDebug('Matches:', matches)
const resultNote: ResultNote = {
score: result.score,
foundWords,
matches,
...note,
}
return resultNote
})
return resultNotes
}
public async writeToCache(): Promise<void> {
await cacheManager.writeMinisearchCache(
this.minisearch,
this.indexedDocuments
)
}
}
export const searchEngine = new Omnisearch()