WIP refactor to take advantage of minisearch 6.0

This commit is contained in:
Simon Cambier
2022-11-25 08:28:50 +01:00
parent 2daa718bc9
commit dcef2d3719
10 changed files with 172 additions and 97 deletions

View File

@@ -6,6 +6,10 @@ import { minisearchOptions } from './search/search-engine'
import { makeMD5 } from './tools/utils'
class CacheManager {
/**
* @deprecated
* @private
*/
private liveDocuments: Map<string, IndexedDocument> = new Map()
/**
* Show an empty input field next time the user opens Omnisearch modal
@@ -39,6 +43,7 @@ class CacheManager {
/**
* Important: keep this method async for the day it _really_ becomes async.
* This will avoid a refactor.
* @deprecated
* @param path
* @param note
*/
@@ -49,19 +54,22 @@ class CacheManager {
this.liveDocuments.set(path, note)
}
/**
* @deprecated
* @param key
*/
public deleteLiveDocument(key: string): void {
this.liveDocuments.delete(key)
}
/**
* @deprecated
* @param key
*/
public getLiveDocument(key: string): IndexedDocument | undefined {
return this.liveDocuments.get(key)
}
public isDocumentOutdated(file: TFile): boolean {
const indexedNote = this.getLiveDocument(file.path)
return !indexedNote || indexedNote.mtime !== file.stat.mtime
}
//#region Minisearch
public getDocumentsChecksum(documents: IndexedDocument[]): string {
@@ -82,7 +90,6 @@ class CacheManager {
public async getMinisearchCache(): Promise<MiniSearch | null> {
// Retrieve documents and make their checksum
const cachedDocs = await database.documents.toArray()
const checksum = this.getDocumentsChecksum(cachedDocs.map(d => d.document))
// Add those documents in the live cache
cachedDocs.forEach(doc =>
@@ -91,13 +98,6 @@ class CacheManager {
// Retrieve the search cache, and verify the checksum
const cachedIndex = (await database.minisearch.toArray())[0]
if (cachedIndex?.checksum !== checksum) {
console.warn("Omnisearch - Cache - Checksums don't match, clearing cache")
// Invalid (or null) cache, clear everything
await database.minisearch.clear()
await database.documents.clear()
return null
}
try {
return MiniSearch.loadJS(cachedIndex.data, minisearchOptions)
@@ -116,7 +116,7 @@ class CacheManager {
* @param documents
*/
public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
toDelete: IndexedDocument[]
toDelete: string[]
toAdd: IndexedDocument[]
toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
}> {
@@ -128,7 +128,7 @@ class CacheManager {
// present in `cachedDocs` but not in `documents`
const toDelete = cachedDocs
.filter(c => !documents.find(d => d.path === c.path))
.map(d => d.document)
.map(d => d.path)
// toUpdate: same path, but different mtime
const toUpdate = cachedDocs
@@ -158,7 +158,7 @@ class CacheManager {
// Delete
// console.log(`Omnisearch - Cache - Will delete ${toDelete.length} documents`)
await database.documents.bulkDelete(toDelete.map(o => o.path))
await database.documents.bulkDelete(toDelete)
// Add
// console.log(`Omnisearch - Cache - Will add ${toAdd.length} documents`)

View File

@@ -6,7 +6,7 @@
import { eventBus, IndexingStep, type ResultNote } from 'src/globals'
import { createNote, openNote } from 'src/tools/notes'
import { SearchEngine } from 'src/search/search-engine'
import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/tools/utils'
import { getCtrlKeyLabel, getExtension, isFilePDF, loopIndex } from 'src/tools/utils'
import {
OmnisearchInFileModal,
type OmnisearchVaultModal,
@@ -189,7 +189,7 @@
function switchToInFileModal(): void {
// Do nothing if the selectedNote is a PDF,
// or if there is 0 match (e.g indexing in progress)
if (selectedNote?.path.endsWith('.pdf') || !selectedNote?.matches.length) {
if (isFilePDF(selectedNote?.path) || !selectedNote?.matches.length) {
return
}

View File

@@ -21,6 +21,7 @@
// @ts-ignore
const file = app.vault.getFiles().find(f => f.path === note.path)
if (file) {
// @ts-ignore
imagePath = app.vault.getResourcePath(file)
}
}

View File

@@ -32,12 +32,26 @@ export class OmnisearchCache extends Dexie {
//#region Table declarations
documents!: Dexie.Table<
{ path: string; mtime: number; document: IndexedDocument },
{
path: string
mtime: number
/**
* @deprecated
*/
document: IndexedDocument
},
string
>
searchHistory!: Dexie.Table<{ id?: number; query: string }, number>
minisearch!: Dexie.Table<
{ date: string; checksum: string; data: AsPlainObject },
{
date: string
/**
* @deprecated
*/
checksum: string
data: AsPlainObject
},
string
>

View File

@@ -4,14 +4,13 @@ import {
getAliasesFromMetadata,
getTagsFromMetadata,
isFileImage,
isFilePDF,
isFilePlaintext,
removeDiacritics,
} from './tools/utils'
import * as NotesIndex from './notes-index'
import type { TFile } from 'obsidian'
import type { IndexedDocument } from './globals'
import { getNonExistingNotes } from './tools/notes'
import { getPdfText, getImageText } from 'obsidian-text-extract'
import { getImageText, getPdfText } from 'obsidian-text-extract'
/**
* Return all plaintext files as IndexedDocuments
@@ -20,9 +19,9 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
const allFiles = app.vault.getFiles().filter(f => isFilePlaintext(f.path))
const data: IndexedDocument[] = []
for (const file of allFiles) {
const doc = await fileToIndexedDocument(file)
const doc = await getIndexedDocument(file.path)
data.push(doc)
await cacheManager.updateLiveDocument(file.path, doc)
// await cacheManager.updateLiveDocument(file.path, doc)
}
return data
}
@@ -31,7 +30,7 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
* Return all PDFs as IndexedDocuments.
*/
export async function getPDFAsDocuments(): Promise<IndexedDocument[]> {
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
const files = app.vault.getFiles().filter(f => isFilePDF(f.path))
return await getBinaryFiles(files)
}
@@ -49,8 +48,8 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
for (const file of files) {
input.push(
new Promise(async (resolve, reject) => {
const doc = await fileToIndexedDocument(file)
await cacheManager.updateLiveDocument(file.path, doc)
const doc = await getIndexedDocument(file.path)
// await cacheManager.updateLiveDocument(file.path, doc)
data.push(doc)
return resolve(null)
})
@@ -60,37 +59,34 @@ async function getBinaryFiles(files: TFile[]): Promise<IndexedDocument[]> {
return data
}
/**
* Convert a file into an IndexedDocument.
* Will use the cache if possible.
*/
export async function fileToIndexedDocument(
file: TFile
export async function getIndexedDocument(
path: string
): Promise<IndexedDocument> {
const file = app.vault.getFiles().find(f => f.path === path)
if (!file) throw new Error(`Invalid file path: "${path}"`)
let content: string
if (isFilePlaintext(file.path)) {
if (isFilePlaintext(path)) {
content = await app.vault.cachedRead(file)
} else if (file.path.endsWith('.pdf')) {
} else if (isFilePDF(path)) {
content = await getPdfText(file)
} else if (isFileImage(file.path)) {
content = await getImageText(file)
} else {
throw new Error('Invalid file: ' + file.path)
throw new Error('Invalid file format: ' + file.path)
}
content = removeDiacritics(content)
const metadata = app.metadataCache.getFileCache(file)
// Look for links that lead to non-existing files,
// and add them to the index.
if (metadata) {
// FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
const nonExisting = getNonExistingNotes(file, metadata)
for (const name of nonExisting.filter(
o => !cacheManager.getLiveDocument(o)
)) {
NotesIndex.addNonExistingToIndex(name, file.path)
}
// // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
// const nonExisting = getNonExistingNotes(file, metadata)
// for (const name of nonExisting.filter(
// o => !cacheManager.getLiveDocument(o)
// )) {
// NotesIndex.addNonExistingToIndex(name, file.path)
// }
// EXCALIDRAW
// Remove the json code
@@ -117,3 +113,61 @@ export async function fileToIndexedDocument(
headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
}
}
/**
* Convert a file into an IndexedDocument.
* Will use the cache if possible.
*/
// async function fileToIndexedDocument(
// file: TFile
// ): Promise<IndexedDocument> {
// let content: string
// if (isFilePlaintext(file.path)) {
// content = await app.vault.cachedRead(file)
// } else if (isFilePDF(file.path)) {
// content = await getPdfText(file)
// } else if (isFileImage(file.path)) {
// content = await getImageText(file)
// } else {
// throw new Error('Invalid file: ' + file.path)
// }
//
// content = removeDiacritics(content)
// const metadata = app.metadataCache.getFileCache(file)
//
// // Look for links that lead to non-existing files,
// // and add them to the index.
// if (metadata) {
// // FIXME: https://github.com/scambier/obsidian-omnisearch/issues/129
// const nonExisting = getNonExistingNotes(file, metadata)
// for (const name of nonExisting.filter(
// o => !cacheManager.getLiveDocument(o)
// )) {
// NotesIndex.addNonExistingToIndex(name, file.path)
// }
//
// // EXCALIDRAW
// // Remove the json code
// if (metadata.frontmatter?.['excalidraw-plugin']) {
// const comments =
// metadata.sections?.filter(s => s.type === 'comment') ?? []
// for (const { start, end } of comments.map(c => c.position)) {
// content =
// content.substring(0, start.offset - 1) + content.substring(end.offset)
// }
// }
// }
//
// return {
// basename: removeDiacritics(file.basename),
// content,
// path: file.path,
// mtime: file.stat.mtime,
//
// tags: getTagsFromMetadata(metadata),
// aliases: getAliasesFromMetadata(metadata).join(''),
// headings1: metadata ? extractHeadingsFromCache(metadata, 1).join(' ') : '',
// headings2: metadata ? extractHeadingsFromCache(metadata, 2).join(' ') : '',
// headings3: metadata ? extractHeadingsFromCache(metadata, 3).join(' ') : '',
// }
// }

View File

@@ -37,6 +37,7 @@ export type IndexedDocument = {
headings2: string
headings3: string
// TODO: reimplement this
doesNotExist?: boolean
parent?: string
}

View File

@@ -8,10 +8,11 @@ import { loadSettings, settings, SettingsTab, showExcerpt } from './settings'
import { eventBus, EventNames, IndexingStep } from './globals'
import api from './tools/api'
import { isFilePlaintext, wait } from './tools/utils'
import * as NotesIndex from './notes-index'
import * as FileLoader from './file-loader'
import { OmnisearchCache } from './database'
import { cacheManager } from './cache-manager'
import * as NotesIndex from './notes-index'
import { addToIndexAndMemCache } from "./notes-index";
export default class OmnisearchPlugin extends Plugin {
private ribbonButton?: HTMLElement
@@ -186,21 +187,15 @@ async function populateIndex(): Promise<void> {
// Add
await engine.addAllToMinisearch(diffDocs.toAdd)
diffDocs.toAdd.forEach(doc =>
cacheManager.updateLiveDocument(doc.path, doc)
)
// Delete
for (const [i, doc] of diffDocs.toDelete.entries()) {
await wait(0)
engine.removeFromMinisearch(doc)
cacheManager.deleteLiveDocument(doc.path)
for (const pathToDel of diffDocs.toDelete) {
NotesIndex.removeFromIndex(pathToDel)
}
// Update (delete + add)
diffDocs.toUpdate.forEach(({ oldDoc, newDoc }) => {
engine.removeFromMinisearch(oldDoc)
cacheManager.updateLiveDocument(oldDoc.path, newDoc)
NotesIndex.removeFromIndex(oldDoc.path)
})
await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
}

View File

@@ -4,7 +4,9 @@ import { removeAnchors } from './tools/notes'
import { SearchEngine } from './search/search-engine'
import { cacheManager } from './cache-manager'
import type { IndexedDocument } from './globals'
import { fileToIndexedDocument } from './file-loader'
import { getIndexedDocument } from "./file-loader";
const indexedList: Set<string> = new Set()
/**
* Adds a file to the search index
@@ -18,21 +20,14 @@ export async function addToIndexAndMemCache(
return
}
// Check if the file was already indexed as non-existent.
// If so, remove it from the index, and add it again as a real note.
if (cacheManager.getLiveDocument(file.path)?.doesNotExist) {
removeFromIndex(file.path)
}
try {
if (cacheManager.getLiveDocument(file.path)) {
if (indexedList.has(file.path)) {
throw new Error(`${file.basename} is already indexed`)
}
// Make the document and index it
const note = await fileToIndexedDocument(file)
SearchEngine.getEngine().addSingleToMinisearch(note)
await cacheManager.updateLiveDocument(note.path, note)
SearchEngine.getEngine().addSingleToMinisearch(file.path)
indexedList.add(file.path)
} catch (e) {
// console.trace('Error while indexing ' + file.basename)
console.error(e)
@@ -65,8 +60,7 @@ export function addNonExistingToIndex(name: string, parent: string): void {
doesNotExist: true,
parent,
}
SearchEngine.getEngine().addSingleToMinisearch(note)
cacheManager.updateLiveDocument(filename, note)
SearchEngine.getEngine().addSingleToMinisearch(note.path)
}
/**
@@ -77,10 +71,8 @@ export function removeFromIndex(path: string): void {
console.info(`"${path}" is not an indexable file`)
return
}
const note = cacheManager.getLiveDocument(path)
if (note) {
SearchEngine.getEngine().removeFromMinisearch(note)
cacheManager.deleteLiveDocument(path)
if (indexedList.has(path)) {
SearchEngine.getEngine().removeFromMinisearch(path)
// FIXME: only remove non-existing notes if they don't have another parent
// cacheManager

View File

@@ -2,10 +2,10 @@ import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import {
chsRegex,
type IndexedDocument,
IndexingStep,
type ResultNote,
type SearchMatch,
SPACE_OR_PUNCTUATION,
IndexingStep,
} from '../globals'
import {
removeDiacritics,
@@ -17,8 +17,9 @@ import { settings } from '../settings'
import { cacheManager } from '../cache-manager'
import { writable } from 'svelte/store'
import { Notice } from 'obsidian'
import { getIndexedDocument } from '../file-loader'
let previousResults: ResultNote[] = []
let previousResults: SearchResult[] = []
const tokenize = (text: string): string[] => {
const tokens = text.split(SPACE_OR_PUNCTUATION)
@@ -102,7 +103,10 @@ export class SearchEngine {
query: Query,
options: { prefixLength: number }
): Promise<SearchResult[]> {
if (!query.segmentsToStr()) return []
if (query.isEmpty()) {
previousResults = []
return []
}
let results = this.minisearch.search(query.segmentsToStr(), {
prefix: term => term.length >= options.prefixLength,
@@ -116,6 +120,7 @@ export class SearchEngine {
headings3: settings.weightH3,
},
})
if (!results.length) return previousResults
// Downrank files that are in Obsidian's excluded list
if (settings.respectExcluded) {
@@ -124,19 +129,23 @@ export class SearchEngine {
app.metadataCache.isUserIgnored &&
app.metadataCache.isUserIgnored(result.id)
) {
result.score /= 10 // TODO: make this value configurable or toggleable?
result.score /= 10
}
})
}
const documents = await Promise.all(
results.map(async result => await getIndexedDocument(result.id))
)
// If the search query contains quotes, filter out results that don't have the exact match
const exactTerms = query.getExactTerms()
if (exactTerms.length) {
results = results.filter(r => {
const title =
cacheManager.getLiveDocument(r.id)?.path.toLowerCase() ?? ''
const document = documents.find(d => d.path === r.id)
const title = document?.path.toLowerCase() ?? ''
const content = stripMarkdownCharacters(
cacheManager.getLiveDocument(r.id)?.content ?? ''
document?.content ?? ''
).toLowerCase()
return exactTerms.every(q => content.includes(q) || title.includes(q))
})
@@ -147,16 +156,22 @@ export class SearchEngine {
if (exclusions.length) {
results = results.filter(r => {
const content = stripMarkdownCharacters(
cacheManager.getLiveDocument(r.id)?.content ?? ''
documents.find(d => d.path === r.id)?.content ?? ''
).toLowerCase()
return exclusions.every(q => !content.includes(q.value))
})
}
// FIXME:
// Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
return results.filter(
results = results
.filter(
(result, index, arr) => arr.findIndex(t => t.id === result.id) === index
)
.slice(0, 50)
previousResults = results
return results
}
/**
@@ -196,10 +211,6 @@ export class SearchEngine {
query: Query,
options?: Partial<{ singleFilePath: string | null }>
): Promise<ResultNote[]> {
if (query.isEmpty()) {
previousResults = []
return []
}
// Get the raw results
let results: SearchResult[]
if (settings.simpleSearch) {
@@ -207,7 +218,6 @@ export class SearchEngine {
} else {
results = await this.search(query, { prefixLength: 3 })
}
if (!results.length) return previousResults
// Extract tags from the query
const tags = query.segments
@@ -233,9 +243,14 @@ export class SearchEngine {
}
}
// TODO: this already called in search(), pass each document in its SearchResult instead?
const documents = await Promise.all(
results.map(async result => await getIndexedDocument(result.id))
)
// Map the raw results to get usable suggestions
const resultNotes = results.map(result => {
let note = cacheManager.getLiveDocument(result.id)
let note = documents.find(d => d.path === result.id)
if (!note) {
// throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
@@ -278,7 +293,6 @@ export class SearchEngine {
}
return resultNote
})
previousResults = resultNotes
return resultNotes
}
@@ -291,12 +305,12 @@ export class SearchEngine {
await this.minisearch.addAllAsync(documents, { chunkSize })
}
public addSingleToMinisearch(document: IndexedDocument): void {
this.minisearch.add(document)
public addSingleToMinisearch(path: string): void {
getIndexedDocument(path).then(doc => this.minisearch.add(doc))
}
public removeFromMinisearch(document: IndexedDocument): void {
this.minisearch.remove(document)
public removeFromMinisearch(path: string): void {
this.minisearch.discard(path)
}
// #endregion

View File

@@ -1,9 +1,9 @@
import {
type CachedMetadata,
Notice,
Platform,
getAllTags,
Notice,
parseFrontMatterAliases,
Platform,
} from 'obsidian'
import type { SearchMatch } from '../globals'
import {
@@ -207,8 +207,8 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
export function isFileIndexable(path: string): boolean {
return (
(settings.PDFIndexing && path.endsWith('.pdf')) ||
isFilePlaintext(path) ||
(settings.PDFIndexing && isFilePDF(path)) ||
(settings.imagesIndexing && isFileImage(path))
)
}
@@ -219,6 +219,10 @@ export function isFileImage(path: string): boolean {
)
}
export function isFilePDF(path: string): boolean {
return path.endsWith('.pdf')
}
export function isFilePlaintext(path: string): boolean {
return getPlaintextExtensions().some(t => path.endsWith(`.${t}`))
}