The refactor splitting out the text extractor is done.

This commit is contained in:
Simon Cambier
2022-11-05 23:18:28 +01:00
parent 74db2844a9
commit e6c113d83b
8 changed files with 818 additions and 617 deletions

View File

@@ -3,7 +3,7 @@ import type { IndexedDocument } from './globals'
import { database } from './database'
import MiniSearch from 'minisearch'
import { minisearchOptions } from './search/search-engine'
import { makeMD5, wait } from './tools/utils'
import { makeMD5 } from './tools/utils'
import { settings } from './settings'
class CacheManager {
@@ -121,28 +121,34 @@ class CacheManager {
public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
toDelete: IndexedDocument[]
toAdd: IndexedDocument[]
toUpdate: { old: IndexedDocument; new: IndexedDocument }[]
toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
}> {
let cachedDocs = await database.documents.toArray()
// present in `documents` but not in `cachedDocs`
const toAdd = documents.filter(
d => !cachedDocs.find(c => c.path === d.path)
)
// present in `cachedDocs` but not in `documents`
const toDelete = cachedDocs
.filter(c => !documents.find(d => d.path === c.path))
.map(d => d.document)
// toUpdate: same path, but different mtime
const toUpdate = cachedDocs
.filter(c =>
documents.find(d => d.path === c.path && d.mtime !== c.mtime)
.filter(({ mtime: cMtime, path: cPath }) =>
documents.some(
({ mtime: dMtime, path: dPath }) =>
cPath === dPath && dMtime !== cMtime
)
)
.map(c => ({
old: c.document,
new: documents.find(d => d.path === c.path)!,
oldDoc: c.document,
newDoc: documents.find(d => d.path === c.path)!,
}))
return {
toDelete,
toAdd,
toDelete,
toUpdate,
}
}
@@ -167,9 +173,9 @@ class CacheManager {
// console.log(`Omnisearch - Cache - Will update ${toUpdate.length} documents`)
await database.documents.bulkPut(
toUpdate.map(o => ({
document: o.new,
mtime: o.new.mtime,
path: o.new.path,
document: o.newDoc,
mtime: o.newDoc.mtime,
path: o.newDoc.path,
}))
)

View File

@@ -2,7 +2,6 @@ import { App, Modal, TFile } from 'obsidian'
import ModalVault from './ModalVault.svelte'
import ModalInFile from './ModalInFile.svelte'
import { eventBus, EventNames, isInputComposition } from '../globals'
import { settings } from '../settings'
abstract class OmnisearchModal extends Modal {
protected constructor(app: App) {

View File

@@ -32,7 +32,6 @@ export class OmnisearchCache extends Dexie {
//#region Table declarations
pdf!: Dexie.Table<{ path: string; hash: string; text: string }, string>
documents!: Dexie.Table<
{ path: string; mtime: number; document: IndexedDocument },
string
@@ -56,7 +55,6 @@ export class OmnisearchCache extends Dexie {
super(OmnisearchCache.dbName)
// Database structure
this.version(OmnisearchCache.dbVersion).stores({
pdf: 'path, hash, size',
searchHistory: '++id',
documents: 'path',
minisearch: 'date',

View File

@@ -32,22 +32,16 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
* If a PDF isn't cached, it will be read from the disk and added to the IndexedDB
*/
export async function getPDFFiles(): Promise<IndexedDocument[]> {
const fromDisk = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
const fromDb = await database.pdf.toArray()
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
const data: IndexedDocument[] = []
const input = []
for (const file of fromDisk) {
input.push(
NotesIndex.processQueue(async () => {
const doc = await fileToIndexedDocument(
file,
fromDb.find(o => o.path === file.path)?.text
)
await cacheManager.updateLiveDocument(file.path, doc)
data.push(doc)
})
)
for (const file of files) {
input.push(new Promise(async (resolve, reject) => {
const doc = await fileToIndexedDocument(file)
await cacheManager.updateLiveDocument(file.path, doc)
data.push(doc)
return resolve(null)
}))
}
await Promise.all(input)
return data
@@ -56,21 +50,17 @@ export async function getPDFFiles(): Promise<IndexedDocument[]> {
/**
* Convert a file into an IndexedDocument.
* Will use the cache if possible.
* @param file
* @param content If we give a text content, will skip the fetching part
*/
export async function fileToIndexedDocument(
file: TFile,
content?: string
): Promise<IndexedDocument> {
if (!content) {
if (isFilePlaintext(file.path)) {
content = await app.vault.cachedRead(file)
} else if (file.path.endsWith('.pdf')) {
content = await getPdfText(file)
} else {
throw new Error('Invalid file: ' + file.path)
}
let content: string
if (isFilePlaintext(file.path)) {
content = await app.vault.cachedRead(file)
} else if (file.path.endsWith('.pdf')) {
content = await getPdfText(file)
} else {
throw new Error('Invalid file: ' + file.path)
}
content = removeDiacritics(content)

View File

@@ -105,7 +105,7 @@ export default class OmnisearchPlugin extends Plugin {
* Read the files and feed them to Minisearch
*/
async function populateIndex(): Promise<void> {
console.time('Omnisearch - Indexing duration')
console.time('Omnisearch - Indexing total time')
// Initialize minisearch
let engine = SearchEngine.getEngine()
@@ -116,6 +116,7 @@ async function populateIndex(): Promise<void> {
}
// Load plaintext files
console.log('Omnisearch - Fetching notes')
const plainTextFiles = await FileLoader.getPlainTextFiles()
let allFiles = [...plainTextFiles]
// iOS: since there's no cache, directly index the documents
@@ -126,6 +127,7 @@ async function populateIndex(): Promise<void> {
// Load PDFs
if (settings.PDFIndexing) {
console.log('Omnisearch - Fetching PDFs')
const pdfs = await FileLoader.getPDFFiles()
// iOS: since there's no cache, just index the documents
if (Platform.isIosApp) {
@@ -136,37 +138,52 @@ async function populateIndex(): Promise<void> {
allFiles = [...allFiles, ...pdfs]
}
console.log('Omnisearch - Total number of files: ' + allFiles.length)
let needToUpdateCache = false
// Other platforms: make a diff of what's to add/update/delete
if (!Platform.isIosApp) {
console.log('Omnisearch - Checking index cache diff...')
// Check which documents need to be removed/added/updated
const diffDocs = await cacheManager.getDiffDocuments(allFiles)
needToUpdateCache = !!(
diffDocs.toAdd.length ||
diffDocs.toDelete.length ||
diffDocs.toUpdate.length
)
// Add
await engine.addAllToMinisearch(diffDocs.toAdd)
console.log(`Omnisearch - ${diffDocs.toAdd.length} files to add`)
diffDocs.toAdd.forEach(doc =>
cacheManager.updateLiveDocument(doc.path, doc)
)
// Delete
console.log(`Omnisearch - ${diffDocs.toDelete.length} files to remove`)
diffDocs.toDelete.forEach(d => engine.removeFromMinisearch(d))
diffDocs.toDelete.forEach(doc => cacheManager.deleteLiveDocument(doc.path))
// Update (delete + add)
console.log(`Omnisearch - ${diffDocs.toUpdate.length} files to update`)
diffDocs.toUpdate
.map(d => d.old)
.forEach(d => {
engine.removeFromMinisearch(d)
cacheManager.updateLiveDocument(d.path, d)
.forEach(({ oldDoc, newDoc }) => {
engine.removeFromMinisearch(oldDoc)
cacheManager.updateLiveDocument(oldDoc.path, newDoc)
})
await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.new))
await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
}
// Load PDFs into the main search engine, and write cache
// SearchEngine.loadTmpDataIntoMain()
SearchEngine.isIndexing.set(false)
if (!Platform.isIosApp) {
if (!Platform.isIosApp && needToUpdateCache) {
console.log('Omnisearch - Writing cache...')
await SearchEngine.getEngine().writeToCache(allFiles)
}
console.timeEnd('Omnisearch - Indexing duration')
console.timeEnd('Omnisearch - Indexing total time')
}
async function cleanOldCacheFiles() {

View File

@@ -156,7 +156,7 @@ export async function filterAsync<T>(
* @returns
*/
export function stripMarkdownCharacters(text: string): string {
return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (match, p1, p2) => p2)
return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (_match, _p1, p2) => p2)
}
export function getAliasesFromMetadata(