Refactor: split the text extractor into a separate package (obsidian-text-extract).

Simon Cambier
2022-11-05 23:18:28 +01:00
parent 74db2844a9
commit e6c113d83b
8 changed files with 818 additions and 617 deletions


@@ -25,11 +25,10 @@
     "babel-jest": "^27.5.1",
     "builtin-modules": "^3.3.0",
     "esbuild": "0.13.12",
-    "esbuild-plugin-copy": "^1.3.0",
-    "esbuild-svelte": "^0.7.1",
+    "esbuild-plugin-copy": "1.3.0",
+    "esbuild-svelte": "0.7.1",
     "jest": "^27.5.1",
     "obsidian": "latest",
-    "obsidian-text-extract": "link:C:/Dev/Obsidian/obsidian-text-extract/dist",
     "prettier": "^2.7.1",
     "prettier-plugin-svelte": "^2.8.0",
     "svelte": "^3.51.0",
@@ -45,6 +44,7 @@
     "dexie": "^3.2.2",
     "lodash-es": "4.17.21",
     "minisearch": "github:scambier/minisearch#callback_desync",
+    "obsidian-text-extract": "1.0.1",
     "p-limit": "^4.0.0",
     "pure-md5": "^0.1.14"
   },
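
Note on the dependency change above: the text extractor is no longer consumed through a local "link:" path but as the published obsidian-text-extract package. A minimal sketch of what the consuming side presumably looks like after the switch, assuming the package exposes the getPdfText helper that the file loader further down keeps calling (the import line itself is not part of this diff):

    // Hedged sketch: how the plugin is assumed to consume the published package.
    // The exact export surface of obsidian-text-extract is not shown in this commit.
    import type { TFile } from 'obsidian'
    import { getPdfText } from 'obsidian-text-extract'

    export async function extractPdfText(file: TFile): Promise<string> {
      // PDF parsing (and any caching it does) is delegated to the external library.
      return getPdfText(file)
    }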

pnpm-lock.yaml (generated): 1325 lines changed; diff suppressed because it is too large.


@@ -3,7 +3,7 @@ import type { IndexedDocument } from './globals'
 import { database } from './database'
 import MiniSearch from 'minisearch'
 import { minisearchOptions } from './search/search-engine'
-import { makeMD5, wait } from './tools/utils'
+import { makeMD5 } from './tools/utils'
 import { settings } from './settings'
 class CacheManager {
@@ -121,28 +121,34 @@ class CacheManager {
   public async getDiffDocuments(documents: IndexedDocument[]): Promise<{
     toDelete: IndexedDocument[]
     toAdd: IndexedDocument[]
-    toUpdate: { old: IndexedDocument; new: IndexedDocument }[]
+    toUpdate: { oldDoc: IndexedDocument; newDoc: IndexedDocument }[]
   }> {
     let cachedDocs = await database.documents.toArray()
+    // present in `documents` but not in `cachedDocs`
     const toAdd = documents.filter(
       d => !cachedDocs.find(c => c.path === d.path)
     )
+    // present in `cachedDocs` but not in `documents`
     const toDelete = cachedDocs
       .filter(c => !documents.find(d => d.path === c.path))
       .map(d => d.document)
+    // toUpdate: same path, but different mtime
     const toUpdate = cachedDocs
-      .filter(c =>
-        documents.find(d => d.path === c.path && d.mtime !== c.mtime)
+      .filter(({ mtime: cMtime, path: cPath }) =>
+        documents.some(
+          ({ mtime: dMtime, path: dPath }) =>
+            cPath === dPath && dMtime !== cMtime
+        )
       )
       .map(c => ({
-        old: c.document,
-        new: documents.find(d => d.path === c.path)!,
+        oldDoc: c.document,
+        newDoc: documents.find(d => d.path === c.path)!,
       }))
     return {
+      toDelete,
       toAdd,
-      toDelete,
       toUpdate,
     }
   }
@@ -167,9 +173,9 @@ class CacheManager {
     // console.log(`Omnisearch - Cache - Will update ${toUpdate.length} documents`)
     await database.documents.bulkPut(
       toUpdate.map(o => ({
-        document: o.new,
-        mtime: o.new.mtime,
-        path: o.new.path,
+        document: o.newDoc,
+        mtime: o.newDoc.mtime,
+        path: o.newDoc.path,
       }))
     )
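
To make the hunks above easier to follow: getDiffDocuments does a three-way comparison keyed on path, and mtime decides which cached entries count as updates. A self-contained sketch of the same logic with simplified types (an illustration, not the plugin's actual class):

    // Simplified illustration of the add / delete / update split used above.
    type Doc = { path: string; mtime: number }

    function diffDocuments(cached: Doc[], current: Doc[]) {
      // present in `current` but not yet cached
      const toAdd = current.filter(d => !cached.some(c => c.path === d.path))
      // cached but no longer present
      const toDelete = cached.filter(c => !current.some(d => d.path === c.path))
      // same path, different mtime
      const toUpdate = cached
        .filter(c => current.some(d => d.path === c.path && d.mtime !== c.mtime))
        .map(c => ({ oldDoc: c, newDoc: current.find(d => d.path === c.path)! }))
      return { toDelete, toAdd, toUpdate }
    }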


@@ -2,7 +2,6 @@ import { App, Modal, TFile } from 'obsidian'
 import ModalVault from './ModalVault.svelte'
 import ModalInFile from './ModalInFile.svelte'
 import { eventBus, EventNames, isInputComposition } from '../globals'
-import { settings } from '../settings'
 abstract class OmnisearchModal extends Modal {
   protected constructor(app: App) {


@@ -32,7 +32,6 @@ export class OmnisearchCache extends Dexie {
   //#region Table declarations
-  pdf!: Dexie.Table<{ path: string; hash: string; text: string }, string>
   documents!: Dexie.Table<
     { path: string; mtime: number; document: IndexedDocument },
     string
@@ -56,7 +55,6 @@ export class OmnisearchCache extends Dexie {
     super(OmnisearchCache.dbName)
     // Database structure
     this.version(OmnisearchCache.dbVersion).stores({
-      pdf: 'path, hash, size',
      searchHistory: '++id',
      documents: 'path',
      minisearch: 'date',
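
For reference, the two hunks above drop the dedicated pdf table, presumably because cached PDF text is now handled by the extracted library. A hedged sketch of what the remaining schema amounts to, reassembled from the context lines above (the real class has more members and its own dbName/dbVersion):

    // Approximate post-change Dexie schema, rebuilt from the context lines above.
    import Dexie from 'dexie'

    class CacheSketch extends Dexie {
      constructor() {
        super('omnisearch-cache-sketch') // hypothetical name, not the plugin's real dbName
        this.version(1).stores({
          searchHistory: '++id',
          documents: 'path',
          minisearch: 'date',
        })
      }
    }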


@@ -32,22 +32,16 @@ export async function getPlainTextFiles(): Promise<IndexedDocument[]> {
  * If a PDF isn't cached, it will be read from the disk and added to the IndexedDB
  */
 export async function getPDFFiles(): Promise<IndexedDocument[]> {
-  const fromDisk = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
-  const fromDb = await database.pdf.toArray()
+  const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
   const data: IndexedDocument[] = []
   const input = []
-  for (const file of fromDisk) {
-    input.push(
-      NotesIndex.processQueue(async () => {
-        const doc = await fileToIndexedDocument(
-          file,
-          fromDb.find(o => o.path === file.path)?.text
-        )
-        await cacheManager.updateLiveDocument(file.path, doc)
-        data.push(doc)
-      })
-    )
+  for (const file of files) {
+    input.push(new Promise(async (resolve, reject) => {
+      const doc = await fileToIndexedDocument(file)
+      await cacheManager.updateLiveDocument(file.path, doc)
+      data.push(doc)
+      return resolve(null)
+    }))
   }
   await Promise.all(input)
   return data
@@ -56,21 +50,17 @@ export async function getPDFFiles(): Promise<IndexedDocument[]> {
 /**
  * Convert a file into an IndexedDocument.
  * Will use the cache if possible.
- * @param file
- * @param content If we give a text content, will skip the fetching part
  */
 export async function fileToIndexedDocument(
   file: TFile,
-  content?: string
 ): Promise<IndexedDocument> {
-  if (!content) {
-    if (isFilePlaintext(file.path)) {
-      content = await app.vault.cachedRead(file)
-    } else if (file.path.endsWith('.pdf')) {
-      content = await getPdfText(file)
-    } else {
-      throw new Error('Invalid file: ' + file.path)
-    }
+  let content: string
+  if (isFilePlaintext(file.path)) {
+    content = await app.vault.cachedRead(file)
+  } else if (file.path.endsWith('.pdf')) {
+    content = await getPdfText(file)
+  } else {
+    throw new Error('Invalid file: ' + file.path)
   }
   content = removeDiacritics(content)
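
After this hunk, callers of fileToIndexedDocument can no longer hand in pre-extracted text; the function resolves the content itself from the file extension. A minimal usage sketch under that assumption (indexSinglePdf is a hypothetical helper, not code from this commit):

    // Usage sketch for the simplified one-argument signature.
    async function indexSinglePdf(path: string): Promise<void> {
      const file = app.vault.getFiles().find(f => f.path === path)
      if (!file || !file.path.endsWith('.pdf')) return
      // Content is now resolved inside fileToIndexedDocument; no second argument.
      const doc = await fileToIndexedDocument(file)
      await cacheManager.updateLiveDocument(file.path, doc)
    }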


@@ -105,7 +105,7 @@ export default class OmnisearchPlugin extends Plugin {
  * Read the files and feed them to Minisearch
  */
 async function populateIndex(): Promise<void> {
-  console.time('Omnisearch - Indexing duration')
+  console.time('Omnisearch - Indexing total time')
   // Initialize minisearch
   let engine = SearchEngine.getEngine()
@@ -116,6 +116,7 @@
   }
   // Load plaintext files
+  console.log('Omnisearch - Fetching notes')
   const plainTextFiles = await FileLoader.getPlainTextFiles()
   let allFiles = [...plainTextFiles]
   // iOS: since there's no cache, directly index the documents
@@ -126,6 +127,7 @@
   // Load PDFs
   if (settings.PDFIndexing) {
+    console.log('Omnisearch - Fetching PDFs')
     const pdfs = await FileLoader.getPDFFiles()
     // iOS: since there's no cache, just index the documents
     if (Platform.isIosApp) {
@@ -136,37 +138,52 @@
     allFiles = [...allFiles, ...pdfs]
   }
+  console.log('Omnisearch - Total number of files: ' + allFiles.length)
+  let needToUpdateCache = false
   // Other platforms: make a diff of what's to add/update/delete
   if (!Platform.isIosApp) {
+    console.log('Omnisearch - Checking index cache diff...')
     // Check which documents need to be removed/added/updated
     const diffDocs = await cacheManager.getDiffDocuments(allFiles)
+    needToUpdateCache = !!(
+      diffDocs.toAdd.length ||
+      diffDocs.toDelete.length ||
+      diffDocs.toUpdate.length
+    )
     // Add
     await engine.addAllToMinisearch(diffDocs.toAdd)
+    console.log(`Omnisearch - ${diffDocs.toAdd.length} files to add`)
     diffDocs.toAdd.forEach(doc =>
       cacheManager.updateLiveDocument(doc.path, doc)
     )
     // Delete
+    console.log(`Omnisearch - ${diffDocs.toDelete.length} files to remove`)
    diffDocs.toDelete.forEach(d => engine.removeFromMinisearch(d))
    diffDocs.toDelete.forEach(doc => cacheManager.deleteLiveDocument(doc.path))
     // Update (delete + add)
+    console.log(`Omnisearch - ${diffDocs.toUpdate.length} files to update`)
     diffDocs.toUpdate
-      .map(d => d.old)
-      .forEach(d => {
-        engine.removeFromMinisearch(d)
-        cacheManager.updateLiveDocument(d.path, d)
+      .forEach(({ oldDoc, newDoc }) => {
+        engine.removeFromMinisearch(oldDoc)
+        cacheManager.updateLiveDocument(oldDoc.path, newDoc)
       })
-    await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.new))
+    await engine.addAllToMinisearch(diffDocs.toUpdate.map(d => d.newDoc))
   }
   // Load PDFs into the main search engine, and write cache
   // SearchEngine.loadTmpDataIntoMain()
   SearchEngine.isIndexing.set(false)
-  if (!Platform.isIosApp) {
+  if (!Platform.isIosApp && needToUpdateCache) {
+    console.log('Omnisearch - Writing cache...')
     await SearchEngine.getEngine().writeToCache(allFiles)
   }
-  console.timeEnd('Omnisearch - Indexing duration')
+  console.timeEnd('Omnisearch - Indexing total time')
 }
 async function cleanOldCacheFiles() {


@@ -156,7 +156,7 @@ export async function filterAsync<T>(
  * @returns
  */
 export function stripMarkdownCharacters(text: string): string {
-  return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (match, p1, p2) => p2)
+  return text.replace(/(\*|_)+(.+?)(\*|_)+/g, (_match, _p1, p2) => p2)
 }
 export function getAliasesFromMetadata(
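
Since only the unused callback parameters were renamed here, the behaviour of stripMarkdownCharacters is unchanged: runs of asterisks and underscores around the captured inner text are dropped. A quick illustrative check (hypothetical snippet, not part of the commit):

    // Hypothetical sanity check: renaming the unused parameters does not change the output.
    const stripped = '**bold** and _italic_ text'.replace(
      /(\*|_)+(.+?)(\*|_)+/g,
      (_match, _p1, p2) => p2
    )
    console.log(stripped) // -> "bold and italic text"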