Squashed commit of the following:

commit 739f9c349031510e8ef010ba2445a2a1fdbec247
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 16:57:03 2022 +0200

    Code cleaning + README

commit 85762bae592f3eafd34ba22b0cf1841bfbd91ca6
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 14:59:01 2022 +0200

    Cleaning deleted PDFs from cache

commit 1a37bf38d3f64870d4b40df1b67d8106c893ab64
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 16 13:18:06 2022 +0200

    PDF cache saved to IndexedDB
This commit is contained in:
Simon Cambier
2022-10-16 16:58:10 +02:00
parent ad820cb2c9
commit 1c3cc728f6
13 changed files with 184 additions and 124 deletions

View File

@@ -5,12 +5,12 @@ import { deflate, inflate } from 'pako'
import {
notesCacheFilePath,
minisearchCacheFilePath,
type IndexedNote,
type IndexedDocument,
} from './globals'
import { settings } from './settings'
class CacheManager {
notesCache: Record<string, IndexedNote> = {}
notesCache: Record<string, IndexedDocument> = {}
compress = true
writeInterval = 5_000 // In milliseconds
@@ -94,7 +94,7 @@ class CacheManager {
console.log('Omnisearch - Notes cache saved on disk')
}
/**
 * Stores `note` in the in-memory `notesCache` under the key `path`,
 * then calls `saveNotesCache()`.
 *
 * @param path key under which the document is stored
 * @param note document to cache
 */
public addNoteToCache(path: string, note: IndexedNote) {
public addNoteToCache(path: string, note: IndexedDocument) {
this.notesCache[path] = note
this.saveNotesCache()
}
@@ -103,11 +103,11 @@ class CacheManager {
delete this.notesCache[key]
}
/**
 * Looks up a cached document by its key.
 *
 * @param key key the document was stored under in `notesCache`
 * @returns the cached document, or `undefined` when `notesCache` has no entry for `key`
 */
public getNoteFromCache(key: string): IndexedNote | undefined {
public getNoteFromCache(key: string): IndexedDocument | undefined {
return this.notesCache[key]
}
/**
 * Lists the cached documents that are flagged as non-existing.
 *
 * @returns every value of `notesCache` whose `doesNotExist` property is truthy
 */
public getNonExistingNotesFromCache(): IndexedNote[] {
public getNonExistingNotesFromCache(): IndexedDocument[] {
return Object.values(this.notesCache).filter(note => note.doesNotExist)
}

View File

@@ -20,7 +20,7 @@
const debouncedOnInput = debounce(() => {
dispatch('input', value)
}, 100)
}, 250)
</script>
<div class="omnisearch-input-container">

17
src/database.ts Normal file
View File

@@ -0,0 +1,17 @@
import Dexie from 'dexie'
/**
 * Dexie (IndexedDB) database holding the text extracted from PDF files.
 *
 * The database name is built from `app.appId` followed by `_omnisearch`
 * (presumably so that each vault gets its own database — TODO confirm).
 */
class OmnisearchCache extends Dexie {
  /**
   * One row per cached PDF. The primary key (second type argument) is the
   * file `path`; `hash`, `size` and `text` are the other stored properties.
   */
  pdf!: Dexie.Table<
    { path: string; hash: string; size: number; text: string },
    string
  >

  constructor() {
    super(app.appId + '_omnisearch')
    this.version(1).stores({
      // `path` is the primary key, `hash` and `size` are secondary indexes.
      // `text` is still stored with each row but is deliberately NOT listed
      // here: a Dexie schema only names the properties to index, and indexing
      // the full extracted text of every PDF would bloat the database for a
      // lookup that is never performed.
      // NOTE(review): if a build declaring the `text` index under version 1
      // has already shipped, declare this schema as version(2) instead.
      pdf: 'path, hash, size',
    })
  }
}
export const database = new OmnisearchCache()

View File

@@ -14,14 +14,13 @@ export const eventBus = new EventBus()
export const minisearchCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.data`
export const notesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.data`
export const pdfCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
export const historyFilePath = `${app.vault.configDir}/plugins/omnisearch/historyCache.json`
export const EventNames = {
ToggleExcerpts: 'toggle-excerpts',
} as const
export type IndexedNote = {
export type IndexedDocument = {
path: string
basename: string
mtime: number

View File

@@ -9,7 +9,6 @@ import { loadSearchHistory } from './search-history'
import { isFilePlaintext } from './utils'
import * as NotesIndex from './notes-index'
import { cacheManager } from './cache-manager'
import { pdfManager } from './pdf-manager'
function _registerAPI(plugin: OmnisearchPlugin): void {
registerAPI('omnisearch', api, plugin as any)
@@ -25,7 +24,6 @@ export default class OmnisearchPlugin extends Plugin {
await loadSettings(this)
await loadSearchHistory()
await cacheManager.loadNotesCache()
await pdfManager.loadPDFCache()
_registerAPI(this)
@@ -91,7 +89,7 @@ export default class OmnisearchPlugin extends Plugin {
/**
 * Plugin unload hook: logs that PDF indexing is being interrupted and acts
 * on `NotesIndex.pdfQueue`.
 *
 * NOTE(review): this listing shows the removed and the added line of the
 * commit together. `pdfQueue` becomes a `p-limit` limiter in this commit;
 * presumably only `clearQueue()` remains in the resolved file, since
 * `pause()` belongs to the previous PQueue-based queue — confirm.
 */
onunload(): void {
console.log('Omnisearch - Interrupting PDF indexing')
NotesIndex.pdfQueue.pause()
// NOTE(review): per the p-limit documentation, clearQueue() discards the
// functions still waiting to run; it does not cancel the ones already running.
NotesIndex.pdfQueue.clearQueue()
}
addRibbonButton(): void {
@@ -102,17 +100,17 @@ export default class OmnisearchPlugin extends Plugin {
}
/**
 * Deletes obsolete Omnisearch cache files from the plugin's folder inside
 * the vault configuration directory, when they exist.
 *
 * Each removal is wrapped in a try/catch with an empty handler, so a failure
 * to delete a file is silently ignored.
 *
 * NOTE(review): this listing shows the removed and the added lines of the
 * commit together: the two hand-written `exists`/`remove` sequences are the
 * previous body, and the `toDelete` loop (which also covers `pdfCache.data`)
 * is the replacement.
 */
async function cleanOldCacheFiles() {
const oldSearchIndexFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.json`
if (await app.vault.adapter.exists(oldSearchIndexFilePath)) {
try {
await app.vault.adapter.remove(oldSearchIndexFilePath)
} catch (e) {}
}
const oldNnotesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.json`
if (await app.vault.adapter.exists(oldNnotesCacheFilePath)) {
try {
await app.vault.adapter.remove(oldNnotesCacheFilePath)
} catch (e) {}
// Paths of the legacy cache files to remove
const toDelete = [
`${app.vault.configDir}/plugins/omnisearch/searchIndex.json`,
`${app.vault.configDir}/plugins/omnisearch/notesCache.json`,
`${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
]
for (const item of toDelete) {
// Only attempt the removal when the file is actually there
if (await app.vault.adapter.exists(item)) {
try {
await app.vault.adapter.remove(item)
} catch (e) {}
}
}
}

View File

@@ -4,23 +4,19 @@ import {
getAliasesFromMetadata,
getTagsFromMetadata,
isFileIndexable,
isFilePlaintext,
removeDiacritics,
wait,
} from './utils'
import { getNonExistingNotes, removeAnchors } from './notes'
import * as PDF from './pdf-manager'
import type { IndexedNote } from './globals'
import { pdfManager } from './pdf-manager'
import type { IndexedDocument } from './globals'
import { settings } from './settings'
import * as Search from './search'
import PQueue from 'p-queue-compat'
// import PQueue from 'p-queue-compat'
import pLimit from 'p-limit'
import { cacheManager } from './cache-manager'
let isIndexChanged: boolean
export const pdfQueue = new PQueue({
concurrency: settings.backgroundProcesses,
})
export const pdfQueue = pLimit(settings.backgroundProcesses)
/**
* Adds a file to the index
@@ -59,14 +55,14 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
let content
if (file.path.endsWith('.pdf')) {
content = removeDiacritics(await PDF.pdfManager.getPdfText(file as TFile))
content = removeDiacritics(await pdfManager.getPdfText(file as TFile))
} else {
// Fetch content from the cache to index it as-is
content = removeDiacritics(await app.vault.cachedRead(file))
}
// Make the document and index it
const note: IndexedNote = {
const note: IndexedDocument = {
basename: removeDiacritics(file.basename),
content,
path: file.path,
@@ -86,7 +82,6 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
}
Search.minisearchInstance.add(note)
isIndexChanged = true
cacheManager.addNoteToCache(note.path, note)
} catch (e) {
console.trace('Error while indexing ' + file.basename)
@@ -105,12 +100,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
const filename = name + (name.endsWith('.md') ? '' : '.md')
if (cacheManager.getNoteFromCache(filename)) return
const note = {
const note: IndexedDocument = {
path: filename,
basename: name,
mtime: 0,
content: '',
tags: [],
aliases: '',
headings1: '',
headings2: '',
@@ -118,9 +114,8 @@ export function addNonExistingToIndex(name: string, parent: string): void {
doesNotExist: true,
parent,
} as IndexedNote
}
Search.minisearchInstance.add(note)
isIndexChanged = true
cacheManager.addNoteToCache(filename, note)
}
@@ -129,14 +124,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
* @param path
*/
export function removeFromIndex(path: string): void {
if (!isFilePlaintext(path)) {
if (!isFileIndexable(path)) {
console.info(`"${path}" is not an indexable file`)
return
}
const note = cacheManager.getNoteFromCache(path)
if (note) {
Search.minisearchInstance.remove(note)
isIndexChanged = true
cacheManager.removeNoteFromCache(path)
cacheManager
.getNonExistingNotesFromCache()
@@ -175,21 +169,30 @@ export async function indexPDFs() {
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
console.time('PDF Indexing')
console.log(`Omnisearch - Indexing ${files.length} PDFs`)
const input = []
for (const file of files) {
if (cacheManager.getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
pdfQueue.add(async () => {
await addToIndexAndCache(file)
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
})
input.push(
pdfQueue(async () => {
await addToIndexAndCache(file)
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
})
)
// pdfQueue.add(async () => {
// await addToIndexAndCache(file)
// await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
// })
}
await pdfQueue.onEmpty()
await Promise.all(input)
// await pdfQueue.onEmpty()
console.timeEnd('PDF Indexing')
if (settings.showIndexingNotices) {
new Notice(`Omnisearch - Indexed ${files.length} PDFs`)
}
await pdfManager.cleanCache()
}
}

View File

@@ -1,51 +1,58 @@
import type { TFile } from 'obsidian'
import PQueue from 'p-queue-compat'
import PDFWorker from 'web-worker:./pdf-worker.ts'
import { pdfCacheFilePath } from './globals'
import { deflate, inflate } from 'pako'
import { makeMD5 } from './utils'
import { database } from './database'
/**
 * Extracts the text content of PDF files with a web worker and caches the
 * result so that a given PDF is only processed once.
 *
 * NOTE(review): this listing shows the removed and the added lines of the
 * commit together. The in-memory `cache` map, `serializeQueue`,
 * `loadPDFCache` and `updatePDFCache` appear to be the previous file-based
 * implementation (`pdfCacheFilePath`); the `database.pdf` calls appear to be
 * its replacement — confirm against the resolved file.
 */
class PDFManager {
private cache: Map<string, { content: string }> = new Map()
private serializeQueue = new PQueue({ concurrency: 1 })
/**
 * Loads the cache from `pdfCacheFilePath` if that file exists: the binary
 * content is inflated, decoded as UTF-8 and parsed as JSON into `cache`.
 * On any error the error is logged and `cache` is reset to an empty map.
 */
public async loadPDFCache(): Promise<void> {
if (await app.vault.adapter.exists(pdfCacheFilePath)) {
try {
const data = await app.vault.adapter.readBinary(pdfCacheFilePath)
const json = new TextDecoder('utf8').decode(inflate(data))
this.cache = new Map(JSON.parse(json))
} catch (e) {
console.error(e)
this.cache = new Map()
}
}
}
/**
 * Returns the text of a PDF, from the cache when possible.
 *
 * @param file the PDF file to read
 * @returns the cached text, or the freshly extracted text once it has been stored
 */
public async getPdfText(file: TFile): Promise<string> {
// 1) Check if we can find by path & size
const docByPath = await database.pdf.get({
path: file.path,
size: file.stat.size,
})
if (docByPath) {
return docByPath.text
}
// 2) Check by hash
const data = new Uint8Array(await app.vault.readBinary(file))
const hash = makeMD5(data)
if (this.cache.has(hash)) {
return this.cache.get(hash)!.content
// NOTE(review): Dexie's `get()` with a bare value looks up the PRIMARY key,
// and the schema in database.ts declares `path` as the primary key. A hash
// will presumably never match here; `get({ hash })` looks like the intent
// — confirm.
const docByHash = await database.pdf.get(hash)
if (docByHash) {
return docByHash.text
}
// 3) The PDF is not cached, extract it
const worker = new PDFWorker({ name: 'PDF Text Extractor' })
// NOTE(review): `reject` is never called and no `worker.onerror` handler is
// set in this block, so a failed extraction or a rejected `database.pdf.add`
// leaves this promise pending. The worker is not terminated here either.
return new Promise(async (resolve, reject) => {
// @ts-ignore
// NOTE(review): the next line is a bare expression statement; it has no effect.
file.stat.size
worker.postMessage({ data, name: file.basename })
worker.onmessage = (evt: any) => {
const txt = evt.data.text
this.updatePDFCache(hash, txt)
resolve(txt)
const text = (evt.data.text as string)
// Replace \n with spaces
.replace(/\n/g, ' ')
// Trim multiple spaces
.replace(/ +/g, ' ')
.trim()
// Store the normalized text, then resolve once the row has been added
database.pdf
.add({ hash, text, path: file.path, size: file.stat.size })
.then(() => {
resolve(text)
})
}
})
}
/**
 * Queues a task on `serializeQueue` (concurrency 1) that records `content`
 * under `hash` in `cache`, then writes the whole map, serialized as JSON and
 * deflated, to `pdfCacheFilePath`.
 */
private async updatePDFCache(hash: string, content: string): Promise<void> {
this.serializeQueue.add(() => {
this.cache.set(hash, { content })
const data = deflate(JSON.stringify(Array.from(this.cache), null, 1))
app.vault.adapter.writeBinary(pdfCacheFilePath, data as any)
/**
* Removes the outdated cache entries
*
* NOTE(review): as shown here, the body only logs the paths of cached
* entries whose file no longer exists; nothing is deleted from
* `database.pdf`, and the promise returned by `each()` is not awaited.
*/
public async cleanCache(): Promise<void> {
database.pdf.each(async item => {
if (!(await app.vault.adapter.exists(item.path))) {
console.log(item.path + ' does not exist')
}
})
}
}

View File

@@ -2,33 +2,25 @@ import { Notice } from 'obsidian'
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
import {
chsRegex,
type IndexedNote,
type IndexedDocument,
type ResultNote,
minisearchCacheFilePath,
type SearchMatch,
SPACE_OR_PUNCTUATION,
} from './globals'
import {
isFileIndexable,
isFilePlaintext,
removeDiacritics,
stringsToRegex,
stripMarkdownCharacters,
wait,
} from './utils'
import type { Query } from './query'
import { settings } from './settings'
// import {
// getNoteFromCache,
// isCacheOutdated,
// loadNotesCache,
// resetNotesCache,
// } from './notes'
import * as NotesIndex from './notes-index'
import PQueue from 'p-queue-compat'
import pLimit from 'p-limit'
import { cacheManager } from './cache-manager'
export let minisearchInstance: MiniSearch<IndexedNote>
export let minisearchInstance: MiniSearch<IndexedDocument>
const tokenize = (text: string): string[] => {
const tokens = text.split(SPACE_OR_PUNCTUATION)
@@ -46,7 +38,7 @@ const tokenize = (text: string): string[] => {
* and adds all the notes to the index
*/
export async function initGlobalSearchIndex(): Promise<void> {
const options: Options<IndexedNote> = {
const options: Options<IndexedDocument> = {
tokenize,
processTerm: (term: string) =>
(settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
@@ -106,15 +98,16 @@ export async function initGlobalSearchIndex(): Promise<void> {
}
// Read and index all the files into the search engine
const queue = new PQueue({ concurrency: 10 })
const queue = pLimit(10)
const input = []
for (const file of files) {
if (cacheManager.getNoteFromCache(file.path)) {
NotesIndex.removeFromIndex(file.path)
}
queue.add(() => NotesIndex.addToIndexAndCache(file))
input.push(queue(() => NotesIndex.addToIndexAndCache(file)))
}
await queue.onEmpty()
await Promise.all(input)
if (files.length > 0) {
const message = `Omnisearch - Indexed ${files.length} ${notesSuffix} in ${

View File

@@ -173,12 +173,15 @@ export class SettingsTab extends PluginSettingTab {
})
)
// PDF Indexing - not available on mobile
// PDF Indexing
const indexPDFsDesc = new DocumentFragment()
indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will include PDFs in search results.
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.<br>
<ul>
<li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
<li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
<li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
</ul>
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)

View File

@@ -19,6 +19,10 @@ declare module 'obsidian' {
interface Vault {
  /**
   * Declares `Vault.getConfig`, which the typings augmented here do not expose.
   * Presumably returns the vault configuration value stored under `key`
   * — TODO confirm against Obsidian's behavior.
   *
   * @param key name of the configuration entry to read
   * @returns the raw value; typed `unknown`, so callers must narrow it before use
   */
  getConfig(key: string): unknown
}
interface App {
/**
 * Identifier of the running app instance, declared here because the plugin
 * reads `app.appId` to build the name of its IndexedDB database.
 * NOTE(review): presumably unique per vault — confirm.
 */
appId: string
}
}