Squashed commit of the following:
commit 739f9c349031510e8ef010ba2445a2a1fdbec247 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Sun Oct 16 16:57:03 2022 +0200 Code cleaning + README commit 85762bae592f3eafd34ba22b0cf1841bfbd91ca6 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Sun Oct 16 14:59:01 2022 +0200 Cleaning deleted PDFs from cache commit 1a37bf38d3f64870d4b40df1b67d8106c893ab64 Author: Simon Cambier <simon.cambier@protonmail.com> Date: Sun Oct 16 13:18:06 2022 +0200 PDF cache saved to IndexedDB
This commit is contained in:
@@ -5,12 +5,12 @@ import { deflate, inflate } from 'pako'
|
||||
import {
|
||||
notesCacheFilePath,
|
||||
minisearchCacheFilePath,
|
||||
type IndexedNote,
|
||||
type IndexedDocument,
|
||||
} from './globals'
|
||||
import { settings } from './settings'
|
||||
|
||||
class CacheManager {
|
||||
notesCache: Record<string, IndexedNote> = {}
|
||||
notesCache: Record<string, IndexedDocument> = {}
|
||||
compress = true
|
||||
writeInterval = 5_000 // In milliseconds
|
||||
|
||||
@@ -94,7 +94,7 @@ class CacheManager {
|
||||
console.log('Omnisearch - Notes cache saved on disk')
|
||||
}
|
||||
|
||||
public addNoteToCache(path: string, note: IndexedNote) {
|
||||
public addNoteToCache(path: string, note: IndexedDocument) {
|
||||
this.notesCache[path] = note
|
||||
this.saveNotesCache()
|
||||
}
|
||||
@@ -103,11 +103,11 @@ class CacheManager {
|
||||
delete this.notesCache[key]
|
||||
}
|
||||
|
||||
public getNoteFromCache(key: string): IndexedNote | undefined {
|
||||
public getNoteFromCache(key: string): IndexedDocument | undefined {
|
||||
return this.notesCache[key]
|
||||
}
|
||||
|
||||
public getNonExistingNotesFromCache(): IndexedNote[] {
|
||||
public getNonExistingNotesFromCache(): IndexedDocument[] {
|
||||
return Object.values(this.notesCache).filter(note => note.doesNotExist)
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
const debouncedOnInput = debounce(() => {
|
||||
dispatch('input', value)
|
||||
}, 100)
|
||||
}, 250)
|
||||
</script>
|
||||
|
||||
<div class="omnisearch-input-container">
|
||||
|
||||
17
src/database.ts
Normal file
17
src/database.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import Dexie from 'dexie'
|
||||
|
||||
/**
 * Dexie (IndexedDB) database that holds Omnisearch's PDF cache.
 *
 * It declares a single table, `pdf`, whose rows carry `path`, `hash`, `size`
 * and `text` fields and whose key type is `string`.
 */
class OmnisearchCache extends Dexie {
|
||||
// Table handle; the `!` tells the compiler it is assigned outside the
// constructor body (presumably by Dexie once the schema below is declared).
pdf!: Dexie.Table<
|
||||
{ path: string; hash: string; size: number; text: string },
|
||||
string
|
||||
>
|
||||
|
||||
constructor() {
|
||||
// The database name is derived from `app.appId`, suffixed with '_omnisearch'.
super(app.appId + '_omnisearch')
|
||||
// Schema version 1. In Dexie's schema string the first name is the primary
// key (`path`) and the remaining names are secondary indexes.
// NOTE(review): `text` is declared as an index; it presumably holds the full
// extracted PDF text, which likely does not need indexing - confirm.
// NOTE(review): because the primary key is `path`, a bare `pdf.get(someString)`
// looks rows up by path; a lookup by hash would need `pdf.get({ hash })` -
// verify against the callers.
this.version(1).stores({
|
||||
pdf: 'path, hash, size, text',
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
export const database = new OmnisearchCache()
|
||||
@@ -14,14 +14,13 @@ export const eventBus = new EventBus()
|
||||
|
||||
export const minisearchCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.data`
|
||||
export const notesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.data`
|
||||
export const pdfCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
|
||||
export const historyFilePath = `${app.vault.configDir}/plugins/omnisearch/historyCache.json`
|
||||
|
||||
export const EventNames = {
|
||||
ToggleExcerpts: 'toggle-excerpts',
|
||||
} as const
|
||||
|
||||
export type IndexedNote = {
|
||||
export type IndexedDocument = {
|
||||
path: string
|
||||
basename: string
|
||||
mtime: number
|
||||
|
||||
26
src/main.ts
26
src/main.ts
@@ -9,7 +9,6 @@ import { loadSearchHistory } from './search-history'
|
||||
import { isFilePlaintext } from './utils'
|
||||
import * as NotesIndex from './notes-index'
|
||||
import { cacheManager } from './cache-manager'
|
||||
import { pdfManager } from './pdf-manager'
|
||||
|
||||
function _registerAPI(plugin: OmnisearchPlugin): void {
|
||||
registerAPI('omnisearch', api, plugin as any)
|
||||
@@ -25,7 +24,6 @@ export default class OmnisearchPlugin extends Plugin {
|
||||
await loadSettings(this)
|
||||
await loadSearchHistory()
|
||||
await cacheManager.loadNotesCache()
|
||||
await pdfManager.loadPDFCache()
|
||||
|
||||
_registerAPI(this)
|
||||
|
||||
@@ -91,7 +89,7 @@ export default class OmnisearchPlugin extends Plugin {
|
||||
|
||||
onunload(): void {
|
||||
console.log('Omnisearch - Interrupting PDF indexing')
|
||||
NotesIndex.pdfQueue.pause()
|
||||
NotesIndex.pdfQueue.clearQueue()
|
||||
}
|
||||
|
||||
addRibbonButton(): void {
|
||||
@@ -102,17 +100,17 @@ export default class OmnisearchPlugin extends Plugin {
|
||||
}
|
||||
|
||||
async function cleanOldCacheFiles() {
|
||||
const oldSearchIndexFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.json`
|
||||
if (await app.vault.adapter.exists(oldSearchIndexFilePath)) {
|
||||
try {
|
||||
await app.vault.adapter.remove(oldSearchIndexFilePath)
|
||||
} catch (e) {}
|
||||
}
|
||||
const oldNnotesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.json`
|
||||
if (await app.vault.adapter.exists(oldNnotesCacheFilePath)) {
|
||||
try {
|
||||
await app.vault.adapter.remove(oldNnotesCacheFilePath)
|
||||
} catch (e) {}
|
||||
const toDelete = [
|
||||
`${app.vault.configDir}/plugins/omnisearch/searchIndex.json`,
|
||||
`${app.vault.configDir}/plugins/omnisearch/notesCache.json`,
|
||||
`${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
|
||||
]
|
||||
for (const item of toDelete) {
|
||||
if (await app.vault.adapter.exists(item)) {
|
||||
try {
|
||||
await app.vault.adapter.remove(item)
|
||||
} catch (e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,23 +4,19 @@ import {
|
||||
getAliasesFromMetadata,
|
||||
getTagsFromMetadata,
|
||||
isFileIndexable,
|
||||
isFilePlaintext,
|
||||
removeDiacritics,
|
||||
wait,
|
||||
} from './utils'
|
||||
import { getNonExistingNotes, removeAnchors } from './notes'
|
||||
import * as PDF from './pdf-manager'
|
||||
import type { IndexedNote } from './globals'
|
||||
import { pdfManager } from './pdf-manager'
|
||||
import type { IndexedDocument } from './globals'
|
||||
import { settings } from './settings'
|
||||
import * as Search from './search'
|
||||
import PQueue from 'p-queue-compat'
|
||||
// import PQueue from 'p-queue-compat'
|
||||
import pLimit from 'p-limit'
|
||||
import { cacheManager } from './cache-manager'
|
||||
|
||||
let isIndexChanged: boolean
|
||||
|
||||
export const pdfQueue = new PQueue({
|
||||
concurrency: settings.backgroundProcesses,
|
||||
})
|
||||
export const pdfQueue = pLimit(settings.backgroundProcesses)
|
||||
|
||||
/**
|
||||
* Adds a file to the index
|
||||
@@ -59,14 +55,14 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
|
||||
|
||||
let content
|
||||
if (file.path.endsWith('.pdf')) {
|
||||
content = removeDiacritics(await PDF.pdfManager.getPdfText(file as TFile))
|
||||
content = removeDiacritics(await pdfManager.getPdfText(file as TFile))
|
||||
} else {
|
||||
// Fetch content from the cache to index it as-is
|
||||
content = removeDiacritics(await app.vault.cachedRead(file))
|
||||
}
|
||||
|
||||
// Make the document and index it
|
||||
const note: IndexedNote = {
|
||||
const note: IndexedDocument = {
|
||||
basename: removeDiacritics(file.basename),
|
||||
content,
|
||||
path: file.path,
|
||||
@@ -86,7 +82,6 @@ export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
|
||||
}
|
||||
|
||||
Search.minisearchInstance.add(note)
|
||||
isIndexChanged = true
|
||||
cacheManager.addNoteToCache(note.path, note)
|
||||
} catch (e) {
|
||||
console.trace('Error while indexing ' + file.basename)
|
||||
@@ -105,12 +100,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
|
||||
const filename = name + (name.endsWith('.md') ? '' : '.md')
|
||||
if (cacheManager.getNoteFromCache(filename)) return
|
||||
|
||||
const note = {
|
||||
const note: IndexedDocument = {
|
||||
path: filename,
|
||||
basename: name,
|
||||
mtime: 0,
|
||||
|
||||
content: '',
|
||||
tags: [],
|
||||
aliases: '',
|
||||
headings1: '',
|
||||
headings2: '',
|
||||
@@ -118,9 +114,8 @@ export function addNonExistingToIndex(name: string, parent: string): void {
|
||||
|
||||
doesNotExist: true,
|
||||
parent,
|
||||
} as IndexedNote
|
||||
}
|
||||
Search.minisearchInstance.add(note)
|
||||
isIndexChanged = true
|
||||
cacheManager.addNoteToCache(filename, note)
|
||||
}
|
||||
|
||||
@@ -129,14 +124,13 @@ export function addNonExistingToIndex(name: string, parent: string): void {
|
||||
* @param path
|
||||
*/
|
||||
export function removeFromIndex(path: string): void {
|
||||
if (!isFilePlaintext(path)) {
|
||||
if (!isFileIndexable(path)) {
|
||||
console.info(`"${path}" is not an indexable file`)
|
||||
return
|
||||
}
|
||||
const note = cacheManager.getNoteFromCache(path)
|
||||
if (note) {
|
||||
Search.minisearchInstance.remove(note)
|
||||
isIndexChanged = true
|
||||
cacheManager.removeNoteFromCache(path)
|
||||
cacheManager
|
||||
.getNonExistingNotesFromCache()
|
||||
@@ -175,21 +169,30 @@ export async function indexPDFs() {
|
||||
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
|
||||
console.time('PDF Indexing')
|
||||
console.log(`Omnisearch - Indexing ${files.length} PDFs`)
|
||||
const input = []
|
||||
for (const file of files) {
|
||||
if (cacheManager.getNoteFromCache(file.path)) {
|
||||
removeFromIndex(file.path)
|
||||
}
|
||||
pdfQueue.add(async () => {
|
||||
await addToIndexAndCache(file)
|
||||
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
||||
})
|
||||
input.push(
|
||||
pdfQueue(async () => {
|
||||
await addToIndexAndCache(file)
|
||||
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
||||
})
|
||||
)
|
||||
// pdfQueue.add(async () => {
|
||||
// await addToIndexAndCache(file)
|
||||
// await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
|
||||
// })
|
||||
}
|
||||
|
||||
await pdfQueue.onEmpty()
|
||||
await Promise.all(input)
|
||||
// await pdfQueue.onEmpty()
|
||||
console.timeEnd('PDF Indexing')
|
||||
|
||||
if (settings.showIndexingNotices) {
|
||||
new Notice(`Omnisearch - Indexed ${files.length} PDFs`)
|
||||
}
|
||||
|
||||
await pdfManager.cleanCache()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,51 +1,58 @@
|
||||
import type { TFile } from 'obsidian'
|
||||
import PQueue from 'p-queue-compat'
|
||||
import PDFWorker from 'web-worker:./pdf-worker.ts'
|
||||
import { pdfCacheFilePath } from './globals'
|
||||
import { deflate, inflate } from 'pako'
|
||||
import { makeMD5 } from './utils'
|
||||
import { database } from './database'
|
||||
|
||||
class PDFManager {
|
||||
private cache: Map<string, { content: string }> = new Map()
|
||||
private serializeQueue = new PQueue({ concurrency: 1 })
|
||||
|
||||
public async loadPDFCache(): Promise<void> {
|
||||
if (await app.vault.adapter.exists(pdfCacheFilePath)) {
|
||||
try {
|
||||
const data = await app.vault.adapter.readBinary(pdfCacheFilePath)
|
||||
const json = new TextDecoder('utf8').decode(inflate(data))
|
||||
this.cache = new Map(JSON.parse(json))
|
||||
} catch (e) {
|
||||
console.error(e)
|
||||
this.cache = new Map()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public async getPdfText(file: TFile): Promise<string> {
|
||||
// 1) Check if we can find by path & size
|
||||
const docByPath = await database.pdf.get({
|
||||
path: file.path,
|
||||
size: file.stat.size,
|
||||
})
|
||||
|
||||
if (docByPath) {
|
||||
return docByPath.text
|
||||
}
|
||||
|
||||
// 2) Check by hash
|
||||
const data = new Uint8Array(await app.vault.readBinary(file))
|
||||
const hash = makeMD5(data)
|
||||
if (this.cache.has(hash)) {
|
||||
return this.cache.get(hash)!.content
|
||||
const docByHash = await database.pdf.get(hash)
|
||||
if (docByHash) {
|
||||
return docByHash.text
|
||||
}
|
||||
|
||||
// 3) The PDF is not cached, extract it
|
||||
const worker = new PDFWorker({ name: 'PDF Text Extractor' })
|
||||
return new Promise(async (resolve, reject) => {
|
||||
// @ts-ignore
|
||||
file.stat.size
|
||||
worker.postMessage({ data, name: file.basename })
|
||||
worker.onmessage = (evt: any) => {
|
||||
const txt = evt.data.text
|
||||
this.updatePDFCache(hash, txt)
|
||||
resolve(txt)
|
||||
const text = (evt.data.text as string)
|
||||
// Replace \n with spaces
|
||||
.replace(/\n/g, ' ')
|
||||
// Trim multiple spaces
|
||||
.replace(/ +/g, ' ')
|
||||
.trim()
|
||||
database.pdf
|
||||
.add({ hash, text, path: file.path, size: file.stat.size })
|
||||
.then(() => {
|
||||
resolve(text)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
private async updatePDFCache(hash: string, content: string): Promise<void> {
|
||||
this.serializeQueue.add(() => {
|
||||
this.cache.set(hash, { content })
|
||||
const data = deflate(JSON.stringify(Array.from(this.cache), null, 1))
|
||||
app.vault.adapter.writeBinary(pdfCacheFilePath, data as any)
|
||||
/**
|
||||
* Removes the outdated cache entries
|
||||
*/
|
||||
public async cleanCache(): Promise<void> {
|
||||
database.pdf.each(async item => {
|
||||
if (!(await app.vault.adapter.exists(item.path))) {
|
||||
console.log(item.path + ' does not exist')
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,33 +2,25 @@ import { Notice } from 'obsidian'
|
||||
import MiniSearch, { type Options, type SearchResult } from 'minisearch'
|
||||
import {
|
||||
chsRegex,
|
||||
type IndexedNote,
|
||||
type IndexedDocument,
|
||||
type ResultNote,
|
||||
minisearchCacheFilePath,
|
||||
type SearchMatch,
|
||||
SPACE_OR_PUNCTUATION,
|
||||
} from './globals'
|
||||
import {
|
||||
isFileIndexable,
|
||||
isFilePlaintext,
|
||||
removeDiacritics,
|
||||
stringsToRegex,
|
||||
stripMarkdownCharacters,
|
||||
wait,
|
||||
} from './utils'
|
||||
import type { Query } from './query'
|
||||
import { settings } from './settings'
|
||||
// import {
|
||||
// getNoteFromCache,
|
||||
// isCacheOutdated,
|
||||
// loadNotesCache,
|
||||
// resetNotesCache,
|
||||
// } from './notes'
|
||||
import * as NotesIndex from './notes-index'
|
||||
import PQueue from 'p-queue-compat'
|
||||
import pLimit from 'p-limit'
|
||||
import { cacheManager } from './cache-manager'
|
||||
|
||||
export let minisearchInstance: MiniSearch<IndexedNote>
|
||||
export let minisearchInstance: MiniSearch<IndexedDocument>
|
||||
|
||||
const tokenize = (text: string): string[] => {
|
||||
const tokens = text.split(SPACE_OR_PUNCTUATION)
|
||||
@@ -46,7 +38,7 @@ const tokenize = (text: string): string[] => {
|
||||
* and adds all the notes to the index
|
||||
*/
|
||||
export async function initGlobalSearchIndex(): Promise<void> {
|
||||
const options: Options<IndexedNote> = {
|
||||
const options: Options<IndexedDocument> = {
|
||||
tokenize,
|
||||
processTerm: (term: string) =>
|
||||
(settings.ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
|
||||
@@ -106,15 +98,16 @@ export async function initGlobalSearchIndex(): Promise<void> {
|
||||
}
|
||||
|
||||
// Read and index all the files into the search engine
|
||||
const queue = new PQueue({ concurrency: 10 })
|
||||
const queue = pLimit(10)
|
||||
const input = []
|
||||
for (const file of files) {
|
||||
if (cacheManager.getNoteFromCache(file.path)) {
|
||||
NotesIndex.removeFromIndex(file.path)
|
||||
}
|
||||
queue.add(() => NotesIndex.addToIndexAndCache(file))
|
||||
input.push(queue(() => NotesIndex.addToIndexAndCache(file)))
|
||||
}
|
||||
|
||||
await queue.onEmpty()
|
||||
await Promise.all(input)
|
||||
|
||||
if (files.length > 0) {
|
||||
const message = `Omnisearch - Indexed ${files.length} ${notesSuffix} in ${
|
||||
|
||||
@@ -173,12 +173,15 @@ export class SettingsTab extends PluginSettingTab {
|
||||
})
|
||||
)
|
||||
|
||||
// PDF Indexing - not available on mobile
|
||||
// PDF Indexing
|
||||
const indexPDFsDesc = new DocumentFragment()
|
||||
indexPDFsDesc.createSpan({}, span => {
|
||||
span.innerHTML = `Omnisearch will include PDFs in search results.
|
||||
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
|
||||
Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.<br>
|
||||
<ul>
|
||||
<li>⚠️ Texts extracted from PDFs may contain errors such as missing spaces, or spaces in the middle of words.</li>
|
||||
<li>This feature is currently a work-in-progress, please report issues that you might experience.</li>
|
||||
<li>Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.</li>
|
||||
</ul>
|
||||
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
|
||||
})
|
||||
new Setting(containerEl)
|
||||
|
||||
4
src/typings/types-obsidian.d.ts
vendored
4
src/typings/types-obsidian.d.ts
vendored
@@ -19,6 +19,10 @@ declare module 'obsidian' {
|
||||
interface Vault {
|
||||
getConfig(string): unknown
|
||||
}
|
||||
|
||||
interface App {
|
||||
appId: string
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user