Squashed commit of the following:

commit 603b9bbde4c6efc90c81032e4e765c64d3075e75
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Tue Oct 11 21:47:03 2022 +0200

    Basic PDF indexing ok

commit 200331bb5c5111493af1e1f6ef8cd4bbfbdbfd4f
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Tue Oct 11 20:56:44 2022 +0200

    Tweaks and comments

commit 434b9662d40c5fea9d8b28d43828b11916db8c94
Author: Simon Cambier <simon.cambier@ores.be>
Date:   Tue Oct 11 16:22:55 2022 +0200

    Refactoring notes & minisearch cache

commit 7253c676c8ed161782ba8e33f0c4c162880925ad
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Tue Oct 11 09:50:33 2022 +0200

    wip

commit 77736e6ef6f28ccfddb64fb768732927d43bbd77
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Mon Oct 10 20:49:02 2022 +0200

    Small rewrites & deps updates

commit 59845fdb89eb6a3ad3f3f9ad75b39e7a3e604c45
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Mon Oct 10 12:22:11 2022 +0200

    wasm + worker ok

commit 1cf3b506e56147586cd0ebcc003642c5230e04cc
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 2 20:04:49 2022 +0200

    no disk access, of course

commit eb3dd9dd4f616a479a53e10856f6c96c6725e911
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 2 19:08:48 2022 +0200

    Rollup build ok

commit 54f2b7e615456c0e1b1504691689d1ba2c72d9e8
Author: Simon Cambier <simon.cambier@protonmail.com>
Date:   Sun Oct 2 16:03:31 2022 +0200

    Rollup build + wasm PoC
This commit is contained in:
Simon Cambier
2022-10-11 21:54:11 +02:00
parent cf7f6af257
commit 7ddae6dc08
28 changed files with 18437 additions and 923 deletions

View File

@@ -1,6 +1,6 @@
import type { ResultNote, SearchMatch } from './globals'
import { Query } from './query'
import { getSuggestions } from './search'
import * as Search from './search'
type ResultNoteApi = {
score: number
@@ -30,7 +30,7 @@ function mapResults(results: ResultNote[]): ResultNoteApi[] {
async function search(q: string): Promise<ResultNoteApi[]> {
const query = new Query(q)
const raw = await getSuggestions(query)
const raw = await Search.getSuggestions(query)
return mapResults(raw)
}

120
src/cache-manager.ts Normal file
View File

@@ -0,0 +1,120 @@
import { throttle } from 'lodash-es'
import type MiniSearch from 'minisearch'
import type { TFile } from 'obsidian'
import { deflate, inflate } from 'pako'
import {
notesCacheFilePath,
minisearchCacheFilePath,
type IndexedNote,
} from './globals'
import { settings } from './settings'
/**
 * Holds the in-memory notes cache and reads/writes it, together with the
 * serialized MiniSearch index, through the vault adapter.
 *
 * Every disk operation is skipped while `settings.persistCache` is off.
 */
class CacheManager {
  /** Indexed notes, keyed by their path. */
  notesCache: Record<string, IndexedNote> = {}
  /** Deflate payloads before writing them and inflate them after reading. */
  compress = true
  writeInterval = 5_000 // In milliseconds

  //#region Minisearch

  /**
   * Serializes and writes the Minisearch index on the disk.
   * Throttled to `writeInterval`, firing on the leading and trailing edge.
   * The arrow wrapper keeps `this` bound however the function is invoked.
   */
  public writeMinisearchIndex = throttle(
    (minisearch: MiniSearch) => this._writeMinisearchIndex(minisearch),
    this.writeInterval,
    {
      leading: true,
      trailing: true,
    }
  )

  private async _writeMinisearchIndex(minisearch: MiniSearch): Promise<void> {
    if (!settings.persistCache) {
      return
    }
    await this.writeFile(minisearchCacheFilePath, JSON.stringify(minisearch))
    console.log('Omnisearch - Minisearch index saved on disk')
  }

  /**
   * Reads the serialized Minisearch index back from the disk.
   * @returns the JSON text, or `null` when persistence is disabled, the file
   * does not exist, or it could not be read (the file is then removed).
   */
  public async readMinisearchIndex(): Promise<string | null> {
    if (!settings.persistCache) {
      return null
    }
    if (await app.vault.adapter.exists(minisearchCacheFilePath)) {
      try {
        const data = await app.vault.adapter.readBinary(minisearchCacheFilePath)
        return this.decode(data)
      } catch (e) {
        console.trace(
          'Omnisearch - Could not load MiniSearch index from the file:'
        )
        console.warn(e)
        await this.discardFile(minisearchCacheFilePath)
      }
    }
    return null
  }

  //#endregion Minisearch

  /**
   * Replaces `notesCache` with the content of the notes cache file, if any.
   * A file that cannot be read or parsed is removed and the in-memory cache
   * is left untouched.
   * @returns always `null`
   */
  public async loadNotesCache(): Promise<null> {
    if (!settings.persistCache) {
      return null
    }
    if (await app.vault.adapter.exists(notesCacheFilePath)) {
      try {
        const data = await app.vault.adapter.readBinary(notesCacheFilePath)
        const parsed: unknown = JSON.parse(this.decode(data))
        // Anything but a plain object would break every later cache lookup
        if (
          parsed === null ||
          typeof parsed !== 'object' ||
          Array.isArray(parsed)
        ) {
          throw new Error('Notes cache file does not contain an object')
        }
        this.notesCache = parsed as Record<string, IndexedNote>
      } catch (e) {
        console.trace('Omnisearch - Could not load notes cache:')
        console.warn(e)
        await this.discardFile(notesCacheFilePath)
      }
    }
    return null
  }

  /**
   * Writes the notes cache on the disk.
   * Throttled to `writeInterval`, firing on the leading and trailing edge.
   */
  public saveNotesCache = throttle(
    () => this._saveNotesCache(),
    this.writeInterval,
    {
      leading: true,
      trailing: true,
    }
  )

  private async _saveNotesCache(): Promise<void> {
    if (!settings.persistCache) {
      return
    }
    await this.writeFile(notesCacheFilePath, JSON.stringify(this.notesCache))
    console.log('Omnisearch - Notes cache saved on disk')
  }

  /** Stores `note` under `path` and schedules a write of the cache. */
  public addNoteToCache(path: string, note: IndexedNote) {
    this.notesCache[path] = note
    this.saveNotesCache()
  }

  /** Drops the entry stored under `key`. Does not schedule a write. */
  public removeNoteFromCache(key: string): void {
    delete this.notesCache[key]
  }

  public getNoteFromCache(key: string): IndexedNote | undefined {
    return this.notesCache[key]
  }

  /** Returns the cached notes flagged with `doesNotExist`. */
  public getNonExistingNotesFromCache(): IndexedNote[] {
    return Object.values(this.notesCache).filter(note => note.doesNotExist)
  }

  /**
   * A file is outdated when it has no cache entry, or when the cached
   * `mtime` differs from the file's current `stat.mtime`.
   */
  public isCacheOutdated(file: TFile): boolean {
    const indexedNote = this.getNoteFromCache(file.path)
    return !indexedNote || indexedNote.mtime !== file.stat.mtime
  }

  /**
   * Turns JSON text into the bytes to store: deflated when `compress` is
   * on, plain UTF-8 otherwise (never a raw string).
   */
  private encode(json: string): Uint8Array {
    return this.compress ? deflate(json) : new TextEncoder().encode(json)
  }

  /** Inverse of `encode`: always yields text, whatever `compress` is. */
  private decode(data: ArrayBuffer): string {
    return new TextDecoder('utf8').decode(
      this.compress ? inflate(data) : data
    )
  }

  /** Encodes `json` and writes it to `path` through the vault adapter. */
  private async writeFile(path: string, json: string): Promise<void> {
    // The byte view is handed over as-is, hence the cast
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    await app.vault.adapter.writeBinary(path, this.encode(json) as any)
  }

  /** Best-effort removal of an unreadable cache file; failures are logged. */
  private async discardFile(path: string): Promise<void> {
    try {
      await app.vault.adapter.remove(path)
    } catch (e) {
      console.warn(e)
    }
  }
}
/** Instance created once when this module is loaded; importers share it. */
export const cacheManager = new CacheManager()

View File

@@ -13,13 +13,13 @@
import { loopIndex } from 'src/utils'
import { onDestroy, onMount, tick } from 'svelte'
import { MarkdownView } from 'obsidian'
import { getSuggestions } from 'src/search'
import * as Search from 'src/search'
import ModalContainer from './ModalContainer.svelte'
import { OmnisearchInFileModal, OmnisearchVaultModal } from 'src/modals'
import ResultItemInFile from './ResultItemInFile.svelte'
import { Query } from 'src/query'
import { openNote } from 'src/notes'
import {saveSearchHistory} from "../search-history";
import { saveSearchHistory } from '../search-history'
export let modal: OmnisearchInFileModal
export let parent: OmnisearchVaultModal | null = null
@@ -50,7 +50,7 @@
$: (async () => {
if (searchQuery) {
query = new Query(searchQuery)
note = (await getSuggestions(query, { singleFilePath }))[0] ?? null
note = (await Search.getSuggestions(query, { singleFilePath }))[0] ?? null
lastSearch = searchQuery
}
selectedIndex = 0
@@ -143,20 +143,20 @@
</script>
<InputSearch
value={searchQuery}
on:input={e => (searchQuery = e.detail)}
value="{searchQuery}"
on:input="{e => (searchQuery = e.detail)}"
placeholder="Omnisearch - File" />
<ModalContainer>
{#if groupedOffsets.length && note}
{#each groupedOffsets as offset, i}
<ResultItemInFile
{offset}
{note}
index={i}
selected={i === selectedIndex}
on:mousemove={_e => (selectedIndex = i)}
on:click={openSelection} />
offset="{offset}"
note="{note}"
index="{i}"
selected="{i === selectedIndex}"
on:mousemove="{_e => (selectedIndex = i)}"
on:click="{openSelection}" />
{/each}
{:else}
<div style="text-align: center;">

View File

@@ -5,14 +5,14 @@
import ModalContainer from './ModalContainer.svelte'
import { eventBus, type ResultNote } from 'src/globals'
import { createNote, openNote } from 'src/notes'
import { getSuggestions } from 'src/search'
import * as Search from 'src/search'
import { getCtrlKeyLabel, getExtension, loopIndex } from 'src/utils'
import { OmnisearchInFileModal, type OmnisearchVaultModal } from 'src/modals'
import ResultItemVault from './ResultItemVault.svelte'
import { Query } from 'src/query'
import { saveSearchHistory, searchHistory } from 'src/search-history'
import { settings } from '../settings'
import { refreshIndex } from '../notes-index'
import * as NotesIndex from '../notes-index'
export let modal: OmnisearchVaultModal
let selectedIndex = 0
@@ -29,7 +29,7 @@
}
onMount(async () => {
await refreshIndex()
await NotesIndex.refreshIndex()
searchQuery = searchHistory[historySearchIndex]
eventBus.enable('vault')
eventBus.on('vault', 'enter', openNoteAndCloseModal)
@@ -63,7 +63,7 @@
async function updateResults() {
query = new Query(searchQuery)
resultNotes = (await getSuggestions(query)).sort(
resultNotes = (await Search.getSuggestions(query)).sort(
(a, b) => b.score - a.score
)
selectedIndex = 0

View File

@@ -1,8 +1,8 @@
<script lang="ts">
import { getNoteFromCache } from 'src/notes'
import { cacheManager } from 'src/cache-manager'
import { settings, showExcerpt } from 'src/settings'
import type { ResultNote } from '../globals'
import { getMatches } from '../search'
import * as Search from '../search'
import { highlighter, makeExcerpt, stringsToRegex } from '../utils'
import ResultItemContainer from './ResultItemContainer.svelte'
@@ -10,13 +10,18 @@
export let note: ResultNote
$: reg = stringsToRegex(note.foundWords)
$: matches = getMatches(note.content, reg)
$: matches = Search.getMatches(note.content, reg)
$: cleanedContent = makeExcerpt(note.content, note.matches[0]?.offset ?? -1)
$: glyph = getNoteFromCache(note.path)?.doesNotExist
$: glyph = cacheManager.getNoteFromCache(note.path)?.doesNotExist
$: title = settings.showShortName ? note.basename : note.path
</script>
<ResultItemContainer id={note.path} {selected} on:mousemove on:click {glyph}>
<ResultItemContainer
id="{note.path}"
selected="{selected}"
on:mousemove
on:click
glyph="{glyph}">
<div>
<span class="omnisearch-result__title">
{@html title.replace(reg, highlighter)}

View File

@@ -12,8 +12,9 @@ export const highlightClass = 'suggestion-highlight omnisearch-highlight'
export const eventBus = new EventBus()
export const searchIndexFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.json`
export const notesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.json`
export const minisearchCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/searchIndex.data`
export const notesCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/notesCache.data`
export const pdfCacheFilePath = `${app.vault.configDir}/plugins/omnisearch/pdfCache.data`
export const historyFilePath = `${app.vault.configDir}/plugins/omnisearch/historyCache.json`
export const EventNames = {

57
src/lib.rs Normal file
View File

@@ -0,0 +1,57 @@
use js_sys::Uint8Array;
use pdf_extract::extract_text_from_mem;
use wasm_bindgen::prelude::*;
mod obsidian;
/// Extracts the text of a PDF handed over from JavaScript as raw bytes.
///
/// When extraction fails, the error's message is returned in place of the
/// document text, so the caller cannot tell the two cases apart.
#[wasm_bindgen]
pub fn extract_pdf_text(arr: Uint8Array) -> String {
    // FIXME: return a Result<> here, to throw in JS in case of an error
    let bytes = arr.to_vec();
    extract_text_from_mem(&bytes).unwrap_or_else(|err| err.to_string())
}
// #[wasm_bindgen]
// pub struct ExampleCommand {
// id: JsString,
// name: JsString,
// }
// #[wasm_bindgen]
// impl ExampleCommand {
// #[wasm_bindgen(getter)]
// pub fn id(&self) -> JsString {
// self.id.clone()
// }
// #[wasm_bindgen(setter)]
// pub fn set_id(&mut self, id: &str) {
// self.id = JsString::from(id)
// }
// #[wasm_bindgen(getter)]
// pub fn name(&self) -> JsString {
// self.name.clone()
// }
// #[wasm_bindgen(setter)]
// pub fn set_name(&mut self, name: &str) {
// self.name = JsString::from(name)
// }
// pub fn callback(&self) {
// obsidian::Notice::new("hello from rust");
// }
// }
// #[wasm_bindgen]
// pub fn onload(plugin: &obsidian::Plugin) {
// let cmd = ExampleCommand {
// id: JsString::from("example"),
// name: JsString::from("Example"),
// };
// plugin.addCommand(JsValue::from(cmd))
// }

View File

@@ -1,13 +1,15 @@
import { Plugin, TFile } from 'obsidian'
import { initGlobalSearchIndex } from './search'
import * as Search from './search'
import { OmnisearchInFileModal, OmnisearchVaultModal } from './modals'
import { loadSettings, settings, SettingsTab, showExcerpt } from './settings'
import {eventBus, EventNames} from './globals'
import { eventBus, EventNames } from './globals'
import { registerAPI } from '@vanakat/plugin-api'
import api from './api'
import { loadSearchHistory } from './search-history'
import {isFilePlaintext, showWelcomeNotice} from './utils'
import { addNoteToReindex, addToIndex, removeFromIndex } from './notes-index'
import { isFilePlaintext, showWelcomeNotice } from './utils'
import * as NotesIndex from './notes-index'
import { cacheManager } from './cache-manager'
import { pdfManager } from './pdf-manager'
function _registerAPI(plugin: OmnisearchPlugin): void {
registerAPI('omnisearch', api, plugin as any)
@@ -19,10 +21,12 @@ function _registerAPI(plugin: OmnisearchPlugin): void {
export default class OmnisearchPlugin extends Plugin {
async onload(): Promise<void> {
// additional files to index by Omnisearch
await loadSettings(this)
await loadSearchHistory()
await cacheManager.loadNotesCache()
await pdfManager.loadPDFCache()
_registerAPI(this)
if (settings.ribbonIcon) {
@@ -57,35 +61,38 @@ export default class OmnisearchPlugin extends Plugin {
// Listeners to keep the search index up-to-date
this.registerEvent(
this.app.vault.on('create', file => {
addToIndex(file)
NotesIndex.addToIndexAndCache(file)
})
)
this.registerEvent(
this.app.vault.on('delete', file => {
removeFromIndex(file.path)
NotesIndex.removeFromIndex(file.path)
})
)
this.registerEvent(
this.app.vault.on('modify', async file => {
addNoteToReindex(file)
NotesIndex.addNoteToReindex(file)
})
)
this.registerEvent(
this.app.vault.on('rename', async (file, oldPath) => {
if (file instanceof TFile && isFilePlaintext(file.path)) {
removeFromIndex(oldPath)
await addToIndex(file)
NotesIndex.removeFromIndex(oldPath)
await NotesIndex.addToIndexAndCache(file)
}
})
)
await initGlobalSearchIndex()
await Search.initGlobalSearchIndex()
})
// showWelcomeNotice(this)
}
onunload(): void {}
onunload(): void {
console.log('Omnisearch - Interrupting PDF indexing')
NotesIndex.pdfQueue.pause()
}
addRibbonButton(): void {
this.addRibbonIcon('search', 'Omnisearch', _evt => {

View File

@@ -1,6 +1,5 @@
import {Notice, TAbstractFile, TFile} from 'obsidian'
import { Notice, TAbstractFile, TFile } from 'obsidian'
import {
canIndexPDFs,
extractHeadingsFromCache,
getAliasesFromMetadata,
getTagsFromMetadata,
@@ -9,36 +8,33 @@ import {
removeDiacritics,
wait,
} from './utils'
import {
addNoteToCache,
getNonExistingNotes,
getNonExistingNotesFromCache,
getNoteFromCache,
removeAnchors,
removeNoteFromCache,
saveNotesCacheToFile,
} from './notes'
import {getPdfText} from './pdf-parser'
import type {IndexedNote} from './globals'
import {searchIndexFilePath} from './globals'
import {settings} from './settings'
import {minisearchInstance} from './search'
import { getNonExistingNotes, removeAnchors } from './notes'
import * as PDF from './pdf-manager'
import type { IndexedNote } from './globals'
import { settings } from './settings'
import * as Search from './search'
import PQueue from 'p-queue-compat'
import { cacheManager } from './cache-manager'
let isIndexChanged: boolean
export const pdfQueue = new PQueue({
concurrency: settings.backgroundProcesses,
})
/**
* Adds a file to the index
* @param file
* @returns
*/
export async function addToIndex(file: TAbstractFile): Promise<void> {
export async function addToIndexAndCache(file: TAbstractFile): Promise<void> {
if (!(file instanceof TFile) || !isFileIndexable(file.path)) {
return
}
// Check if the file was already indexed as non-existent,
// and if so, remove it from the index (before adding it again)
if (getNoteFromCache(file.path)?.doesNotExist) {
if (cacheManager.getNoteFromCache(file.path)?.doesNotExist) {
removeFromIndex(file.path)
}
@@ -50,18 +46,20 @@ export async function addToIndex(file: TAbstractFile): Promise<void> {
const metadata = app.metadataCache.getFileCache(file)
if (metadata) {
const nonExisting = getNonExistingNotes(file, metadata)
for (const name of nonExisting.filter(o => !getNoteFromCache(o))) {
for (const name of nonExisting.filter(
o => !cacheManager.getNoteFromCache(o)
)) {
addNonExistingToIndex(name, file.path)
}
}
if (getNoteFromCache(file.path)) {
if (cacheManager.getNoteFromCache(file.path)) {
throw new Error(`${file.basename} is already indexed`)
}
let content
if (file.path.endsWith('.pdf')) {
content = removeDiacritics(await getPdfText(file as TFile))
content = removeDiacritics(await PDF.pdfManager.getPdfText(file as TFile))
} else {
// Fetch content from the cache to index it as-is
content = removeDiacritics(await app.vault.cachedRead(file))
@@ -87,9 +85,9 @@ export async function addToIndex(file: TAbstractFile): Promise<void> {
: '',
}
minisearchInstance.add(note)
Search.minisearchInstance.add(note)
isIndexChanged = true
addNoteToCache(note.path, note)
cacheManager.addNoteToCache(note.path, note)
} catch (e) {
console.trace('Error while indexing ' + file.basename)
console.error(e)
@@ -105,7 +103,7 @@ export async function addToIndex(file: TAbstractFile): Promise<void> {
export function addNonExistingToIndex(name: string, parent: string): void {
name = removeAnchors(name)
const filename = name + (name.endsWith('.md') ? '' : '.md')
if (getNoteFromCache(filename)) return
if (cacheManager.getNoteFromCache(filename)) return
const note = {
path: filename,
@@ -121,9 +119,9 @@ export function addNonExistingToIndex(name: string, parent: string): void {
doesNotExist: true,
parent,
} as IndexedNote
minisearchInstance.add(note)
Search.minisearchInstance.add(note)
isIndexChanged = true
addNoteToCache(filename, note)
cacheManager.addNoteToCache(filename, note)
}
/**
@@ -135,18 +133,19 @@ export function removeFromIndex(path: string): void {
console.info(`"${path}" is not an indexable file`)
return
}
const note = getNoteFromCache(path)
const note = cacheManager.getNoteFromCache(path)
if (note) {
minisearchInstance.remove(note)
Search.minisearchInstance.remove(note)
isIndexChanged = true
removeNoteFromCache(path)
getNonExistingNotesFromCache()
cacheManager.removeNoteFromCache(path)
cacheManager
.getNonExistingNotesFromCache()
.filter(n => n.parent === path)
.forEach(n => {
removeFromIndex(n.path)
})
} else {
console.warn(`not not found under path ${path}`)
console.warn(`Omnisearch - Note not found under path ${path}`)
}
}
@@ -157,54 +156,40 @@ export function addNoteToReindex(note: TAbstractFile): void {
}
export async function refreshIndex(): Promise<void> {
if (settings.showIndexingNotices && notesToReindex.size > 0) {
new Notice(`Omnisearch - Reindexing ${notesToReindex.size} notes`, 2000)
}
for (const note of notesToReindex) {
removeFromIndex(note.path)
await addToIndex(note)
await wait(0)
}
notesToReindex.clear()
await saveIndexToFile()
}
export async function saveIndexToFile(): Promise<void> {
if (settings.storeIndexInFile && minisearchInstance && isIndexChanged) {
const json = JSON.stringify(minisearchInstance)
await app.vault.adapter.write(searchIndexFilePath, json)
console.log('Omnisearch - Index saved on disk')
await saveNotesCacheToFile()
isIndexChanged = false
if (notesToReindex.size > 0) {
if (settings.showIndexingNotices) {
new Notice(`Omnisearch - Reindexing ${notesToReindex.size} notes`, 2000)
}
for (const note of notesToReindex) {
removeFromIndex(note.path)
await addToIndexAndCache(note)
await wait(0)
}
notesToReindex.clear()
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
}
}
export async function indexPDFs() {
if (canIndexPDFs()) {
const start = new Date().getTime()
if (settings.PDFIndexing) {
const files = app.vault.getFiles().filter(f => f.path.endsWith('.pdf'))
if (files.length > 50) {
new Notice(`⚠️ Omnisearch is indexing ${files.length} PDFs. You can experience slowdowns while this work is in progress.`)
}
const promises: Promise<void>[] = []
console.time('PDF Indexing')
console.log(`Omnisearch - Indexing ${files.length} PDFs`)
for (const file of files) {
if (getNoteFromCache(file.path)) {
if (cacheManager.getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
promises.push(addToIndex(file))
pdfQueue.add(async () => {
await addToIndexAndCache(file)
await cacheManager.writeMinisearchIndex(Search.minisearchInstance)
})
}
await Promise.all(promises)
// Notice & log
const message = `Omnisearch - Indexed ${files.length} PDFs in ${
new Date().getTime() - start
}ms`
await pdfQueue.onEmpty()
console.timeEnd('PDF Indexing')
if (settings.showIndexingNotices) {
new Notice(message)
new Notice(`Omnisearch - Indexed ${files.length} PDFs`)
}
console.log(message)
}
}
}

View File

@@ -1,55 +1,6 @@
import { type CachedMetadata, MarkdownView, TFile } from 'obsidian'
import {
type IndexedNote,
notesCacheFilePath,
type ResultNote,
} from './globals'
import { stringsToRegex } from './utils'
import { settings } from './settings'
/**
* This is an in-memory cache of the notes, with all their computed fields
* used by the search engine.
* This cache allows us to quickly de-index notes when they are deleted or updated.
*/
export let notesCache: Record<string, IndexedNote> = {}
export function resetNotesCache(): void {
notesCache = {}
}
export async function loadNotesCache(): Promise<void> {
if (
settings.storeIndexInFile &&
(await app.vault.adapter.exists(notesCacheFilePath))
) {
try {
const json = await app.vault.adapter.read(notesCacheFilePath)
notesCache = JSON.parse(json)
console.log('Omnisearch - Notes cache loaded from the file')
} catch (e) {
console.trace('Omnisearch - Could not load Notes cache from the file')
console.error(e)
}
}
notesCache ||= {}
}
export function getNoteFromCache(key: string): IndexedNote | undefined {
return notesCache[key]
}
export function getNonExistingNotesFromCache(): IndexedNote[] {
return Object.values(notesCache).filter(note => note.doesNotExist)
}
export function addNoteToCache(filename: string, note: IndexedNote): void {
notesCache[filename] = note
}
export function removeNoteFromCache(key: string): void {
delete notesCache[key]
}
import type { ResultNote } from './globals'
export async function openNote(
item: ResultNote,
@@ -145,14 +96,3 @@ export function getNonExistingNotes(
export function removeAnchors(name: string): string {
return name.split(/[\^#]+/)[0]
}
export async function saveNotesCacheToFile(): Promise<void> {
const json = JSON.stringify(notesCache)
await app.vault.adapter.write(notesCacheFilePath, json)
console.log('Omnisearch - Notes cache saved to the file')
}
export function isCacheOutdated(file: TFile): boolean {
const indexedNote = getNoteFromCache(file.path)
return !indexedNote || indexedNote.mtime !== file.stat.mtime
}

14
src/obsidian.rs Normal file
View File

@@ -0,0 +1,14 @@
use wasm_bindgen::prelude::*;
// Bindings to items imported from the JavaScript module named "obsidian".
#[wasm_bindgen(module = "obsidian")]
extern "C" {
// Opaque handle to a JS `Plugin` value.
pub type Plugin;
// Bound as a method: `this` is the receiver, `command` is passed through as-is.
// NOTE(review): `structural` presumably makes the call resolve `addCommand` on
// the object at call time — confirm against the wasm-bindgen guide.
#[wasm_bindgen(structural, method)]
pub fn addCommand(this: &Plugin, command: JsValue);
// Opaque handle to a JS `Notice` value.
pub type Notice;
// Bound as the JS constructor of `Notice`, taking the message to display.
#[wasm_bindgen(constructor)]
pub fn new(message: &str) -> Notice;
}

53
src/pdf-manager.ts Normal file
View File

@@ -0,0 +1,53 @@
import type { TFile } from 'obsidian'
import PQueue from 'p-queue-compat'
import PDFWorker from 'web-worker:./pdf-worker.ts'
import { pdfCacheFilePath } from './globals'
import { deflate, inflate } from 'pako'
import { md5 } from './utils'
/**
 * Extracts text from PDF files in a web worker and caches the result,
 * keyed by the md5 hash of the file's bytes. The cache is persisted,
 * deflated, at `pdfCacheFilePath`.
 */
class PDFManager {
  private cache: Map<string, { content: string }> = new Map()
  /** One task at a time, so that cache writes never overlap. */
  private serializeQueue = new PQueue({ concurrency: 1 })

  /**
   * Loads the persisted cache, if the file exists.
   * An unreadable file is logged and the cache starts empty.
   */
  public async loadPDFCache(): Promise<void> {
    if (await app.vault.adapter.exists(pdfCacheFilePath)) {
      try {
        const data = await app.vault.adapter.readBinary(pdfCacheFilePath)
        const json = new TextDecoder('utf8').decode(inflate(data))
        this.cache = new Map(JSON.parse(json))
      } catch (e) {
        console.error(e)
        this.cache = new Map()
      }
    }
  }

  /**
   * Returns the text of `file`, from the cache when its bytes were already
   * processed, otherwise by running a dedicated worker.
   * @throws if the worker reports an error instead of answering
   */
  public async getPdfText(file: TFile): Promise<string> {
    const data = new Uint8Array(await app.vault.readBinary(file))
    const hash = md5(data)
    const cached = this.cache.get(hash)
    if (cached) {
      return cached.content
    }

    const worker = new PDFWorker({ name: 'PDF Text Extractor' })
    try {
      const text = await new Promise<string>((resolve, reject) => {
        // Handlers are attached before posting, so no reply can be missed
        worker.onmessage = (evt: { data: { text: string } }) => {
          resolve(evt.data.text)
        }
        // Without this, a failing worker would leave the promise pending forever
        worker.onerror = (evt: { message?: string }) => {
          reject(
            new Error(
              `Omnisearch - PDF worker failed: ${evt?.message ?? 'unknown error'}`
            )
          )
        }
        // @ts-ignore
        worker.postMessage({ data })
      })
      // Persisting is fire-and-forget: the caller only needs the text
      void this.updatePDFCache(hash, text).catch((e: unknown) => {
        console.error(e)
      })
      return text
    } finally {
      // One worker is spawned per file: release it once it answered or failed
      worker.terminate()
    }
  }

  /**
   * Records `content` under `hash`, then queues a write of the whole cache.
   * The returned promise settles once that write is done.
   */
  private async updatePDFCache(hash: string, content: string): Promise<void> {
    this.cache.set(hash, { content })
    await this.serializeQueue.add(async () => {
      const data = deflate(JSON.stringify(Array.from(this.cache), null, 1))
      // Awaited so the queue really runs one write at a time
      await app.vault.adapter.writeBinary(pdfCacheFilePath, data as any)
    })
  }
}
/** Instance created once when this module is loaded; importers share it. */
export const pdfManager = new PDFManager()

View File

@@ -1,17 +0,0 @@
import type { TFile } from 'obsidian'
import PDFJs from 'pdfjs-dist'
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
PDFJs.GlobalWorkerOptions.workerSrc = pdfjsWorker
// https://stackoverflow.com/a/59929946
export async function getPdfText(file: TFile): Promise<string> {
const data = await app.vault.readBinary(file)
const doc = await PDFJs.getDocument(data).promise
const pageTexts = Array.from({ length: doc.numPages }, async (v, i) => {
const page = await doc.getPage(i + 1)
const content = await page.getTextContent()
return (content.items as any[]).map(token => token.str).join('')
})
return (await Promise.all(pageTexts)).join('')
}

16
src/pdf-worker.ts Normal file
View File

@@ -0,0 +1,16 @@
import rustPlugin from '../pkg/obsidian_search_bg.wasm'
import * as plugin from '../pkg/obsidian_search'
// The wasm binary is imported as a string and decoded once, when the worker script loads.
// NOTE(review): presumably the bundler inlines the .wasm file as base64 — confirm in the build config.
const decodedPlugin = decodeBase64(rustPlugin as any)
// Handles one extraction request: reads the PDF bytes from `evt.data.data`
// and replies with `{ text }`.
onmessage = async evt => {
// Turn the decoded string back into bytes, one char code per byte
const buffer = Uint8Array.from(decodedPlugin, c => c.charCodeAt(0))
// Must complete before `extract_pdf_text` is called.
// NOTE(review): presumably the generated wasm-bindgen init function — confirm in ../pkg.
await plugin.default(Promise.resolve(buffer))
const text = plugin.extract_pdf_text(evt.data.data as Uint8Array)
self.postMessage({ text })
}
/**
 * Decodes a base64 payload with `atob`.
 * @param data base64-encoded text
 * @returns the decoded string, as produced by `atob`
 */
function decodeBase64(data: string) {
  // Alternative kept for reference: Buffer.from(data, 'base64').toString()
  const decoded = atob(data)
  return decoded
}

View File

@@ -4,12 +4,12 @@ import {
chsRegex,
type IndexedNote,
type ResultNote,
searchIndexFilePath,
minisearchCacheFilePath,
type SearchMatch,
SPACE_OR_PUNCTUATION,
} from './globals'
import {
canIndexPDFs,
isFileIndexable,
isFilePlaintext,
removeDiacritics,
stringsToRegex,
@@ -18,13 +18,15 @@ import {
} from './utils'
import type { Query } from './query'
import { settings } from './settings'
import {
getNoteFromCache,
isCacheOutdated,
loadNotesCache,
resetNotesCache,
} from './notes'
import {addToIndex, indexPDFs, removeFromIndex, saveIndexToFile} from './notes-index'
// import {
// getNoteFromCache,
// isCacheOutdated,
// loadNotesCache,
// resetNotesCache,
// } from './notes'
import * as NotesIndex from './notes-index'
import PQueue from 'p-queue-compat'
import { cacheManager } from './cache-manager'
export let minisearchInstance: MiniSearch<IndexedNote>
@@ -60,15 +62,18 @@ export async function initGlobalSearchIndex(): Promise<void> {
storeFields: ['tags'],
}
if (
settings.storeIndexInFile &&
(await app.vault.adapter.exists(searchIndexFilePath))
) {
// Default instance
minisearchInstance = new MiniSearch(options)
// Load Minisearch cache, if it exists
if (await app.vault.adapter.exists(minisearchCacheFilePath)) {
try {
const json = await app.vault.adapter.read(searchIndexFilePath)
minisearchInstance = MiniSearch.loadJSON(json, options)
const json = await cacheManager.readMinisearchIndex()
if (json) {
// If we have cache data, reload it
minisearchInstance = MiniSearch.loadJSON(json, options)
}
console.log('Omnisearch - MiniSearch index loaded from the file')
await loadNotesCache()
} catch (e) {
console.trace(
'Omnisearch - Could not load MiniSearch index from the file'
@@ -77,10 +82,9 @@ export async function initGlobalSearchIndex(): Promise<void> {
}
}
if (!minisearchInstance) {
minisearchInstance = new MiniSearch(options)
resetNotesCache()
}
// if (!minisearchInstance) {
// resetNotesCache()
// }
// Index files that are already present
const start = new Date().getTime()
@@ -89,32 +93,28 @@ export async function initGlobalSearchIndex(): Promise<void> {
let files
let notesSuffix
if (settings.storeIndexInFile) {
files = allFiles.filter(file => isCacheOutdated(file))
if (settings.persistCache) {
files = allFiles.filter(file => cacheManager.isCacheOutdated(file))
notesSuffix = 'modified notes'
} else {
files = allFiles
notesSuffix = 'notes'
}
console.log(`Omnisearch - indexing ${files.length} ${notesSuffix}`)
// This is basically the same behavior as MiniSearch's `addAllAsync()`.
// We index markdown and plaintext files by batches of 10
let promises: Promise<void>[] = []
for (let i = 0; i < files.length; ++i) {
const file = files[i]
if (getNoteFromCache(file.path)) {
removeFromIndex(file.path)
}
promises.push(addToIndex(file))
if (i % 10 === 0) {
await wait(1)
await Promise.all(promises)
promises = []
}
if (files.length > 0) {
console.log(`Omnisearch - Indexing ${files.length} ${notesSuffix}`)
}
await Promise.all(promises)
// Read and index all the files into the search engine
const queue = new PQueue({ concurrency: 10 })
for (const file of files) {
if (cacheManager.getNoteFromCache(file.path)) {
NotesIndex.removeFromIndex(file.path)
}
queue.add(() => NotesIndex.addToIndexAndCache(file))
}
await queue.onEmpty()
if (files.length > 0) {
const message = `Omnisearch - Indexed ${files.length} ${notesSuffix} in ${
@@ -127,10 +127,10 @@ export async function initGlobalSearchIndex(): Promise<void> {
new Notice(message)
}
await saveIndexToFile()
await cacheManager.writeMinisearchIndex(minisearchInstance)
// PDFs are indexed later, since they're heavier
await indexPDFs()
await NotesIndex.indexPDFs()
}
}
@@ -172,9 +172,10 @@ async function search(query: Query): Promise<SearchResult[]> {
const exactTerms = query.getExactTerms()
if (exactTerms.length) {
results = results.filter(r => {
const title = getNoteFromCache(r.id)?.path.toLowerCase() ?? ''
const title =
cacheManager.getNoteFromCache(r.id)?.path.toLowerCase() ?? ''
const content = stripMarkdownCharacters(
getNoteFromCache(r.id)?.content ?? ''
cacheManager.getNoteFromCache(r.id)?.content ?? ''
).toLowerCase()
return exactTerms.every(q => content.includes(q) || title.includes(q))
})
@@ -185,7 +186,7 @@ async function search(query: Query): Promise<SearchResult[]> {
if (exclusions.length) {
results = results.filter(r => {
const content = stripMarkdownCharacters(
getNoteFromCache(r.id)?.content ?? ''
cacheManager.getNoteFromCache(r.id)?.content ?? ''
).toLowerCase()
return exclusions.every(q => !content.includes(q.value))
})
@@ -253,7 +254,7 @@ export async function getSuggestions(
// Map the raw results to get usable suggestions
return results.map(result => {
const note = getNoteFromCache(result.id)
const note = cacheManager.getNoteFromCache(result.id)
if (!note) {
throw new Error(`Note "${result.id}" not indexed`)
}

View File

@@ -1,6 +1,6 @@
import { Plugin, PluginSettingTab, Setting, SliderComponent } from 'obsidian'
import { writable } from 'svelte/store'
import { notesCacheFilePath, searchIndexFilePath } from './globals'
import { notesCacheFilePath, minisearchCacheFilePath } from './globals'
import type OmnisearchPlugin from './main'
interface WeightingSettings {
@@ -11,20 +11,33 @@ interface WeightingSettings {
}
export interface OmnisearchSettings extends WeightingSettings {
/** Respect the "excluded files" Obsidian setting by downranking results ignored files */
respectExcluded: boolean
/** Ignore diacritics when indexing files */
ignoreDiacritics: boolean
/** Extensions of plain text files to index, in addition to .md */
indexedFileTypes: string[]
indexPDFs: boolean
storeIndexInFile: boolean
/** Enable PDF indexing */
PDFIndexing: boolean
/** Max number of spawned processes for background tasks, such as extracting text from PDFs */
backgroundProcesses: number
/** Write cache files on disk (unrelated to PDFs) */
persistCache: boolean
/** Display Omnisearch popup notices over Obsidian */
showIndexingNotices: boolean
/** Activate the small 🔍 button on Obsidian's ribbon */
ribbonIcon: boolean
/** Display short filenames in search results, instead of the full path */
showShortName: boolean
/** Display the small contextual excerpt in search results */
showExcerpt: boolean
/** Enable a "create note" button in the Vault Search modal */
showCreateButton: boolean
/** Vim mode shortcuts */
CtrlJK: boolean
/** Vim mode shortcuts */
CtrlNP: boolean
/** Key for the welcome message when Obsidian is updated. A message is only shown once. */
welcomeMessage: string
}
@@ -74,7 +87,7 @@ export class SettingsTab extends PluginSettingTab {
const diacriticsDesc = new DocumentFragment()
diacriticsDesc.createSpan({}, span => {
span.innerHTML = `Normalize diacritics in search terms. Words like "brûlée" or "žluťoučký" will be indexed as "brulee" and "zlutoucky".<br/>
<strong>Needs a restart to fully take effect.</strong>`
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('Ignore diacritics')
@@ -91,7 +104,7 @@ export class SettingsTab extends PluginSettingTab {
indexedFileTypesDesc.createSpan({}, span => {
span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other plain text files.<br/>
Add extensions separated by a space. Example: <code>txt org</code>.<br />
<strong>Needs a restart to fully take effect.</strong>`
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('Additional files to index')
@@ -106,50 +119,68 @@ export class SettingsTab extends PluginSettingTab {
})
})
// Index PDFs
const indexPDFsDesc = new DocumentFragment()
indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will index your PDFs, and return them in search results.
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
PDFs being quite slow to index, <strong style="color: var(--text-accent)">it is strongly recommended to also enable "Store index in file"</strong>.<br>
<strong>Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('BETA - Index PDFs')
.setDesc(indexPDFsDesc)
.addToggle(toggle =>
toggle.setValue(settings.indexPDFs).onChange(async v => {
settings.indexPDFs = v
await saveSettings(this.plugin)
})
)
// // Background processes
// new Setting(containerEl)
// .setName(
// `Background processes (default: ${DEFAULT_SETTINGS.backgroundProcesses})`
// )
// .setDesc('The maximum number of processes for background work, like PDF indexing. This value should not be higher than your number of CPU cores.')
// .addSlider(cb => {
// cb.setLimits(1, 16, 1)
// .setValue(settings.backgroundProcesses)
// .setDynamicTooltip()
// .onChange(v => {
// settings.backgroundProcesses = v
// saveSettings(this.plugin)
// })
// })
// Store index
const serializedIndexDesc = new DocumentFragment()
serializedIndexDesc.createSpan({}, span => {
span.innerHTML = `The search index is stored on disk, instead of being rebuilt at every startup.
This results in faster loading times for bigger vaults and mobile devices.<br />
<em>⚠️ Note: the index can become corrupted - if you notice any issue, disable and re-enable this option to clear the cache.</em><br/>
<em>⚠️ Cache files in <code>.obsidian/plugins/omnisearch/</code> must not be synchronized.</em><br/>
<strong>Needs a restart to fully take effect.</strong>
span.innerHTML = `This will speedup startup times after the initial indexing. Do not activate it unless indexing is too slow on your device:
<ul>
<li>PDF indexing is not affected by this setting</li>
<li>⚠️ The index can become corrupted - if you notice any issue, disable and re-enable this option to clear the cache.</li>
<li>⚠️ Cache files in <code>.obsidian/plugins/omnisearch/*.data</code> must not be synchronized between your devices.</li>
</ul>
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>
`
})
new Setting(containerEl)
.setName('Store index in file')
.setName('Persist cache on disk')
.setDesc(serializedIndexDesc)
.addToggle(toggle =>
toggle.setValue(settings.storeIndexInFile).onChange(async v => {
toggle.setValue(settings.persistCache).onChange(async v => {
try {
await app.vault.adapter.remove(notesCacheFilePath)
} catch (e) {
console.warn(e)
}
try {
await app.vault.adapter.remove(searchIndexFilePath)
await app.vault.adapter.remove(minisearchCacheFilePath)
} catch (e) {
console.warn(e)
}
settings.storeIndexInFile = v
settings.persistCache = v
await saveSettings(this.plugin)
})
)
// PDF Indexing
const indexPDFsDesc = new DocumentFragment()
indexPDFsDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will include PDFs in search results.
This feature is currently a work-in-progress, please report slowdowns or issues that you might experience.<br>
Each PDF can take a few seconds to be indexed, so it may not appear immediately in search results.<br>
<strong style="color: var(--text-accent)">Needs a restart to fully take effect.</strong>`
})
new Setting(containerEl)
.setName('BETA - PDF Indexing')
.setDesc(indexPDFsDesc)
.addToggle(toggle =>
toggle.setValue(settings.PDFIndexing).onChange(async v => {
settings.PDFIndexing = v
await saveSettings(this.plugin)
})
)
@@ -285,12 +316,12 @@ export class SettingsTab extends PluginSettingTab {
/**
 * Configures a slider that edits one of the WeightingSettings keys.
 * The slider is limited to the range 1 to 3 with a step of 0.1, is initialised
 * from `settings[key]`, and writes each new value back to `settings[key]`
 * before calling saveSettings.
 *
 * @param cb slider component to configure
 * @param key name of the weighting setting bound to the slider
 */
weightSlider(cb: SliderComponent, key: keyof WeightingSettings): void {
cb.setLimits(1, 3, 0.1)
cb.setValue(settings[key])
cb.setDynamicTooltip()
cb.onChange(v => {
settings[key] = v
// The result of saveSettings is not awaited here
saveSettings(this.plugin)
})
// NOTE(review): the chained calls below repeat the setValue / setDynamicTooltip /
// onChange configuration already applied by the statements above — confirm
// which of the two forms is meant to remain.
.setValue(settings[key])
.setDynamicTooltip()
.onChange(v => {
settings[key] = v
// The result of saveSettings is not awaited here
saveSettings(this.plugin)
})
}
}
@@ -298,7 +329,8 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
respectExcluded: true,
ignoreDiacritics: true,
indexedFileTypes: [] as string[],
indexPDFs: false,
PDFIndexing: false,
backgroundProcesses: Math.max(1, Math.floor(require('os').cpus().length / 2)),
showIndexingNotices: false,
showShortName: false,
@@ -314,7 +346,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
CtrlJK: false,
CtrlNP: false,
storeIndexInFile: false,
persistCache: false,
welcomeMessage: '',
} as const

4
src/typings/workers.d.ts vendored Normal file
View File

@@ -0,0 +1,4 @@
/**
 * Ambient declaration for every module whose specifier starts with `web-worker:`.
 * Such a module's default export is typed as a constructor that produces a Worker.
 */
declare module "web-worker:*" {
// Constructor taking a single `options` argument (typed `any`) and returning a Worker
const WorkerFactory: new (options: any) => Worker;
export default WorkerFactory;
}

View File

@@ -10,6 +10,7 @@ import {
regexYaml,
} from './globals'
import { settings } from './settings'
import { createHash, type BinaryLike } from 'crypto'
export function highlighter(str: string): string {
return `<span class="${highlightClass}">${str}</span>`
@@ -172,12 +173,10 @@ export function getCtrlKeyLabel(): 'ctrl' | '⌘' {
return Platform.isMacOS ? '⌘' : 'ctrl'
}
/**
 * Tells whether PDFs may be indexed.
 *
 * @returns the current value of `settings.indexPDFs`
 */
// NOTE(review): other code in this file reads `settings.PDFIndexing` instead — confirm which flag is authoritative.
export function canIndexPDFs(): boolean {
return settings.indexPDFs
}
/**
 * Tells whether the file at `path` should be indexed.
 *
 * @param path file path to test
 * @returns true when the path ends with `.pdf` and PDF indexing is enabled,
 *          or when isFilePlaintext(path) is true
 */
export function isFileIndexable(path: string): boolean {
return (canIndexPDFs() && path.endsWith('.pdf')) || isFilePlaintext(path)
// NOTE(review): the statement below is unreachable because of the return above.
// It performs the same check but reads `settings.PDFIndexing` directly —
// confirm which of the two statements is meant to stay.
return (
(settings.PDFIndexing && path.endsWith('.pdf')) || isFilePlaintext(path)
)
}
export function isFilePlaintext(path: string): boolean {
@@ -194,6 +193,7 @@ export function getExtension(path: string): string {
}
export function showWelcomeNotice(plugin: Plugin) {
return
const code = '1.6.0'
if (settings.welcomeMessage !== code) {
const welcome = new DocumentFragment()
@@ -207,4 +207,8 @@ New beta feature: PDF search 🔎📄
settings.welcomeMessage = code
plugin.saveData(settings)
}
}
/**
 * Computes the MD5 digest of the given data.
 *
 * @param data content to hash (string or binary data accepted by Node's crypto API)
 * @returns the digest as a lowercase hexadecimal string
 */
export function md5(data: BinaryLike): string {
  const hasher = createHash('md5')
  hasher.update(data)
  return hasher.digest('hex')
}