diff --git a/src/cache-manager.ts b/src/cache-manager.ts index 26c22f7..372db26 100644 --- a/src/cache-manager.ts +++ b/src/cache-manager.ts @@ -13,6 +13,7 @@ import { isFileFromDataloomPlugin, isFileImage, isFilePDF, + isFileOffice, isFilePlaintext, isFilenameIndexable, logDebug, @@ -106,6 +107,15 @@ async function getAndMapIndexedDocument( content = await extractor.extractText(file) } + // ** Office document ** + else if ( + isFileOffice(path) && + settings.officeIndexing && + extractor?.canFileBeExtracted(path) + ) { + content = await extractor.extractText(file) + } + // ** Unsupported files ** else if (isFilenameIndexable(path)) { content = file.path diff --git a/src/settings.ts b/src/settings.ts index 8116504..5c103f5 100644 --- a/src/settings.ts +++ b/src/settings.ts @@ -37,6 +37,8 @@ export interface OmnisearchSettings extends WeightingSettings { PDFIndexing: boolean /** Enable Images indexing */ imagesIndexing: boolean + /** Enable Office documents indexing */ + officeIndexing: boolean /** Enable Excalidraw indexing */ excalidrawIndexing: boolean /** Enable indexing of unknown files */ @@ -160,11 +162,30 @@ export class SettingsTab extends PluginSettingTab { ) .setDisabled(!getTextExtractor()) + // Office Documents Indexing + const indexOfficesDesc = new DocumentFragment() + indexOfficesDesc.createSpan({}, span => { + span.innerHTML = `Omnisearch will use Text Extractor to index the content of your office documents (currently
.docx
and
.xlsx
)` + }) + new Setting(containerEl) + .setName( + `Documents content indexing ${getTextExtractor() ? '' : '⚠️ Disabled'}` + ) + .setDesc(indexOfficesDesc) + .addToggle(toggle => + toggle.setValue(settings.officeIndexing).onChange(async v => { + await database.clearCache() + settings.officeIndexing = v + await saveSettings(this.plugin) + }) + ) + .setDisabled(!getTextExtractor()) + // Index filenames of unsupported files const indexUnsupportedDesc = new DocumentFragment() indexUnsupportedDesc.createSpan({}, span => { span.innerHTML = ` - Omnisearch can index filenames of "unsupported" files, such as e.g.
.mp4
,
.xlsx
, + Omnisearch can index filenames of "unsupported" files, such as e.g.
.mp4
or non-extracted PDFs & images.
"Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"` }) @@ -177,7 +198,7 @@ export class SettingsTab extends PluginSettingTab { .setValue(settings.unsupportedFilesIndexing) .onChange(async v => { await database.clearCache() - ;(settings.unsupportedFilesIndexing as any) = v + ; (settings.unsupportedFilesIndexing as any) = v await saveSettings(this.plugin) }) }) @@ -187,7 +208,7 @@ export class SettingsTab extends PluginSettingTab { indexedFileTypesDesc.createSpan({}, span => { span.innerHTML = `In addition to standard md files, Omnisearch can also index other PLAINTEXT files.
Add extensions separated by a space, without the dot. Example: "txt org csv".
- ⚠️ Using extensions of non-plaintext files (like .docx or .pptx) WILL cause crashes, + ⚠️ Using extensions of non-plaintext files (like .pptx) WILL cause crashes, because Omnisearch will try to index their content.` }) new Setting(containerEl) @@ -604,6 +625,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = { ignoreDiacritics: true, indexedFileTypes: [] as string[], PDFIndexing: false, + officeIndexing: false, imagesIndexing: false, excalidrawIndexing: true, unsupportedFilesIndexing: 'no', diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 79c9767..a24e488 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -174,6 +174,11 @@ export function isFilePDF(path: string): boolean { return getExtension(path) === 'pdf' } +export function isFileOffice(path: string): boolean { + const ext = getExtension(path) + return ext === 'docx' || ext === 'xlsx' +} + export function isFilePlaintext(path: string): boolean { return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`)) }