Index office documents (#340)

👌
This commit is contained in:
demig00d
2024-01-20 14:01:43 +03:00
committed by GitHub
parent b86bbf8407
commit e618d4ca47
3 changed files with 40 additions and 3 deletions

View File

@@ -13,6 +13,7 @@ import {
isFileFromDataloomPlugin, isFileFromDataloomPlugin,
isFileImage, isFileImage,
isFilePDF, isFilePDF,
isFileOffice,
isFilePlaintext, isFilePlaintext,
isFilenameIndexable, isFilenameIndexable,
logDebug, logDebug,
@@ -106,6 +107,15 @@ async function getAndMapIndexedDocument(
content = await extractor.extractText(file) content = await extractor.extractText(file)
} }
// ** Office document **
else if (
isFileOffice(path) &&
settings.officeIndexing &&
extractor?.canFileBeExtracted(path)
) {
content = await extractor.extractText(file)
}
// ** Unsupported files ** // ** Unsupported files **
else if (isFilenameIndexable(path)) { else if (isFilenameIndexable(path)) {
content = file.path content = file.path

View File

@@ -37,6 +37,8 @@ export interface OmnisearchSettings extends WeightingSettings {
PDFIndexing: boolean PDFIndexing: boolean
/** Enable Images indexing */ /** Enable Images indexing */
imagesIndexing: boolean imagesIndexing: boolean
/** Enable Office documents indexing */
officeIndexing: boolean
/** Enable Excalidraw indexing */ /** Enable Excalidraw indexing */
excalidrawIndexing: boolean excalidrawIndexing: boolean
/** Enable indexing of unknown files */ /** Enable indexing of unknown files */
@@ -160,11 +162,30 @@ export class SettingsTab extends PluginSettingTab {
) )
.setDisabled(!getTextExtractor()) .setDisabled(!getTextExtractor())
// Office Documents Indexing
const indexOfficesDesc = new DocumentFragment()
indexOfficesDesc.createSpan({}, span => {
span.innerHTML = `Omnisearch will use Text Extractor to index the content of your office documents (currently <pre style="display:inline">.docx</pre> and <pre style="display:inline">.xlsx</pre>)`
})
new Setting(containerEl)
.setName(
`Documents content indexing ${getTextExtractor() ? '' : '⚠️ Disabled'}`
)
.setDesc(indexOfficesDesc)
.addToggle(toggle =>
toggle.setValue(settings.officeIndexing).onChange(async v => {
await database.clearCache()
settings.officeIndexing = v
await saveSettings(this.plugin)
})
)
.setDisabled(!getTextExtractor())
// Index filenames of unsupported files // Index filenames of unsupported files
const indexUnsupportedDesc = new DocumentFragment() const indexUnsupportedDesc = new DocumentFragment()
indexUnsupportedDesc.createSpan({}, span => { indexUnsupportedDesc.createSpan({}, span => {
span.innerHTML = ` span.innerHTML = `
Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>, <pre style="display:inline">.xlsx</pre>, Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>
or non-extracted PDFs & images.<br/> or non-extracted PDFs & images.<br/>
"Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"` "Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"`
}) })
@@ -177,7 +198,7 @@ export class SettingsTab extends PluginSettingTab {
.setValue(settings.unsupportedFilesIndexing) .setValue(settings.unsupportedFilesIndexing)
.onChange(async v => { .onChange(async v => {
await database.clearCache() await database.clearCache()
;(settings.unsupportedFilesIndexing as any) = v ; (settings.unsupportedFilesIndexing as any) = v
await saveSettings(this.plugin) await saveSettings(this.plugin)
}) })
}) })
@@ -187,7 +208,7 @@ export class SettingsTab extends PluginSettingTab {
indexedFileTypesDesc.createSpan({}, span => { indexedFileTypesDesc.createSpan({}, span => {
span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other <strong style="color: var(--text-accent)">PLAINTEXT</strong> files.<br/> span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other <strong style="color: var(--text-accent)">PLAINTEXT</strong> files.<br/>
Add extensions separated by a space, without the dot. Example: "<code>txt org csv</code>".<br /> Add extensions separated by a space, without the dot. Example: "<code>txt org csv</code>".<br />
⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .docx or .pptx) WILL cause crashes, ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .pptx) WILL cause crashes,
because Omnisearch will try to index their content.</span>` because Omnisearch will try to index their content.</span>`
}) })
new Setting(containerEl) new Setting(containerEl)
@@ -604,6 +625,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
ignoreDiacritics: true, ignoreDiacritics: true,
indexedFileTypes: [] as string[], indexedFileTypes: [] as string[],
PDFIndexing: false, PDFIndexing: false,
officeIndexing: false,
imagesIndexing: false, imagesIndexing: false,
excalidrawIndexing: true, excalidrawIndexing: true,
unsupportedFilesIndexing: 'no', unsupportedFilesIndexing: 'no',

View File

@@ -174,6 +174,11 @@ export function isFilePDF(path: string): boolean {
return getExtension(path) === 'pdf' return getExtension(path) === 'pdf'
} }
/**
 * Tells whether `path` refers to an office document, judged solely by the
 * extension that `getExtension` reports for it.
 *
 * Only the `docx` and `xlsx` extensions are recognised.
 *
 * @param path - Path of the file to classify.
 * @returns `true` when the extension is exactly `docx` or `xlsx`, `false` otherwise.
 */
export function isFileOffice(path: string): boolean {
  // `switch` compares with strict equality, same as a chain of `===` checks.
  switch (getExtension(path)) {
    case 'docx':
    case 'xlsx':
      return true
    default:
      return false
  }
}
export function isFilePlaintext(path: string): boolean { export function isFilePlaintext(path: string): boolean {
return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`)) return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`))
} }