From e618d4ca47590106fb3be1bd060305b7055f3ddb Mon Sep 17 00:00:00 2001
From: demig00d <28487425+demig00d@users.noreply.github.com>
Date: Sat, 20 Jan 2024 14:01:43 +0300
Subject: [PATCH] Index office documents (#340)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

👌
---
 src/cache-manager.ts | 10 ++++++++++
 src/settings.ts      | 28 +++++++++++++++++++++++++---
 src/tools/utils.ts   |  5 +++++
 3 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/src/cache-manager.ts b/src/cache-manager.ts
index 26c22f7..372db26 100644
--- a/src/cache-manager.ts
+++ b/src/cache-manager.ts
@@ -13,6 +13,7 @@ import {
   isFileFromDataloomPlugin,
   isFileImage,
   isFilePDF,
+  isFileOffice,
   isFilePlaintext,
   isFilenameIndexable,
   logDebug,
@@ -106,6 +107,15 @@ async function getAndMapIndexedDocument(
     content = await extractor.extractText(file)
   }
 
+  // ** Office document **
+  else if (
+    isFileOffice(path) &&
+    settings.officeIndexing &&
+    extractor?.canFileBeExtracted(path)
+  ) {
+    content = await extractor.extractText(file)
+  }
+
   // ** Unsupported files **
   else if (isFilenameIndexable(path)) {
     content = file.path
diff --git a/src/settings.ts b/src/settings.ts
index 8116504..5c103f5 100644
--- a/src/settings.ts
+++ b/src/settings.ts
@@ -37,6 +37,8 @@ export interface OmnisearchSettings extends WeightingSettings {
   PDFIndexing: boolean
   /** Enable Images indexing */
   imagesIndexing: boolean
+  /** Enable Office documents indexing */
+  officeIndexing: boolean
   /** Enable Excalidraw indexing */
   excalidrawIndexing: boolean
   /** Enable indexing of unknown files */
@@ -160,11 +162,30 @@ export class SettingsTab extends PluginSettingTab {
       )
       .setDisabled(!getTextExtractor())
 
+    // Office Documents Indexing
+    const indexOfficesDesc = new DocumentFragment()
+    indexOfficesDesc.createSpan({}, span => {
+      span.innerHTML = `Omnisearch will use Text Extractor to index the content of your office documents (currently <pre style="display:inline">.docx</pre> and <pre style="display:inline">.xlsx</pre>)`
+    })
+    new Setting(containerEl)
+      .setName(
+        `Documents content indexing ${getTextExtractor() ? '' : '⚠️ Disabled'}`
+      )
+      .setDesc(indexOfficesDesc)
+      .addToggle(toggle =>
+        toggle.setValue(settings.officeIndexing).onChange(async v => {
+          await database.clearCache()
+          settings.officeIndexing = v
+          await saveSettings(this.plugin)
+        })
+      )
+      .setDisabled(!getTextExtractor())
+
     // Index filenames of unsupported files
     const indexUnsupportedDesc = new DocumentFragment()
     indexUnsupportedDesc.createSpan({}, span => {
       span.innerHTML = `
-      Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>, <pre style="display:inline">.xlsx</pre>, 
+      Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>
       or non-extracted PDFs & images.<br/>
       "Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"`
     })
@@ -177,7 +198,7 @@ export class SettingsTab extends PluginSettingTab {
           .setValue(settings.unsupportedFilesIndexing)
           .onChange(async v => {
             await database.clearCache()
-            ;(settings.unsupportedFilesIndexing as any) = v
+            ;(settings.unsupportedFilesIndexing as any) = v
             await saveSettings(this.plugin)
           })
       })
@@ -187,7 +208,7 @@ export class SettingsTab extends PluginSettingTab {
     indexedFileTypesDesc.createSpan({}, span => {
       span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other <strong style="color: var(--text-accent)">PLAINTEXT</strong> files.<br/>
       Add extensions separated by a space, without the dot. Example: "<code>txt org csv</code>".<br />
-      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .docx or .pptx) WILL cause crashes,
+      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .pptx) WILL cause crashes,
       because Omnisearch will try to index their content.</span>`
     })
     new Setting(containerEl)
@@ -604,6 +625,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
   ignoreDiacritics: true,
   indexedFileTypes: [] as string[],
   PDFIndexing: false,
+  officeIndexing: false,
   imagesIndexing: false,
   excalidrawIndexing: true,
   unsupportedFilesIndexing: 'no',
diff --git a/src/tools/utils.ts b/src/tools/utils.ts
index 79c9767..a24e488 100644
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -174,6 +174,11 @@ export function isFilePDF(path: string): boolean {
   return getExtension(path) === 'pdf'
 }
 
+/** Whether `path` has an office-document extension (`docx` or `xlsx`). */
+export function isFileOffice(path: string): boolean {
+  return ['docx', 'xlsx'].includes(getExtension(path))
+}
+
 export function isFilePlaintext(path: string): boolean {
   return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`))
 }