Merge branch 'master' into develop

2024-03-11 10:34:04 +01:00
parent 59162531a1 32233a1468
commit 3ac878f6bd
20 changed files with 1598 additions and 2138 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,72 @@
 # Omnisearch Changelog

+This changelog is not exhaustive.
+
+## 1.21.x
+
+- Added support for .docx and .xlsx
+
+## 1.20.x
+
+- Refactored indexing tokenization process to correctly take diacritics into account
+- Added highlighting in the note's path
+- Improved the selection of the chosen excerpt in the results list
+
+## 1.19.x
+
+- Various bugfixes and improvements
+
+## 1.18.x
+
+- Added a localhost server to use Omnisearch's API from outside Obsidian
+
+## 1.17.x
+
+- Added a shortcut to open files without closing Omnisearch
+- Prefill the search field with selected text
+- Improved highlighting
+
+## 1.16.x
+
+- Various indexing/tokenization improvements
+
+## 1.15.x
+
+- Added support for webp images
+- Configurable fuzziness
+- Added support for DataLoom plugin files
+- Unsupported files are now indexed by their path
+- Unmarked tags are now slightly boosted
+
+## 1.14.x
+
+- Added a `path:` option
+- Bugfixes
+
+## 1.13.x
+
+- CamelCaseWords are now indexed as 3 words
+- Reduced search freezes in some cases
+
+## 1.12.x
+
+- You can filter files by their extension
+- Refreshed UI
+- New API functions
+- Fixed some tokenization issues
+
+## 1.10.x - 1.11.x
+
+- Added support for Text Extractor; Omnisearch no longer extracts text itself
+- Added canvas indexing
+- Improved tags indexing
+
+## 1.9.x
+
+- PDFs are no longer indexed on mobile 
+- Performance improvements
+- Various bugfixes
+
 ## 1.8.x

 - Added OCR for images
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ You can check the [CHANGELOG](./CHANGELOG.md) for more information on the differ
    its filename, and its headings
 - Keyboard first: you never have to use your mouse
 - Workflow similar to the "Quick Switcher" core plugin
+- Opt-in local HTTP server to query Omnisearch from outside of Obsidian
 - Resistance to typos
 - Switch between Vault and In-file search to quickly skim multiple results in a single note
 - Supports `"expressions in quotes"` and `-exclusions`
--- a/manifest-beta.json
+++ b/manifest-beta.json
@@ -1,7 +1,7 @@
 {
 	"id": "omnisearch",
 	"name": "Omnisearch",
-	"version": "1.20.0-beta.1",
+	"version": "1.21.1",
 	"minAppVersion": "1.3.0",
 	"description": "A search engine that just works",
 	"author": "Simon Cambier",
--- a/manifest.json
+++ b/manifest.json
@@ -1,7 +1,7 @@
 {
  "id": "omnisearch",
  "name": "Omnisearch",
-  "version": "1.19.0",
+  "version": "1.21.1",
  "minAppVersion": "1.3.0",
  "description": "A search engine that just works",
  "author": "Simon Cambier",
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "scambier.obsidian-search",
-  "version": "1.20.0-beta.1",
+  "version": "1.21.1",
  "description": "A search engine for Obsidian",
  "main": "dist/main.js",
  "scripts": {
@@ -14,36 +14,36 @@
  "author": "Simon Cambier",
  "license": "GPL-3",
  "devDependencies": {
-    "@babel/preset-env": "^7.20.2",
-    "@babel/preset-typescript": "^7.18.6",
-    "@testing-library/jest-dom": "^5.16.5",
+    "@babel/preset-env": "^7.23.8",
+    "@babel/preset-typescript": "^7.23.3",
+    "@testing-library/jest-dom": "^5.17.0",
    "@tsconfig/svelte": "^3.0.0",
    "@types/jest": "^27.5.2",
-    "@types/lodash-es": "^4.17.6",
-    "@types/node": "^16.18.7",
-    "@types/pako": "^2.0.0",
+    "@types/lodash-es": "^4.17.12",
+    "@types/node": "^16.18.74",
+    "@types/pako": "^2.0.3",
    "babel-jest": "^27.5.1",
    "builtin-modules": "^3.3.0",
    "esbuild": "0.14.0",
    "esbuild-plugin-copy": "1.3.0",
    "esbuild-svelte": "0.7.1",
    "jest": "^27.5.1",
-    "obsidian": "^1.4.11",
-    "prettier": "^2.8.1",
-    "prettier-plugin-svelte": "^2.8.1",
-    "svelte": "^3.54.0",
-    "svelte-check": "^2.10.2",
+    "obsidian": "1.3.5",
+    "prettier": "^2.8.8",
+    "prettier-plugin-svelte": "^2.10.1",
+    "svelte": "^3.59.2",
+    "svelte-check": "^2.10.3",
    "svelte-jester": "^2.3.2",
    "svelte-preprocess": "^4.10.7",
    "tslib": "2.3.1",
-    "typescript": "^4.9.4",
-    "vite": "^3.2.5"
+    "typescript": "^4.9.5",
+    "vite": "^3.2.8"
  },
  "dependencies": {
    "cancelable-promise": "^4.3.1",
-    "dexie": "^3.2.2",
+    "dexie": "^3.2.4",
    "lodash-es": "4.17.21",
-    "minisearch": "6.0.0-beta.1",
+    "minisearch": "^6.3.0",
    "pure-md5": "^0.1.14",
    "search-query-parser": "^1.6.0"
  },
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/src/cache-manager.ts
+++ b/src/cache-manager.ts
@@ -13,10 +13,13 @@ import {
  isFileFromDataloomPlugin,
  isFileImage,
  isFilePDF,
+  isFileOffice,
  isFilePlaintext,
  isFilenameIndexable,
  logDebug,
  makeMD5,
+  removeDiacritics,
+  stripMarkdownCharacters,
 } from './tools/utils'
 import type { CanvasData } from 'obsidian/canvas'
 import type { AsPlainObject } from 'minisearch'
@@ -104,6 +107,15 @@ async function getAndMapIndexedDocument(
    content = await extractor.extractText(file)
  }

+  // ** Office document **
+  else if (
+    isFileOffice(path) &&
+    settings.officeIndexing &&
+    extractor?.canFileBeExtracted(path)
+  ) {
+    content = await extractor.extractText(file)
+  }
+
  // ** Unsupported files **
  else if (isFilenameIndexable(path)) {
    content = file.path
@@ -143,6 +155,8 @@ async function getAndMapIndexedDocument(
  return {
    basename: file.basename,
    content,
+    /** Content without diacritics and markdown chars */
+    cleanedContent: stripMarkdownCharacters(removeDiacritics(content)),
    path: file.path,
    mtime: file.stat.mtime,

--- a/src/components/ModalInFile.svelte
+++ b/src/components/ModalInFile.svelte
@@ -19,6 +19,7 @@
  import { Query } from 'src/search/query'
  import { openNote } from 'src/tools/notes'
  import { searchEngine } from 'src/search/omnisearch'
+  import { stringsToRegex } from 'src/tools/text-processing'

  export let modal: OmnisearchInFileModal
  export let parent: OmnisearchVaultModal | null = null
@@ -64,10 +65,20 @@

  $: {
    if (note) {
-      const groups = getGroups(note.matches)
-      groupedOffsets = groups.map(group =>
-        Math.round((group.first()!.offset + group.last()!.offset) / 2)
+      let groups = getGroups(note.matches)
+
+      // If there are quotes in the search,
+      // only show groups that contain every quoted expression
+      const exactTerms = query.getExactTerms()
+      if (exactTerms.length) {
+        groups = groups.filter(group =>
+          exactTerms.every(exact =>
+            group.some(match => match.match.includes(exact))
          )
+        )
+      }
+
+      groupedOffsets = groups.map(group => Math.round(group.first()!.offset))
    }
  }

@@ -77,13 +88,12 @@
  function getGroups(matches: SearchMatch[]): SearchMatch[][] {
    const groups: SearchMatch[][] = []
    let lastOffset = -1
-    let count = 0 // TODO: FIXME: this is a hack to avoid infinite loops
-    while (true) {
+    let count = 0 // Avoid infinite loops
+    while (++count < 100) {
      const group = getGroupedMatches(matches, lastOffset, excerptAfter)
      if (!group.length) break
      lastOffset = group.last()!.offset
      groups.push(group)
-      if (++count > 100) break
    }
    return groups
  }
@@ -121,7 +131,9 @@
      if (parent) parent.close()

      // Open (or switch focus to) the note
-      await openNote(note, newTab)
+      const reg = stringsToRegex(note.foundWords)
+      reg.exec(note.content)
+      await openNote(note, reg.lastIndex, newTab)

      // Move cursor to the match
      const view = app.workspace.getActiveViewOfType(MarkdownView)
--- a/src/components/ModalVault.svelte
+++ b/src/components/ModalVault.svelte
@@ -116,7 +116,7 @@
      historySearchIndex = 0
    }
    searchQuery = history[historySearchIndex]
-    refInput?.setInputValue(searchQuery)
+    refInput?.setInputValue(searchQuery ?? '')
  }

  async function nextSearchHistory() {
@@ -125,7 +125,7 @@
      historySearchIndex = history.length ? history.length - 1 : 0
    }
    searchQuery = history[historySearchIndex]
-    refInput?.setInputValue(searchQuery)
+    refInput?.setInputValue(searchQuery ?? '')
  }

  let cancelableQuery: CancelablePromise<ResultNote[]> | null = null
@@ -186,7 +186,8 @@

  function openSearchResult(note: ResultNote, newPane = false) {
    saveCurrentQuery()
-    openNote(note, newPane)
+    const offset = note.matches?.[0]?.offset ?? 0
+    openNote(note, offset, newPane)
  }

  async function onClickCreateNote(_e: MouseEvent) {
--- a/src/globals.ts
+++ b/src/globals.ts
@@ -53,6 +53,7 @@ export type IndexedDocument = {
  mtime: number

  content: string
+  cleanedContent: string
  aliases: string
  tags: string[]
  unmarkedTags: string[]
@@ -120,7 +121,7 @@ export function isCacheEnabled(): boolean {
 }

 export const SEPARATORS =
-  /[|\t\n\r\^= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
+  /[|\t\n\r\^"= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
    .toString()
    .slice(1, -1)
 export const SPACE_OR_PUNCTUATION = new RegExp(`${SEPARATORS}+`, 'u')
--- a/src/main.ts
+++ b/src/main.ts
@@ -14,6 +14,7 @@ import {
 import {
  eventBus,
  EventNames,
+  getTextExtractor,
  indexingStep,
  IndexingStepType,
  isCacheEnabled,
@@ -36,8 +37,8 @@ export default class OmnisearchPlugin extends Plugin {
    this.addSettingTab(new SettingsTab(this))

    if (!Platform.isMobile) {
-      import('./tools/api-server').then(m =>
-        this.apiHttpServer = m.getServer()
+      import('./tools/api-server').then(
+        m => (this.apiHttpServer = m.getServer())
      )
    }

@@ -119,7 +120,7 @@ export default class OmnisearchPlugin extends Plugin {
        })
      )

-      this.executeFirstLaunchTasks()
+      await this.executeFirstLaunchTasks()
      await this.populateIndex()

      if (this.apiHttpServer && settings.httpApiEnabled) {
@@ -128,18 +129,17 @@ export default class OmnisearchPlugin extends Plugin {
    })
  }

-  executeFirstLaunchTasks(): void {
-    const code = '1.10.1'
-    if (settings.welcomeMessage !== code) {
-      // const welcome = new DocumentFragment()
-      // welcome.createSpan({}, span => {
-      //   span.innerHTML = `🔎 Omnisearch now requires the <strong>Text Extractor</strong> plugin to index PDF and images. See Omnisearch settings for more information.`
-      // })
-      // new Notice(welcome, 20_000)
+  async executeFirstLaunchTasks(): Promise<void> {
+    const code = '1.21.0'
+    if (settings.welcomeMessage !== code && getTextExtractor()) {
+      const welcome = new DocumentFragment()
+      welcome.createSpan({}, span => {
+        span.innerHTML = `🔎 Omnisearch can now index .docx and .xlsx documents. Don't forget to update Text Extractor and enable the toggle in Omnisearch settings.`
+      })
+      new Notice(welcome, 20_000)
    }
    settings.welcomeMessage = code
-
-    this.saveData(settings)
+    await this.saveData(settings)
  }

  async onunload(): Promise<void> {
--- a/src/search/omnisearch.ts
+++ b/src/search/omnisearch.ts
@@ -1,59 +1,18 @@
 import MiniSearch, { type Options, type SearchResult } from 'minisearch'
 import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
-import {
-  BRACKETS_AND_SPACE,
-  chsRegex,
-  getChsSegmenter,
-  SPACE_OR_PUNCTUATION,
-} from '../globals'
+
 import { settings } from '../settings'
-import {
-  chunkArray,
-  logDebug,
-  removeDiacritics,
-  splitCamelCase,
-  splitHyphens,
-  stripMarkdownCharacters,
-} from '../tools/utils'
+import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
 import { Notice } from 'obsidian'
 import type { Query } from './query'
 import { cacheManager } from '../cache-manager'
 import { sortBy } from 'lodash-es'
 import { getMatches, stringsToRegex } from 'src/tools/text-processing'
-
-const tokenize = (text: string): string[] => {
-  const words = text.split(BRACKETS_AND_SPACE)
-
-  let tokens = text.split(SPACE_OR_PUNCTUATION)
-
-  // Split hyphenated tokens
-  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
-
-  // Split camelCase tokens into "camel" and "case
-  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-
-  // Add whole words (aka "not tokens")
-  tokens = [...tokens, ...words]
-
-  // When enabled, we only use the chsSegmenter,
-  // and not the other custom tokenizers
-  const chsSegmenter = getChsSegmenter()
-  if (chsSegmenter) {
-    const chs = tokens.flatMap(word =>
-      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
-    )
-    tokens = [...tokens, ...chs]
-  }
-
-  // Remove duplicates
-  tokens = [...new Set(tokens)]
-
-  return tokens
-}
+import { tokenizeForIndexing, tokenizeForSearch } from './tokenizer'

 export class Omnisearch {
  public static readonly options: Options<IndexedDocument> = {
-    tokenize,
+    tokenize: tokenizeForIndexing,
    extractField: (doc, fieldName) => {
      if (fieldName === 'directory') {
        // return path without the filename
@@ -87,6 +46,7 @@ export class Omnisearch {
    },
  }
  private minisearch: MiniSearch
+  /** Map<path, mtime> */
  private indexedDocuments: Map<string, number> = new Map()
  // private previousResults: SearchResult[] = []
  // private previousQuery: Query | null = null
@@ -212,14 +172,15 @@ export class Omnisearch {
        break
    }

-    let results = this.minisearch.search(query.segmentsToStr(), {
+    const searchTokens = tokenizeForSearch(query.segmentsToStr())
+    logDebug(JSON.stringify(searchTokens, null, 1))
+    let results = this.minisearch.search(searchTokens, {
      prefix: term => term.length >= options.prefixLength,
      // length <= 3: no fuzziness
      // length <= 5: fuzziness of 10%
      // length > 5: fuzziness of 20%
      fuzzy: term =>
        term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
-      combineWith: 'AND',
      boost: {
        basename: settings.weightBasename,
        directory: settings.weightDirectory,
@@ -321,10 +282,10 @@ export class Omnisearch {
      results = results.filter(r => {
        const document = documents.find(d => d.path === r.id)
        const title = document?.path.toLowerCase() ?? ''
-        const content = stripMarkdownCharacters(
-          document?.content ?? ''
-        ).toLowerCase()
-        return exactTerms.every(q => content.includes(q) || title.includes(q))
+        const content = (document?.cleanedContent ?? '').toLowerCase()
+        return exactTerms.every(
+          q => content.includes(q) || removeDiacritics(title).includes(q)
+        )
      })
    }

@@ -333,7 +294,7 @@ export class Omnisearch {
    if (exclusions.length) {
      logDebug('Filtering with exclusions')
      results = results.filter(r => {
-        const content = stripMarkdownCharacters(
+        const content = (
          documents.find(d => d.path === r.id)?.content ?? ''
        ).toLowerCase()
        return exclusions.every(q => !content.includes(q))
@@ -402,7 +363,7 @@ export class Omnisearch {
      const foundWords = [
        // Matching terms from the result,
        // do not necessarily match the query
-        ...Object.keys(result.match),
+        ...result.terms,

        // Quoted expressions
        ...query.getExactTerms(),
--- a/src/search/query.ts
+++ b/src/search/query.ts
@@ -46,6 +46,15 @@ export class Query {
    }
    this.query = parsed

+    // Extract keywords starting with a dot...
+    const ext = this.query.text
+      .filter(o => o.startsWith('.'))
+      .map(o => o.slice(1))
+    // add them to the ext field...
+    this.query.ext = [...new Set([...ext, ...(this.query.ext ?? [])])]
+    // and remove them from the text field
+    this.query.text = this.query.text.filter(o => !o.startsWith('.'))
+
    // Get strings in quotes, and remove the quotes
    this.#inQuotes =
      text.match(/"([^"]+)"/g)?.map(o => o.replace(/"/g, '')) ?? []
@@ -89,4 +98,13 @@ export class Query {
      ),
    ]
  }
+
+  /** Longest quoted expression if any, otherwise the segmentsToStr() query. */
+  public getBestStringForExcerpt(): string {
+    // Sort a copy: Array.prototype.sort is in-place and would reorder #inQuotes
+    const [longest] = [...this.#inQuotes].sort((a, b) => b.length - a.length)
+    if (longest !== undefined) return longest
+    // Otherwise, just return the query as is
+    return this.segmentsToStr()
+  }
 }
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -0,0 +1,79 @@
+import type { QueryCombination } from 'minisearch'
+import {
+  BRACKETS_AND_SPACE,
+  SPACE_OR_PUNCTUATION,
+  chsRegex,
+  getChsSegmenter,
+} from 'src/globals'
+import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+
+function tokenizeWords(text: string): string[] {
+  return text.split(BRACKETS_AND_SPACE)
+}
+
+function tokenizeTokens(text: string): string[] {
+  return text.split(SPACE_OR_PUNCTUATION)
+}
+
+/**
+ * Tokenization for indexing will possibly return more tokens than the original text.
+ * This is because we combine different methods of tokenization to get the best results.
+ * @param text The raw text to split into tokens
+ * @returns The deduplicated tokens produced by all the tokenization methods
+ */
+export function tokenizeForIndexing(text: string): string[] {
+  const words = tokenizeWords(text)
+
+  let tokens = tokenizeTokens(text)
+
+  // Split hyphenated tokens
+  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
+
+  // Split camelCase tokens into "camel" and "case"
+  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+
+  // Add whole words (aka "not tokens")
+  tokens = [...tokens, ...words]
+
+  const chsSegmenter = getChsSegmenter()
+  if (chsSegmenter) {
+    const chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+    tokens = [...tokens, ...chs]
+  }
+
+  // Remove duplicates
+  tokens = [...new Set(tokens)]
+
+  return tokens
+}
+
+/**
+ * Search tokenization will use the same tokenization methods as indexing.
+ * Each method's tokens are ANDed together, and the groups are combined with "OR"
+ * @param text The search string to tokenize
+ * @returns A MiniSearch query combination with one "AND" group per tokenization method
+ */
+export function tokenizeForSearch(text: string): QueryCombination {
+  const tokens = tokenizeTokens(text)
+
+  let chs: string[] = []
+  const chsSegmenter = getChsSegmenter()
+  if (chsSegmenter) {
+    chs = tokens.flatMap(word =>
+      chsRegex.test(word) ? chsSegmenter.cut(word) : [word]
+    )
+  }
+
+  return {
+    combineWith: 'OR',
+    queries: [
+      { combineWith: 'AND', queries: tokens },
+      { combineWith: 'AND', queries: tokenizeWords(text) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      { combineWith: 'AND', queries: chs },
+    ],
+  }
+}
--- a/src/settings.ts
+++ b/src/settings.ts
@@ -37,6 +37,9 @@ export interface OmnisearchSettings extends WeightingSettings {
  PDFIndexing: boolean
  /** Enable Images indexing */
  imagesIndexing: boolean
+  /** Enable Office documents indexing */
+  officeIndexing: boolean
+
  /** Enable indexing of unknown files */
  unsupportedFilesIndexing: 'yes' | 'no' | 'default'
  /** Activate the small 🔍 button on Obsidian's ribbon */
@@ -99,7 +102,7 @@ export class SettingsTab extends PluginSettingTab {
    // Sponsor link - Thank you!
    const divSponsor = containerEl.createDiv()
    divSponsor.innerHTML = `
-        <iframe src="https://github.com/sponsors/scambier/button" title="Sponsor scambier" height="35" width="116" style="border: 0;"></iframe>
+        <iframe sandbox="allow-top-navigation-by-user-activation" src="https://github.com/sponsors/scambier/button" title="Sponsor scambier" height="35" width="116" style="border: 0;"></iframe>
        <a href='https://ko-fi.com/B0B6LQ2C' target='_blank'><img height='36' style='border:0px;height:36px;' src='https://cdn.ko-fi.com/cdn/kofi2.png?v=3' border='0' alt='Buy Me a Coffee at ko-fi.com' /></a> 
    `

@@ -158,11 +161,30 @@ export class SettingsTab extends PluginSettingTab {
      )
      .setDisabled(!getTextExtractor())

+    // Office Documents Indexing
+    const indexOfficesDesc = new DocumentFragment()
+    indexOfficesDesc.createSpan({}, span => {
+      span.innerHTML = `Omnisearch will use Text Extractor to index the content of your office documents (currently <pre style="display:inline">.docx</pre> and <pre style="display:inline">.xlsx</pre>)`
+    })
+    new Setting(containerEl)
+      .setName(
+        `Documents content indexing ${getTextExtractor() ? '' : '⚠️ Disabled'}`
+      )
+      .setDesc(indexOfficesDesc)
+      .addToggle(toggle =>
+        toggle.setValue(settings.officeIndexing).onChange(async v => {
+          await database.clearCache()
+          settings.officeIndexing = v
+          await saveSettings(this.plugin)
+        })
+      )
+      .setDisabled(!getTextExtractor())
+
    // Index filenames of unsupported files
    const indexUnsupportedDesc = new DocumentFragment()
    indexUnsupportedDesc.createSpan({}, span => {
      span.innerHTML = `
-      Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>, <pre style="display:inline">.xlsx</pre>, 
+      Omnisearch can index file<strong>names</strong> of "unsupported" files, such as e.g. <pre style="display:inline">.mp4</pre>
      or non-extracted PDFs & images.<br/>
      "Obsidian setting" will respect the value of "Files & Links > Detect all file extensions"`
    })
@@ -185,7 +207,7 @@ export class SettingsTab extends PluginSettingTab {
    indexedFileTypesDesc.createSpan({}, span => {
      span.innerHTML = `In addition to standard <code>md</code> files, Omnisearch can also index other <strong style="color: var(--text-accent)">PLAINTEXT</strong> files.<br/>
      Add extensions separated by a space, without the dot. Example: "<code>txt org csv</code>".<br />
-      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .docx or .pptx) WILL cause crashes,
+      ⚠️ <span style="color: var(--text-accent)">Using extensions of non-plaintext files (like .pptx) WILL cause crashes,
      because Omnisearch will try to index their content.</span>`
    })
    new Setting(containerEl)
@@ -444,24 +466,6 @@ export class SettingsTab extends PluginSettingTab {

    //#endregion Results Weighting

-    //#region Debugging
-
-    new Setting(containerEl).setName('Debugging').setHeading()
-
-    new Setting(containerEl)
-      .setName('Enable verbose logging')
-      .setDesc(
-        "Adds a LOT of logs for debugging purposes. Don't forget to disable it."
-      )
-      .addToggle(toggle =>
-        toggle.setValue(settings.verboseLogging).onChange(async v => {
-          settings.verboseLogging = v
-          await saveSettings(this.plugin)
-        })
-      )
-
-    //#endregion Debugging
-
    //#region HTTP Server

    if (!Platform.isMobile) {
@@ -521,6 +525,24 @@ export class SettingsTab extends PluginSettingTab {

    //#endregion HTTP Server

+    //#region Debugging
+
+    new Setting(containerEl).setName('Debugging').setHeading()
+
+    new Setting(containerEl)
+      .setName('Enable verbose logging')
+      .setDesc(
+        "Adds a LOT of logs for debugging purposes. Don't forget to disable it."
+      )
+      .addToggle(toggle =>
+        toggle.setValue(settings.verboseLogging).onChange(async v => {
+          settings.verboseLogging = v
+          await saveSettings(this.plugin)
+        })
+      )
+
+    //#endregion Debugging
+
    //#region Danger Zone
    new Setting(containerEl).setName('Danger Zone').setHeading()

@@ -602,6 +624,7 @@ export const DEFAULT_SETTINGS: OmnisearchSettings = {
  ignoreDiacritics: true,
  indexedFileTypes: [] as string[],
  PDFIndexing: false,
+  officeIndexing: false,
  imagesIndexing: false,
  unsupportedFilesIndexing: 'no',
  splitCamelCase: false,
--- a/src/tools/api-server.ts
+++ b/src/tools/api-server.ts
@@ -63,7 +63,7 @@ export function getServer() {
    close() {
      server.close()
      console.log(`Omnisearch - Terminated HTTP server`)
-      if (settings.httpApiNotice) {
+      if (settings.httpApiEnabled && settings.httpApiNotice) {
        new Notice(`Omnisearch - Terminated HTTP server`)
      }
    },
--- a/src/tools/notes.ts
+++ b/src/tools/notes.ts
@@ -4,12 +4,9 @@ import { stringsToRegex } from './text-processing'

 export async function openNote(
  item: ResultNote,
+  offset = 0,
  newPane = false
 ): Promise<void> {
-  const reg = stringsToRegex(item.foundWords)
-  reg.exec(item.content)
-  const offset = reg.lastIndex
-
  // Check if the note is already open,
  // to avoid opening it twice if the first one is pinned
  let alreadyOpenAndPinned = false
--- a/src/tools/text-processing.ts
+++ b/src/tools/text-processing.ts
@@ -14,13 +14,6 @@ import type { Query } from 'src/search/query'
 import { Notice } from 'obsidian'
 import { escapeRegExp } from 'lodash-es'

-export function highlighterGroups(_substring: string, ...args: any[]) {
-  // args[0] is the single char preceding args[1], which is the word we want to highlight
-  if (!!args[1].trim())
-    return `<span>${args[0]}</span><span class="${highlightClass}">${args[1]}</span>`
-  return '&lt;no content&gt;'
-}
-
 /**
 * Wraps the matches in the text with a <span> element and a highlight class
 * @param text
@@ -134,18 +127,18 @@ export function getMatches(
      .substring(matchStartIndex, matchEndIndex)
      .trim()
    if (originalMatch && match.index >= 0) {
-      matches.push({ match: originalMatch, offset: match.index + 1 })
+      matches.push({ match: originalMatch, offset: match.index })
    }
  }

  // If the query is more than 1 token and can be found "as is" in the text, put this match first
-  if (query && query.query.text.length > 1) {
-    const best = text.indexOf(query.segmentsToStr())
+  if (query && (query.query.text.length > 1 || query.getExactTerms().length > 0)) {
+    const best = text.indexOf(query.getBestStringForExcerpt())
    if (best > -1 && matches.find(m => m.offset === best)) {
      matches = matches.filter(m => m.offset !== best)
      matches.unshift({
        offset: best,
-        match: query.segmentsToStr(),
+        match: query.getBestStringForExcerpt(),
      })
    }
  }
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -9,10 +9,6 @@ import { canIndexUnsupportedFiles, settings } from '../settings'
 import { type BinaryLike, createHash } from 'crypto'
 import { md5 } from 'pure-md5'

-// export function highlighter(str: string): string {
-//   return `<span class="${highlightClass}">${str}</span>`
-// }
-
 export function pathWithoutFilename(path: string): string {
  const split = path.split('/')
  split.pop()
@@ -174,6 +170,11 @@ export function isFilePDF(path: string): boolean {
  return getExtension(path) === 'pdf'
 }

+export function isFileOffice(path: string): boolean {
+  const ext = getExtension(path)
+  return ext === 'docx' || ext === 'xlsx'
+}
+
 export function isFilePlaintext(path: string): boolean {
  return [...settings.indexedFileTypes, 'md'].some(t => path.endsWith(`.${t}`))
 }
--- a/versions.json
+++ b/versions.json
@@ -122,5 +122,12 @@
 	"1.18.1": "1.3.0",
 	"1.19.0-beta.1": "1.3.0",
 	"1.19.0": "1.3.0",
-	"1.20.0-beta.1": "1.3.0"
+	"1.20.0-beta.1": "1.3.0",
+	"1.20.0": "1.3.0",
+	"1.20.1": "1.3.0",
+	"1.20.2": "1.3.0",
+	"1.20.3": "1.3.0",
+	"1.20.4": "1.3.0",
+	"1.21.0": "1.3.0",
+	"1.21.1": "1.3.0"
 }