Massive refactor to get rid of the global app.

Author: Simon Cambier
Date: 2024-05-25 22:49:50 +02:00
parent bbe7b112ed
commit 6566a2e958
24 changed files with 1532 additions and 1146 deletions
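In short: `Omnisearch` and its helpers stop reaching for module-level globals (`getObsidianApp()`, `getSettings()`, the shared `cacheManager`) and instead receive the `OmnisearchPlugin` instance through their constructors. A minimal sketch of the pattern, assuming only what the diff below shows (the `SearchEngine` name echoes the TODO in the code; the method body is illustrative, not from the commit):

import type OmnisearchPlugin from './main'

// Dependency injection instead of a global singleton: everything the
// engine needs (app, settings, cacheManager...) hangs off the plugin.
export class SearchEngine {
  constructor(private plugin: OmnisearchPlugin) {}

  isSimpleSearch(): boolean {
    // Settings are read through the injected plugin, so tests can
    // substitute a stub plugin instead of touching real globals.
    return this.plugin.settings.simpleSearch
  }
}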

View File

@@ -4,79 +4,34 @@ import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'
 import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
 import { Notice } from 'obsidian'
 import type { Query } from './query'
-import { cacheManager } from '../cache-manager'
 import { sortBy } from 'lodash-es'
-import { getMatches, stringsToRegex } from 'src/tools/text-processing'
-import { tokenizeForIndexing, tokenizeForSearch } from './tokenizer'
-import { getObsidianApp } from '../stores/obsidian-app'
-import { getSettings } from 'src/settings'
+import type OmnisearchPlugin from '../main'
+import { Tokenizer } from './tokenizer'

 // TODO: rename to SearchEngine
 export class Omnisearch {
-  private static instance: Omnisearch
-  app = getObsidianApp()
-  settings = getSettings()
-
-  public static getInstance(): Omnisearch {
-    if (!Omnisearch.instance) {
-      Omnisearch.instance = new Omnisearch();
-    }
-    return Omnisearch.instance;
-  }
-
-  public static readonly options: Options<IndexedDocument> = {
-    tokenize: tokenizeForIndexing,
-    extractField: (doc, fieldName) => {
-      if (fieldName === 'directory') {
-        // return path without the filename
-        const parts = doc.path.split('/')
-        parts.pop()
-        return parts.join('/')
-      }
-      return (doc as any)[fieldName]
-    },
-    processTerm: (term: string) =>
-      (getSettings().ignoreDiacritics ? removeDiacritics(term) : term).toLowerCase(),
-    idField: 'path',
-    fields: [
-      'basename',
-      // Different from `path`, since `path` is the unique index and needs to include the filename
-      'directory',
-      'aliases',
-      'content',
-      'headings1',
-      'headings2',
-      'headings3',
-    ],
-    storeFields: ['tags'],
-    logger(_level, _message, code) {
-      if (code === 'version_conflict') {
-        new Notice(
-          'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
-          5000
-        )
-      }
-    },
-  }
-
+  private tokenizer: Tokenizer
   private minisearch: MiniSearch
   /** Map<path, mtime> */
   private indexedDocuments: Map<string, number> = new Map()
   // private previousResults: SearchResult[] = []
   // private previousQuery: Query | null = null

-  private constructor() {
-    this.minisearch = new MiniSearch(Omnisearch.options)
+  constructor(protected plugin: OmnisearchPlugin) {
+    this.tokenizer = new Tokenizer(plugin)
+    this.minisearch = new MiniSearch(this.getOptions())
   }

   /**
    * Return true if the cache is valid
    */
   async loadCache(): Promise<boolean> {
-    const cache = await cacheManager.getMinisearchCache()
+    const cache = await this.plugin.cacheManager.getMinisearchCache()
     if (cache) {
-      this.minisearch = await MiniSearch.loadJSAsync(cache.data, Omnisearch.options)
+      this.minisearch = await MiniSearch.loadJSAsync(
+        cache.data,
+        this.getOptions()
+      )
       this.indexedDocuments = new Map(cache.paths.map(o => [o.path, o.mtime]))
       return true
     }
@@ -117,7 +72,9 @@ export class Omnisearch {
     logDebug('Adding files', paths)
     let documents = (
       await Promise.all(
-        paths.map(async path => await cacheManager.getDocument(path))
+        paths.map(
+          async path => await this.plugin.cacheManager.getDocument(path)
+        )
       )
     ).filter(d => !!d?.path)
     logDebug('Sorting documents to first index markdown')
@@ -164,6 +121,7 @@ export class Omnisearch {
     query: Query,
     options: { prefixLength: number; singleFilePath?: string }
   ): Promise<SearchResult[]> {
+    const settings = this.plugin.settings
     if (query.isEmpty()) {
       // this.previousResults = []
       // this.previousQuery = null
@@ -174,7 +132,7 @@ export class Omnisearch {
     logDebug('Starting search for', query)

     let fuzziness: number
-    switch (this.settings.fuzziness) {
+    switch (settings.fuzziness) {
       case '0':
         fuzziness = 0
         break
@@ -186,7 +144,7 @@ export class Omnisearch {
         break
     }

-    const searchTokens = tokenizeForSearch(query.segmentsToStr())
+    const searchTokens = this.tokenizer.tokenizeForSearch(query.segmentsToStr())
     logDebug(JSON.stringify(searchTokens, null, 1))
     let results = this.minisearch.search(searchTokens, {
       prefix: term => term.length >= options.prefixLength,
@@ -196,14 +154,14 @@ export class Omnisearch {
       fuzzy: term =>
         term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
       boost: {
-        basename: this.settings.weightBasename,
-        directory: this.settings.weightDirectory,
-        aliases: this.settings.weightBasename,
-        headings1: this.settings.weightH1,
-        headings2: this.settings.weightH2,
-        headings3: this.settings.weightH3,
-        tags: this.settings.weightUnmarkedTags,
-        unmarkedTags: this.settings.weightUnmarkedTags,
+        basename: settings.weightBasename,
+        directory: settings.weightDirectory,
+        aliases: settings.weightBasename,
+        headings1: settings.weightH1,
+        headings2: settings.weightH2,
+        headings3: settings.weightH3,
+        tags: settings.weightUnmarkedTags,
+        unmarkedTags: settings.weightUnmarkedTags,
       },
       // The query is already tokenized, don't tokenize again
       tokenize: text => [text],
@@ -249,25 +207,25 @@ export class Omnisearch {
     logDebug(
       'searching with downranked folders',
-      this.settings.downrankedFoldersFilters
+      settings.downrankedFoldersFilters
     )

     // Hide or downrank files that are in Obsidian's excluded list
-    if (this.settings.hideExcluded) {
+    if (settings.hideExcluded) {
       // Filter the files out
       results = results.filter(
         result =>
           !(
-            this.app.metadataCache.isUserIgnored &&
-            this.app.metadataCache.isUserIgnored(result.id)
+            this.plugin.app.metadataCache.isUserIgnored &&
+            this.plugin.app.metadataCache.isUserIgnored(result.id)
           )
       )
     } else {
       // Just downrank them
       results.forEach(result => {
         if (
-          this.app.metadataCache.isUserIgnored &&
-          this.app.metadataCache.isUserIgnored(result.id)
+          this.plugin.app.metadataCache.isUserIgnored &&
+          this.plugin.app.metadataCache.isUserIgnored(result.id)
         ) {
           result.score /= 10
         }
@@ -279,10 +237,10 @@ export class Omnisearch {
     for (const result of results) {
       const path = result.id

-      if (this.settings.downrankedFoldersFilters.length > 0) {
+      if (settings.downrankedFoldersFilters.length > 0) {
         // downrank files that are in folders listed in the downrankedFoldersFilters
         let downrankingFolder = false
-        this.settings.downrankedFoldersFilters.forEach(filter => {
+        settings.downrankedFoldersFilters.forEach(filter => {
           if (path.startsWith(filter)) {
             // we don't want the filter to match the folder sources, e.g.
             // it needs to match a whole folder name
@@ -299,7 +257,7 @@ export class Omnisearch {
           const pathPartsLength = pathParts.length
           for (let i = 0; i < pathPartsLength; i++) {
             const pathPart = pathParts[i]
-            if (this.settings.downrankedFoldersFilters.includes(pathPart)) {
+            if (settings.downrankedFoldersFilters.includes(pathPart)) {
               result.score /= 10
               break
             }
@@ -307,9 +265,9 @@ export class Omnisearch {
       }

       // Boost custom properties
-      const metadata = this.app.metadataCache.getCache(path)
+      const metadata = this.plugin.app.metadataCache.getCache(path)
       if (metadata) {
-        for (const { name, weight } of this.settings.weightCustomProperties) {
+        for (const { name, weight } of settings.weightCustomProperties) {
          const values = metadata?.frontmatter?.[name]
          if (values && result.terms.some(t => values.includes(t))) {
            logDebug(`Boosting field "${name}" x${weight} for ${path}`)
@@ -333,7 +291,9 @@ export class Omnisearch {
     if (results.length) logDebug('First result:', results[0])

     const documents = await Promise.all(
-      results.map(async result => await cacheManager.getDocument(result.id))
+      results.map(
+        async result => await this.plugin.cacheManager.getDocument(result.id)
+      )
     )

     // If the search query contains quotes, filter out results that don't have the exact match
@@ -389,7 +349,7 @@ export class Omnisearch {
   ): Promise<ResultNote[]> {
     // Get the raw results
     let results: SearchResult[]
-    if (this.settings.simpleSearch) {
+    if (this.plugin.settings.simpleSearch) {
       results = await this.search(query, {
         prefixLength: 3,
         singleFilePath: options?.singleFilePath,
@@ -402,7 +362,9 @@ export class Omnisearch {
     }

     const documents = await Promise.all(
-      results.map(async result => await cacheManager.getDocument(result.id))
+      results.map(
+        async result => await this.plugin.cacheManager.getDocument(result.id)
+      )
     )

     // Map the raw results to get usable suggestions
@@ -435,9 +397,9 @@ export class Omnisearch {
     logDebug('Matching tokens:', foundWords)

     logDebug('Getting matches locations...')
-    const matches = getMatches(
+    const matches = this.plugin.textProcessor.getMatches(
       note.content,
-      stringsToRegex(foundWords),
+      foundWords,
       query
     )
     logDebug(`Matches for ${note.basename}`, matches)
@@ -453,10 +415,49 @@ export class Omnisearch {
   }

   public async writeToCache(): Promise<void> {
-    await cacheManager.writeMinisearchCache(
+    await this.plugin.cacheManager.writeMinisearchCache(
       this.minisearch,
       this.indexedDocuments
     )
   }
-}
+
+  private getOptions(): Options<IndexedDocument> {
+    return {
+      tokenize: this.tokenizer.tokenizeForIndexing,
+      extractField: (doc, fieldName) => {
+        if (fieldName === 'directory') {
+          // return path without the filename
+          const parts = doc.path.split('/')
+          parts.pop()
+          return parts.join('/')
+        }
+        return (doc as any)[fieldName]
+      },
+      processTerm: (term: string) =>
+        (this.plugin.settings.ignoreDiacritics
+          ? removeDiacritics(term)
+          : term
+        ).toLowerCase(),
+      idField: 'path',
+      fields: [
+        'basename',
+        // Different from `path`, since `path` is the unique index and needs to include the filename
+        'directory',
+        'aliases',
+        'content',
+        'headings1',
+        'headings2',
+        'headings3',
+      ],
+      storeFields: ['tags'],
+      logger(_level, _message, code) {
+        if (code === 'version_conflict') {
+          new Notice(
+            'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
+            5000
+          )
+        }
+      },
+    }
+  }
+}
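With the private constructor and `getInstance()` gone, the plugin itself presumably constructs and owns the engine. A hypothetical wiring sketch (the `onload` body, the `searchEngine` field name, and the import path are assumptions, not part of this diff):

import { Plugin } from 'obsidian'
import { Omnisearch } from './search/omnisearch' // path is an assumption

export default class OmnisearchPlugin extends Plugin {
  searchEngine!: Omnisearch // hypothetical field name

  async onload() {
    // The engine receives the plugin, and through it reaches app,
    // settings, cacheManager and textProcessor.
    this.searchEngine = new Omnisearch(this)
    if (!(await this.searchEngine.loadCache())) {
      // Cache miss: a full re-index would happen here.
    }
  }
}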

View File

@@ -1,4 +1,3 @@
-import { getSettings } from 'src/settings'
 import { removeDiacritics } from '../tools/utils'
 import { parse } from 'search-query-parser'
@@ -14,8 +13,8 @@ export class Query {
   }
   #inQuotes: string[]

-  constructor(text = '') {
-    if (getSettings().ignoreDiacritics) {
+  constructor(text = '', options: { ignoreDiacritics: boolean }) {
+    if (options.ignoreDiacritics) {
       text = removeDiacritics(text)
     }
     const parsed = parse(text.toLowerCase(), {
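`Query` no longer pulls `getSettings()` on its own; every call site now has to pass the flag explicitly. A minimal usage sketch under that assumption (the `plugin` variable is assumed to be in scope):

const query = new Query(searchText, {
  ignoreDiacritics: plugin.settings.ignoreDiacritics,
})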

View File

@@ -1,93 +1,96 @@
 import type { QueryCombination } from 'minisearch'
-import {
-  BRACKETS_AND_SPACE,
-  SPACE_OR_PUNCTUATION,
-  chsRegex,
-  getChsSegmenter,
-} from 'src/globals'
-import { getSettings } from 'src/settings'
+import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from 'src/globals'
 import { logDebug, splitCamelCase, splitHyphens } from 'src/tools/utils'
+import type OmnisearchPlugin from '../main'

 const markdownLinkExtractor = require('markdown-link-extractor')

-function tokenizeWords(text: string, { skipChs = false } = {}): string[] {
-  const tokens = text.split(BRACKETS_AND_SPACE)
-  if (skipChs) return tokens
-  return tokenizeChsWord(tokens)
-}
+export class Tokenizer {
+  constructor(private plugin: OmnisearchPlugin) {}

-function tokenizeTokens(text: string, { skipChs = false } = {}): string[] {
-  const tokens = text.split(SPACE_OR_PUNCTUATION)
-  if (skipChs) return tokens
-  return tokenizeChsWord(tokens)
-}
+  /**
+   * Tokenization for indexing will possibly return more tokens than the original text.
+   * This is because we combine different methods of tokenization to get the best results.
+   * @param text
+   * @returns
+   */
+  public tokenizeForIndexing(text: string): string[] {
+    const words = this.tokenizeWords(text)
+    let urls: string[] = []
+    if (this.plugin.settings.tokenizeUrls) {
+      try {
+        urls = markdownLinkExtractor(text)
+      } catch (e) {
+        logDebug('Error extracting urls', e)
+      }
+    }

-function tokenizeChsWord(tokens: string[]): string[] {
-  const segmenter = getChsSegmenter()
-  if (!segmenter) return tokens
-  return tokens.flatMap(word =>
-    chsRegex.test(word) ? segmenter.cut(word, { search: true }) : [word]
-  )
-}
+    let tokens = this.tokenizeTokens(text, { skipChs: true })

-/**
- * Tokenization for indexing will possibly return more tokens than the original text.
- * This is because we combine different methods of tokenization to get the best results.
- * @param text
- * @returns
- */
-export function tokenizeForIndexing(text: string): string[] {
-  const words = tokenizeWords(text)
-  let urls: string[] = []
-  if (getSettings().tokenizeUrls) {
-    try {
-      urls = markdownLinkExtractor(text)
-    } catch (e) {
-      logDebug('Error extracting urls', e)
+    // Split hyphenated tokens
+    tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
+    // Split camelCase tokens into "camel" and "case"
+    tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
+    // Add whole words (aka "not tokens")
+    tokens = [...tokens, ...words]
+    // Add urls
+    if (urls.length) {
+      tokens = [...tokens, ...urls]
+    }
+    // Remove duplicates
+    tokens = [...new Set(tokens)]
+    return tokens
+  }

+  /**
+   * Search tokenization will use the same tokenization methods as indexing,
+   * but will combine each group with "OR" operators
+   * @param text
+   * @returns
+   */
+  public tokenizeForSearch(text: string): QueryCombination {
+    // Extract urls and remove them from the query
+    const urls: string[] = markdownLinkExtractor(text)
+    text = urls.reduce((acc, url) => acc.replace(url, ''), text)
+    const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
+    return {
+      combineWith: 'OR',
+      queries: [
+        { combineWith: 'AND', queries: tokens },
+        {
+          combineWith: 'AND',
+          queries: this.tokenizeWords(text).filter(Boolean),
+        },
+        { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
+        { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+      ],
+    }
+  }

-  let tokens = tokenizeTokens(text, { skipChs: true })
-  // Split hyphenated tokens
-  tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
-  // Split camelCase tokens into "camel" and "case"
-  tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
-  // Add whole words (aka "not tokens")
-  tokens = [...tokens, ...words]
-  // Add urls
-  if (urls.length) {
-    tokens = [...tokens, ...urls]
+  private tokenizeWords(text: string, { skipChs = false } = {}): string[] {
+    const tokens = text.split(BRACKETS_AND_SPACE)
+    if (skipChs) return tokens
+    return this.tokenizeChsWord(tokens)
   }
-  // Remove duplicates
-  tokens = [...new Set(tokens)]

+  private tokenizeTokens(text: string, { skipChs = false } = {}): string[] {
+    const tokens = text.split(SPACE_OR_PUNCTUATION)
+    if (skipChs) return tokens
+    return this.tokenizeChsWord(tokens)
+  }

-  return tokens
-}
-/**
- * Search tokenization will use the same tokenization methods as indexing,
- * but will combine each group with "OR" operators
- * @param text
- * @returns
- */
-export function tokenizeForSearch(text: string): QueryCombination {
-  // Extract urls and remove them from the query
-  const urls: string[] = markdownLinkExtractor(text)
-  text = urls.reduce((acc, url) => acc.replace(url, ''), text)
-  const tokens = [...tokenizeTokens(text), ...urls].filter(Boolean)
-  return {
-    combineWith: 'OR',
-    queries: [
-      { combineWith: 'AND', queries: tokens },
-      { combineWith: 'AND', queries: tokenizeWords(text).filter(Boolean) },
-      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
-      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
-    ],
+  private tokenizeChsWord(tokens: string[]): string[] {
+    const segmenter = this.plugin.getChsSegmenter()
+    if (!segmenter) return tokens
+    return tokens.flatMap(word =>
+      chsRegex.test(word) ? segmenter.cut(word, { search: true }) : [word]
+    )
   }
 }
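Because the tokenizer now reads `settings` and the Chinese segmenter through the injected plugin, it can be exercised in isolation. A hypothetical test-style sketch, stubbing only the two members this diff shows `Tokenizer` using (the import paths are assumptions):

import type OmnisearchPlugin from './main'
import { Tokenizer } from './search/tokenizer'

// Stub only what Tokenizer touches: settings.tokenizeUrls and
// getChsSegmenter(). The cast is for illustration purposes.
const stubPlugin = {
  settings: { tokenizeUrls: false },
  getChsSegmenter: () => undefined, // no segmenter: CJK words pass through
} as unknown as OmnisearchPlugin

const tokenizer = new Tokenizer(stubPlugin)
// Expected to include "camelCase", "camel", "case", "kebab-case", "kebab"...
console.log(tokenizer.tokenizeForIndexing('camelCase kebab-case'))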