obsidian-tannersearch/src/search/search-engine.ts

import MiniSearch, {
  type AsPlainObject,
  type Options,
  type SearchResult,
} from 'minisearch'
import type { DocumentRef, IndexedDocument, ResultNote } from '../globals'

import { chunkArray, logDebug, removeDiacritics } from '../tools/utils'
import { Notice } from 'obsidian'
import type { Query } from './query'
import { sortBy } from 'lodash-es'
import type OmnisearchPlugin from '../main'
import { Tokenizer } from './tokenizer'

export class SearchEngine {
  private tokenizer: Tokenizer
  private minisearch: MiniSearch
  /** Map<path, mtime> */
  private indexedDocuments: Map<string, number> = new Map()

  // private previousResults: SearchResult[] = []
  // private previousQuery: Query | null = null

  constructor(protected plugin: OmnisearchPlugin) {
    this.tokenizer = new Tokenizer(plugin)
    this.minisearch = new MiniSearch(this.getOptions())
  }

  /**
   * Return true if the cache is valid
   */
  async loadCache(): Promise<boolean> {
    await this.plugin.embedsRepository.loadFromCache()
    const cache = await this.plugin.database.getMinisearchCache()
    if (cache) {
      this.minisearch = await MiniSearch.loadJSAsync(
        cache.data,
        this.getOptions()
      )
      this.indexedDocuments = new Map(cache.paths.map(o => [o.path, o.mtime]))
      return true
    }
    console.log('Omnisearch - No cache found')
    return false
  }

  /**
   * Returns the list of documents that need to be reindexed or removed,
   * either because they are new, have been modified, or have been deleted
   * @param docs
   */
  getDocumentsToReindex(docs: DocumentRef[]): {
    toAdd: DocumentRef[]
    toRemove: DocumentRef[]
  } {
    const docsMap = new Map(docs.map(d => [d.path, d.mtime]))

    // console.log(this.indexedDocuments)
    const toAdd = docs.filter(
      d =>
        !this.indexedDocuments.has(d.path) ||
        this.indexedDocuments.get(d.path) !== d.mtime
    )
    // console.log(toAdd)
    const toRemove = [...this.indexedDocuments]
      .filter(
        ([path, mtime]) => !docsMap.has(path) || docsMap.get(path) !== mtime
      )
      .map(o => ({ path: o[0], mtime: o[1] }))
    return { toAdd, toRemove }
  }

  /**
   * Add notes/PDFs/images to the search index
   * @param paths
   */
  public async addFromPaths(paths: string[]): Promise<void> {
    logDebug('Adding files', paths)
    let documents = (
      await Promise.all(
        paths.map(
          async path => await this.plugin.cacheManager.getDocument(path)
        )
      )
    ).filter(d => !!d?.path)
    logDebug('Sorting documents to first index markdown')
    // Index markdown files first
    documents = sortBy(documents, d => (d.path.endsWith('.md') ? 0 : 1))

    // If a document is already added, discard it
    this.removeFromPaths(
      documents.filter(d => this.indexedDocuments.has(d.path)).map(d => d.path)
    )

    // Split the documents in smaller chunks to add them to minisearch
    const chunkedDocs = chunkArray(documents, 500)
    for (const docs of chunkedDocs) {
      logDebug('Indexing into search engine', docs)
      // Update the list of indexed docs
      docs.forEach(doc => this.indexedDocuments.set(doc.path, doc.mtime))

      // Discard files that may have been already added (though it shouldn't happen)
      const alreadyAdded = docs.filter(doc => this.minisearch.has(doc.path))
      this.removeFromPaths(alreadyAdded.map(o => o.path))

      // Add docs to minisearch
      await this.minisearch.addAllAsync(docs)
    }
  }

  /**
   * Discard a document from minisearch
   * @param paths
   */
  public removeFromPaths(paths: string[]): void {
    paths.forEach(p => this.indexedDocuments.delete(p))
    // Make sure to not discard a file that we don't have
    const existing = paths.filter(p => this.minisearch.has(p))
    this.minisearch.discardAll(existing)
  }

  /**
   * Searches the index for the given query,
   * and returns an array of raw results
   */
  public async search(
    query: Query,
    options: { prefixLength: number; singleFilePath?: string }
  ): Promise<SearchResult[]> {
    const settings = this.plugin.settings
    if (query.isEmpty()) {
      // this.previousResults = []
      // this.previousQuery = null
      return []
    }

    logDebug('=== New search ===')
    logDebug('Starting search for', query)

    let fuzziness: number
    switch (settings.fuzziness) {
      case '0':
        fuzziness = 0
        break
      case '1':
        fuzziness = 0.1
        break
      default:
        fuzziness = 0.2
        break
    }

    const searchTokens = this.tokenizer.tokenizeForSearch(query.segmentsToStr())
    logDebug(JSON.stringify(searchTokens, null, 1))
    let results = this.minisearch.search(searchTokens, {
      prefix: term => term.length >= options.prefixLength,
      // length <= 3: no fuzziness
      // length <= 5: fuzziness of 10%
      // length > 5: fuzziness of 20%
      fuzzy: term =>
        term.length <= 3 ? 0 : term.length <= 5 ? fuzziness / 2 : fuzziness,
      boost: {
        basename: settings.weightBasename,
        aliases: settings.weightBasename,
        displayTitle: settings.weightBasename,
        directory: settings.weightDirectory,
        headings1: settings.weightH1,
        headings2: settings.weightH2,
        headings3: settings.weightH3,
        tags: settings.weightUnmarkedTags,
        unmarkedTags: settings.weightUnmarkedTags,
      },
      // The query is already tokenized, don't tokenize again
      tokenize: text => [text],
    })

    logDebug(`Found ${results.length} results`, results)

    // Filter query results to only keep files that match query.query.ext (if any)
    if (query.query.ext?.length) {
      results = results.filter(r => {
        // ".can" should match ".canvas"
        const ext = '.' + r.id.split('.').pop()
        return query.query.ext?.some(e =>
          ext.startsWith(e.startsWith('.') ? e : '.' + e)
        )
      })
    }

    // Filter query results that match the path
    if (query.query.path) {
      results = results.filter(r =>
        query.query.path?.some(p =>
          (r.id as string).toLowerCase().includes(p.toLowerCase())
        )
      )
    }
    if (query.query.exclude.path) {
      results = results.filter(
        r =>
          !query.query.exclude.path?.some(p =>
            (r.id as string).toLowerCase().includes(p.toLowerCase())
          )
      )
    }

    if (!results.length) {
      return []
    }

    if (options.singleFilePath) {
      return results.filter(r => r.id === options.singleFilePath)
    }

    logDebug(
      'searching with downranked folders',
      settings.downrankedFoldersFilters
    )

    // Hide or downrank files that are in Obsidian's excluded list
    if (settings.hideExcluded) {
      // Filter the files out
      results = results.filter(
        result =>
          !(
            this.plugin.app.metadataCache.isUserIgnored &&
            this.plugin.app.metadataCache.isUserIgnored(result.id)
          )
      )
    } else {
      // Just downrank them
      results.forEach(result => {
        if (
          this.plugin.app.metadataCache.isUserIgnored &&
          this.plugin.app.metadataCache.isUserIgnored(result.id)
        ) {
          result.score /= 10
        }
      })
    }

    // Extract tags from the query
    const tags = query.getTags()

    for (const result of results) {
      const path = result.id
      if (settings.downrankedFoldersFilters.length > 0) {
        // downrank files that are in folders listed in the downrankedFoldersFilters
        let downrankingFolder = false
        settings.downrankedFoldersFilters.forEach(filter => {
          if (path.startsWith(filter)) {
            // we don't want the filter to match the folder sources, e.g.
            // it needs to match a whole folder name
            if (path === filter || path.startsWith(filter + '/')) {
              logDebug('searching with downranked folders in path: ', path)
              downrankingFolder = true
            }
          }
        })
        if (downrankingFolder) {
          result.score /= 10
        }
        const pathParts = path.split('/')
        const pathPartsLength = pathParts.length
        for (let i = 0; i < pathPartsLength; i++) {
          const pathPart = pathParts[i]
          if (settings.downrankedFoldersFilters.includes(pathPart)) {
            result.score /= 10
            break
          }
        }
      }

      const metadata = this.plugin.app.metadataCache.getCache(path)
      if (metadata) {
        // Boost custom properties
        for (const { name, weight } of settings.weightCustomProperties) {
          const values = metadata?.frontmatter?.[name]
          if (values && result.terms.some(t => values.includes(t))) {
            logDebug(`Boosting field "${name}" x${weight} for ${path}`)
            result.score *= weight
          }
        }
      }

      // Put the results with tags on top
      for (const tag of tags) {
        if ((result.tags ?? []).includes(tag)) {
          result.score *= 100
        }
      }
    }
    logDebug('Sorting and limiting results')

    // Sort results and keep the 50 best
    results = results.sort((a, b) => b.score - a.score).slice(0, 50)

    logDebug('Filtered results:', results)

    if (results.length) logDebug('First result:', results[0])

    const documents = await Promise.all(
      results.map(
        async result => await this.plugin.cacheManager.getDocument(result.id)
      )
    )

    // If the search query contains quotes, filter out results that don't have the exact match
    const exactTerms = query.getExactTerms()
    if (exactTerms.length) {
      logDebug('Filtering with quoted terms: ', exactTerms)
      results = results.filter(r => {
        const document = documents.find(d => d.path === r.id)
        const title = document?.path.toLowerCase() ?? ''
        const content = (document?.cleanedContent ?? '').toLowerCase()
        return exactTerms.every(
          q =>
            content.includes(q) ||
            removeDiacritics(
              title,
              this.plugin.settings.ignoreArabicDiacritics
            ).includes(q)
        )
      })
    }

    // If the search query contains exclude terms, filter out results that have them
    const exclusions = query.query.exclude.text
    if (exclusions.length) {
      logDebug('Filtering with exclusions')
      results = results.filter(r => {
        const content = (
          documents.find(d => d.path === r.id)?.content ?? ''
        ).toLowerCase()
        return exclusions.every(q => !content.includes(q))
      })
    }

    logDebug('Deduping')
    // FIXME:
    // Dedupe results - clutch for https://github.com/scambier/obsidian-omnisearch/issues/129
    results = results.filter(
      (result, index, arr) => arr.findIndex(t => t.id === result.id) === index
    )

    // this.previousQuery = query
    // this.previousResults = results

    return results
  }

  /**
   * Searches the index, and returns an array of ResultNote objects.
   * If we have the singleFile option set,
   * the array contains a single result from that file
   * @param query
   * @param options
   * @returns
   */
  public async getSuggestions(
    query: Query,
    options?: Partial<{ singleFilePath?: string }>
  ): Promise<ResultNote[]> {
    // Get the raw results
    let results: SearchResult[]
    if (this.plugin.settings.simpleSearch) {
      results = await this.search(query, {
        prefixLength: 3,
        singleFilePath: options?.singleFilePath,
      })
    } else {
      results = await this.search(query, {
        prefixLength: 1,
        singleFilePath: options?.singleFilePath,
      })
    }

    const documents = await Promise.all(
      results.map(
        async result => await this.plugin.cacheManager.getDocument(result.id)
      )
    )

    // Inject embeds for images, documents, and PDFs
    let total = documents.length
    for (let i = 0; i < total; i++) {
      const doc = documents[i]
      if (!doc) continue

      const embeds = this.plugin.embedsRepository
        .getEmbeds(doc.path)
        .slice(0, this.plugin.settings.maxEmbeds)

      // Inject embeds in the results
      for (const embed of embeds) {
        total++
        const newDoc = await this.plugin.cacheManager.getDocument(embed)
        documents.splice(i + 1, 0, newDoc)
        results.splice(i + 1, 0, {
          id: newDoc.path,
          score: 0,
          terms: [],
          queryTerms: [],
          match: {},
          isEmbed: true,
        })
        i++ // Increment i to skip the newly inserted document
      }
    }

    // Map the raw results to get usable suggestions
    const resultNotes = results.map(result => {
      logDebug('Locating matches for', result.id)
      let note = documents.find(d => d.path === result.id)
      if (!note) {
        // throw new Error(`Omnisearch - Note "${result.id}" not indexed`)
        console.warn(`Omnisearch - Note "${result.id}" not in the live cache`)
        note = {
          content: '',
          basename: result.id,
          path: result.id,
        } as IndexedDocument
      }

      // Clean search matches that match quoted expressions,
      // and inject those expressions instead
      const foundWords = [
        // Matching terms from the result,
        // do not necessarily match the query
        ...result.terms,

        // Quoted expressions
        ...query.getExactTerms(),

        // Tags, starting with #
        ...query.getTags(),
      ]
      logDebug('Matching tokens:', foundWords)

      logDebug('Getting matches locations...')
      const matches = this.plugin.textProcessor.getMatches(
        note.content,
        foundWords,
        query
      )
      logDebug(`Matches for note "${note.path}"`, matches)
      const resultNote: ResultNote = {
        score: result.score,
        foundWords,
        matches,
        isEmbed: result.isEmbed,
        ...note,
      }
      return resultNote
    })

    logDebug('Suggestions:', resultNotes)

    return resultNotes
  }

  /**
   * For cache saving
   */
  public getSerializedMiniSearch(): AsPlainObject {
    return this.minisearch.toJSON()
  }

  /**
   * For cache saving
   */
  public getSerializedIndexedDocuments(): { path: string; mtime: number }[] {
    return Array.from(this.indexedDocuments).map(([path, mtime]) => ({
      path,
      mtime,
    }))
  }

  private getOptions(): Options<IndexedDocument> {
    return {
      tokenize: this.tokenizer.tokenizeForIndexing.bind(this.tokenizer),
      extractField: (doc, fieldName) => {
        if (fieldName === 'directory') {
          // return path without the filename
          const parts = doc.path.split('/')
          parts.pop()
          return parts.join('/')
        }
        return (doc as any)[fieldName]
      },
      processTerm: (term: string) =>
        (this.plugin.settings.ignoreDiacritics
          ? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
          : term
        ).toLowerCase(),
      idField: 'path',
      fields: [
        'basename',
        // Different from `path`, since `path` is the unique index and needs to include the filename
        'directory',
        'aliases',
        'content',
        'headings1',
        'headings2',
        'headings3',
      ],
      storeFields: ['tags'],
      logger(_level, _message, code) {
        if (code === 'version_conflict') {
          new Notice(
            'Omnisearch - Your index cache may be incorrect or corrupted. If this message keeps appearing, go to Settings to clear the cache.',
            5000
          )
        }
      },
    }
  }
}