Compare commits

..

10 Commits

Author SHA1 Message Date
1297a1034a Ignore aider 2026-02-05 16:00:22 -07:00
b195bf65ee fix: Resolve TypeScript build errors with type imports and assertion
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 15:56:32 -07:00
2ef3a1392f feat: Treat contextual colon-suffixed lines as headings for indexing
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 15:36:39 -07:00
c75d5d89f7 fix: Filter stop words and short tokens from search queries
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 15:24:19 -07:00
df73ab0f1c feat: Filter stop words and short tokens from search index
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 15:21:06 -07:00
637c20905e fix: Improve search tokenizer by adding exact phrase and filtering queries
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 13:59:00 -07:00
c4c4e782fb fix: Correct single-word query ranking to prioritize headings
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 13:21:14 -07:00
2b00a7af2d fix: Prioritize exact phrase matches and fix case-sensitive search
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 13:09:36 -07:00
3c84980903 fix: Prevent premature HTML escaping of search terms
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 13:03:34 -07:00
f17f9756a3 fix: Prevent search tokenizer from splitting on apostrophes
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-05 12:57:44 -07:00
6 changed files with 104 additions and 29 deletions

.gitignore (vendored): 1 line added
View File

@@ -22,3 +22,4 @@ dist
coverage
package-lock.json
Doc Omnisearch/.obsidian
.aider*

View File — globals module (filename not captured in export; defines SEPARATORS and SPACE_OR_PUNCTUATION, imported elsewhere as '../globals')

@@ -112,7 +112,7 @@ export type AIImageAnalyzerAPI = {
}
export const SEPARATORS =
/[|\t\n\r\^"= -#%-*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
/[|\t\n\r\^"= -#-%&(*,.`\/<>:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2017\u201A-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]/
.toString()
.slice(1, -1)
export const SPACE_OR_PUNCTUATION = new RegExp(`${SEPARATORS}+`, 'u')

View File — DocumentsRepository class (filename not captured in export)

@@ -229,6 +229,28 @@ export class DocumentsRepository {
metadata?.frontmatter?.[this.plugin.settings.displayTitle] ?? ''
}
const tags = getTagsFromMetadata(metadata)
const headings1 = metadata ? extractHeadingsFromCache(metadata, 1) : []
const headings2 = metadata ? extractHeadingsFromCache(metadata, 2) : []
const headings3 = metadata ? extractHeadingsFromCache(metadata, 3) : []
const lines = content.split('\n')
const colonHeadings: string[] = []
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim()
if (line.endsWith(':')) {
const prevLine = i > 0 ? lines[i - 1].trim() : null
const nextLine = i < lines.length - 1 ? lines[i + 1].trim() : null
if (
prevLine === '' &&
nextLine !== null &&
nextLine !== ''
) {
colonHeadings.push(line.slice(0, -1).trim())
}
}
}
return {
basename: file.basename,
displayTitle,
@@ -241,15 +263,9 @@ export class DocumentsRepository {
tags: tags,
unmarkedTags: tags.map(t => t.replace('#', '')),
aliases: getAliasesFromMetadata(metadata).join(''),
headings1: metadata
? extractHeadingsFromCache(metadata, 1).join(' ')
: '',
headings2: metadata
? extractHeadingsFromCache(metadata, 2).join(' ')
: '',
headings3: metadata
? extractHeadingsFromCache(metadata, 3).join(' ')
: '',
headings1: headings1.join(' '),
headings2: headings2.join(' '),
headings3: [...headings3, ...colonHeadings].join(' '),
}
}
}

View File — SearchEngine class (filename not captured in export)

@@ -8,6 +8,7 @@ import {
type DocumentRef,
type IndexedDocument,
type ResultNote,
type SearchMatch,
} from '../globals'
import {
@@ -22,6 +23,8 @@ import { sortBy } from 'lodash-es'
import type OmnisearchPlugin from '../main'
import { Tokenizer } from './tokenizer'
const STOP_WORDS = new Set(["a", "an", "the", "and", "or", "but", "if", "in", "on", "at", "by", "for", "with", "to", "from", "of", "is", "it", "that", "this"])
export class SearchEngine {
private tokenizer: Tokenizer
private minisearch: MiniSearch
@@ -481,6 +484,16 @@ export class SearchEngine {
query
)
let bestMatch: SearchMatch | undefined
if (
matches.length > 0 &&
(query.query.text.length > 1 || query.getExactTerms().length > 0) &&
query.getBestStringForExcerpt() &&
matches[0].match.toLowerCase() === query.getBestStringForExcerpt()
) {
bestMatch = matches.shift()
}
const lowerCaseBasename = note.basename.toLowerCase()
const titleMatchWord = foundWords.find(word =>
lowerCaseBasename.includes(word.toLowerCase())
@@ -514,6 +527,10 @@ export class SearchEngine {
}
}
if (bestMatch) {
matches.unshift(bestMatch)
}
logVerbose(`Matches for note "${note.path}"`, matches)
const resultNote: ResultNote = {
score: result.score,
@@ -559,11 +576,20 @@ export class SearchEngine {
}
return (doc as any)[fieldName]
},
processTerm: (term: string) =>
(this.plugin.settings.ignoreDiacritics
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
: term
).toLowerCase(),
processTerm: (term: string) => {
const processedTerm = (
this.plugin.settings.ignoreDiacritics
? removeDiacritics(
term,
this.plugin.settings.ignoreArabicDiacritics
)
: term
).toLowerCase()
if (processedTerm.length < 3 || STOP_WORDS.has(processedTerm)) {
return null
}
return processedTerm
},
idField: 'path',
fields: [
'basename',

View File — Tokenizer class (filename not captured in export)

@@ -1,10 +1,12 @@
import type { QueryCombination } from 'minisearch'
import type { Query, QueryCombination } from 'minisearch'
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
import { logVerbose, splitCamelCase, splitHyphens } from '../tools/utils'
import type OmnisearchPlugin from '../main'
const markdownLinkExtractor = require('markdown-link-extractor')
const STOP_WORDS = new Set(["a", "an", "the", "and", "or", "but", "if", "in", "on", "at", "by", "for", "with", "to", "from", "of", "is", "it", "that", "this"])
export class Tokenizer {
constructor(private plugin: OmnisearchPlugin) {}
@@ -60,21 +62,47 @@ export class Tokenizer {
public tokenizeForSearch(text: string): QueryCombination {
// Extract urls and remove them from the query
const urls: string[] = markdownLinkExtractor(text)
const originalText = text
text = urls.reduce((acc, url) => acc.replace(url, ''), text)
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
const isStopWord = (term: string): boolean => {
const lower = term.toLowerCase()
return lower.length < 3 || STOP_WORDS.has(lower)
}
const queries = [
{ combineWith: 'AND', queries: [originalText] },
{ combineWith: 'AND', queries: tokens },
{
combineWith: 'AND',
queries: this.tokenizeWords(text).filter(Boolean),
},
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
].map(q => ({
...q,
queries: q.queries.filter(t => !isStopWord(t)),
}))
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
// Deduplicate
const uniqueQueries = []
const seen = new Set()
for (const q of nonEmptyQueries) {
// sort to make order irrelevant for duplication check
const key = JSON.stringify(q.queries.sort())
if (!seen.has(key)) {
uniqueQueries.push(q)
seen.add(key)
}
}
return {
combineWith: 'OR',
queries: [
{ combineWith: 'AND', queries: tokens },
{
combineWith: 'AND',
queries: this.tokenizeWords(text).filter(Boolean),
},
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
],
queries: uniqueQueries as Query[],
}
}

View File — TextProcessor class (filename not captured in export)

@@ -64,7 +64,6 @@ export class TextProcessor {
words: string[],
query?: Query
): SearchMatch[] {
words = words.map(escapeHTML)
const reg = this.stringsToRegex(words)
const originalText = text
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
@@ -96,11 +95,16 @@ export class TextProcessor {
query &&
(query.query.text.length > 1 || query.getExactTerms().length > 0)
) {
const best = text.indexOf(query.getBestStringForExcerpt())
if (best > -1 && matches.find(m => m.offset === best)) {
const bestMatchStr = query.getBestStringForExcerpt()
const best = text.toLowerCase().indexOf(bestMatchStr)
if (best > -1) {
// We found the full query. We make it the first result, and remove any other match that it contains.
matches = matches.filter(
m => m.offset < best || m.offset >= best + bestMatchStr.length
)
matches.unshift({
offset: best,
match: query.getBestStringForExcerpt(),
match: originalText.substring(best, best + bestMatchStr.length),
})
}
}