fix: Improve search tokenizer by adding an exact-phrase query and filtering out empty and duplicate queries

Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
This commit is contained in:
2026-02-05 13:59:00 -07:00
parent c4c4e782fb
commit 637c20905e

View File

@@ -60,13 +60,13 @@ export class Tokenizer {
public tokenizeForSearch(text: string): QueryCombination { public tokenizeForSearch(text: string): QueryCombination {
// Extract urls and remove them from the query // Extract urls and remove them from the query
const urls: string[] = markdownLinkExtractor(text) const urls: string[] = markdownLinkExtractor(text)
const originalText = text
text = urls.reduce((acc, url) => acc.replace(url, ''), text) text = urls.reduce((acc, url) => acc.replace(url, ''), text)
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean) const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
return { const queries = [
combineWith: 'OR', { combineWith: 'AND', queries: [originalText] },
queries: [
{ combineWith: 'AND', queries: tokens }, { combineWith: 'AND', queries: tokens },
{ {
combineWith: 'AND', combineWith: 'AND',
@@ -74,7 +74,25 @@ export class Tokenizer {
}, },
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) }, { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) }, { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
], ]
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
// Deduplicate
const uniqueQueries = []
const seen = new Set()
for (const q of nonEmptyQueries) {
// sort to make order irrelevant for duplication check
const key = JSON.stringify(q.queries.sort())
if (!seen.has(key)) {
uniqueQueries.push(q)
seen.add(key)
}
}
return {
combineWith: 'OR',
queries: uniqueQueries,
} }
} }