fix: Improve search tokenizer by adding exact phrase and filtering queries

Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
This commit is contained in:
2026-02-05 13:59:00 -07:00
parent c4c4e782fb
commit 637c20905e

View File

@@ -60,21 +60,39 @@ export class Tokenizer {
/**
 * Builds the query used for searching from a raw user-entered string.
 *
 * URLs found by `markdownLinkExtractor` are cut out of the text before
 * tokenization and appended back as whole tokens. Several alternative
 * AND-groups are then generated (exact phrase, tokens, words, hyphen-split
 * tokens, camelCase-split tokens) and combined with OR.
 *
 * Groups that are empty, or that contain the same set of terms as an earlier
 * group (order-insensitive), are dropped; the first occurrence wins.
 *
 * @param text The raw search string.
 * @returns An OR combination of the distinct, non-empty AND groups. The list
 *          may be empty when nothing searchable is left in `text`.
 */
public tokenizeForSearch(text: string): QueryCombination {
  // Extract urls and remove them from the query
  const urls: string[] = markdownLinkExtractor(text)
  const originalText = text
  text = urls.reduce((acc, url) => acc.replace(url, ''), text)
  const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)

  const candidates = [
    // Exact phrase. Skipped for blank input so that an empty string never
    // becomes a query term (every other group is filtered the same way).
    {
      combineWith: 'AND',
      queries: originalText.trim() ? [originalText] : [],
    },
    { combineWith: 'AND', queries: tokens },
    {
      combineWith: 'AND',
      queries: this.tokenizeWords(text).filter(Boolean),
    },
    { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
    { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
  ]

  // Drop empty groups and order-insensitive duplicates, keeping the first.
  const seen = new Set<string>()
  const uniqueQueries = candidates.filter(candidate => {
    if (candidate.queries.length === 0) {
      return false
    }
    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the term lists that are handed back to the caller.
    const key = JSON.stringify([...candidate.queries].sort())
    if (seen.has(key)) {
      return false
    }
    seen.add(key)
    return true
  })

  return {
    combineWith: 'OR',
    queries: uniqueQueries,
  }
}