From 637c20905ed935ba03db8ffda85fd2b6336010d9 Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Thu, 5 Feb 2026 13:59:00 -0700
Subject: [PATCH] fix: Improve search tokenizer by adding exact phrase and filtering queries

Co-authored-by: aider (gemini/gemini-2.5-pro)
---
 src/search/tokenizer.ts | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index f67e946..122a52b 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -60,21 +60,39 @@ export class Tokenizer {
   public tokenizeForSearch(text: string): QueryCombination {
     // Extract urls and remove them from the query
     const urls: string[] = markdownLinkExtractor(text)
+    const originalText = text // untouched input, kept for the exact-phrase query
     text = urls.reduce((acc, url) => acc.replace(url, ''), text)
 
     const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
 
+    const queries = [
+      { combineWith: 'AND', queries: [originalText].filter(Boolean) }, // exact phrase; dropped when the input is empty
+      { combineWith: 'AND', queries: tokens },
+      {
+        combineWith: 'AND',
+        queries: this.tokenizeWords(text).filter(Boolean),
+      },
+      { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
+      { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
+    ]
+
+    const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
+
+    // Deduplicate: the first query with a given set of terms wins
+    const uniqueQueries = []
+    const seen = new Set<string>()
+    for (const q of nonEmptyQueries) {
+      // sort a copy: order is irrelevant for the duplicate check, but sort() works in place and must not reorder the query's own terms (or the shared `tokens` array)
+      const key = JSON.stringify([...q.queries].sort())
+      if (!seen.has(key)) {
+        uniqueQueries.push(q)
+        seen.add(key)
+      }
+    }
+
     return {
       combineWith: 'OR',
-      queries: [
-        { combineWith: 'AND', queries: tokens },
-        {
-          combineWith: 'AND',
-          queries: this.tokenizeWords(text).filter(Boolean),
-        },
-        { combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
-        { combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
-      ],
+      queries: uniqueQueries,
     }
   }
 