fix: Improve search tokenizer by adding an exact-phrase query and filtering out empty and duplicate queries
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
This commit is contained in:
@@ -60,21 +60,39 @@ export class Tokenizer {
|
|||||||
public tokenizeForSearch(text: string): QueryCombination {
|
public tokenizeForSearch(text: string): QueryCombination {
|
||||||
// Extract urls and remove them from the query
|
// Extract urls and remove them from the query
|
||||||
const urls: string[] = markdownLinkExtractor(text)
|
const urls: string[] = markdownLinkExtractor(text)
|
||||||
|
const originalText = text
|
||||||
text = urls.reduce((acc, url) => acc.replace(url, ''), text)
|
text = urls.reduce((acc, url) => acc.replace(url, ''), text)
|
||||||
|
|
||||||
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
||||||
|
|
||||||
|
const queries = [
|
||||||
|
{ combineWith: 'AND', queries: [originalText] },
|
||||||
|
{ combineWith: 'AND', queries: tokens },
|
||||||
|
{
|
||||||
|
combineWith: 'AND',
|
||||||
|
queries: this.tokenizeWords(text).filter(Boolean),
|
||||||
|
},
|
||||||
|
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
||||||
|
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
||||||
|
]
|
||||||
|
|
||||||
|
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
|
||||||
|
|
||||||
|
// Deduplicate
|
||||||
|
const uniqueQueries = []
|
||||||
|
const seen = new Set()
|
||||||
|
for (const q of nonEmptyQueries) {
|
||||||
|
// sort to make order irrelevant for duplication check
|
||||||
|
const key = JSON.stringify(q.queries.sort())
|
||||||
|
if (!seen.has(key)) {
|
||||||
|
uniqueQueries.push(q)
|
||||||
|
seen.add(key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
combineWith: 'OR',
|
combineWith: 'OR',
|
||||||
queries: [
|
queries: uniqueQueries,
|
||||||
{ combineWith: 'AND', queries: tokens },
|
|
||||||
{
|
|
||||||
combineWith: 'AND',
|
|
||||||
queries: this.tokenizeWords(text).filter(Boolean),
|
|
||||||
},
|
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user