fix: Filter stop words and short tokens from search queries
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
This commit is contained in:
@@ -5,6 +5,8 @@ import type OmnisearchPlugin from '../main'
|
|||||||
|
|
||||||
const markdownLinkExtractor = require('markdown-link-extractor')
|
const markdownLinkExtractor = require('markdown-link-extractor')
|
||||||
|
|
||||||
|
/**
 * Lowercase English function words that are dropped from search queries.
 * Lookups are exact string matches, so callers must lowercase a term
 * before testing membership (as `isStopWord` does).
 */
const STOP_WORDS = new Set([
  'a',
  'an',
  'the',
  'and',
  'or',
  'but',
  'if',
  'in',
  'on',
  'at',
  'by',
  'for',
  'with',
  'to',
  'from',
  'of',
  'is',
  'it',
  'that',
  'this',
])
|
||||||
|
|
||||||
export class Tokenizer {
|
export class Tokenizer {
|
||||||
constructor(private plugin: OmnisearchPlugin) {}
|
constructor(private plugin: OmnisearchPlugin) {}
|
||||||
|
|
||||||
@@ -65,6 +67,11 @@ export class Tokenizer {
|
|||||||
|
|
||||||
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
const tokens = [...this.tokenizeTokens(text), ...urls].filter(Boolean)
|
||||||
|
|
||||||
|
const isStopWord = (term: string): boolean => {
|
||||||
|
const lower = term.toLowerCase()
|
||||||
|
return lower.length < 3 || STOP_WORDS.has(lower)
|
||||||
|
}
|
||||||
|
|
||||||
const queries = [
|
const queries = [
|
||||||
{ combineWith: 'AND', queries: [originalText] },
|
{ combineWith: 'AND', queries: [originalText] },
|
||||||
{ combineWith: 'AND', queries: tokens },
|
{ combineWith: 'AND', queries: tokens },
|
||||||
@@ -74,7 +81,10 @@ export class Tokenizer {
|
|||||||
},
|
},
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
{ combineWith: 'AND', queries: tokens.flatMap(splitHyphens) },
|
||||||
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
{ combineWith: 'AND', queries: tokens.flatMap(splitCamelCase) },
|
||||||
]
|
].map(q => ({
|
||||||
|
...q,
|
||||||
|
queries: q.queries.filter(t => !isStopWord(t)),
|
||||||
|
}))
|
||||||
|
|
||||||
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
|
const nonEmptyQueries = queries.filter(q => q.queries.length > 0)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user