feat: Filter stop words and short tokens from search index
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
This commit is contained in:
@@ -22,6 +22,8 @@ import { sortBy } from 'lodash-es'
|
|||||||
import type OmnisearchPlugin from '../main'
|
import type OmnisearchPlugin from '../main'
|
||||||
import { Tokenizer } from './tokenizer'
|
import { Tokenizer } from './tokenizer'
|
||||||
|
|
||||||
|
/**
 * Common English function words that are excluded from the search index.
 *
 * `processTerm` looks terms up in this set after lowercasing them and
 * returns `null` on a match, so every entry here must be lowercase.
 * Entries shorter than three characters are also caught by the length
 * check in `processTerm`.
 */
const STOP_WORDS = new Set<string>([
  'a',
  'an',
  'the',
  'and',
  'or',
  'but',
  'if',
  'in',
  'on',
  'at',
  'by',
  'for',
  'with',
  'to',
  'from',
  'of',
  'is',
  'it',
  'that',
  'this',
])
|
||||||
|
|
||||||
export class SearchEngine {
|
export class SearchEngine {
|
||||||
private tokenizer: Tokenizer
|
private tokenizer: Tokenizer
|
||||||
private minisearch: MiniSearch
|
private minisearch: MiniSearch
|
||||||
@@ -573,11 +575,20 @@ export class SearchEngine {
|
|||||||
}
|
}
|
||||||
return (doc as any)[fieldName]
|
return (doc as any)[fieldName]
|
||||||
},
|
},
|
||||||
processTerm: (term: string) =>
|
processTerm: (term: string) => {
|
||||||
(this.plugin.settings.ignoreDiacritics
|
const processedTerm = (
|
||||||
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
|
this.plugin.settings.ignoreDiacritics
|
||||||
|
? removeDiacritics(
|
||||||
|
term,
|
||||||
|
this.plugin.settings.ignoreArabicDiacritics
|
||||||
|
)
|
||||||
: term
|
: term
|
||||||
).toLowerCase(),
|
).toLowerCase()
|
||||||
|
if (processedTerm.length < 3 || STOP_WORDS.has(processedTerm)) {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
return processedTerm
|
||||||
|
},
|
||||||
idField: 'path',
|
idField: 'path',
|
||||||
fields: [
|
fields: [
|
||||||
'basename',
|
'basename',
|
||||||
|
|||||||
Reference in New Issue
Block a user