fix: tentative workaround for #383

This commit is contained in:
Simon Cambier
2024-07-31 11:17:35 +02:00
parent a778937292
commit 439150a1f0

View File

@@ -15,36 +15,41 @@ export class Tokenizer {
* @returns * @returns
*/ */
public tokenizeForIndexing(text: string): string[] { public tokenizeForIndexing(text: string): string[] {
const words = this.tokenizeWords(text) try {
let urls: string[] = [] const words = this.tokenizeWords(text)
if (this.plugin.settings.tokenizeUrls) { let urls: string[] = []
try { if (this.plugin.settings.tokenizeUrls) {
urls = markdownLinkExtractor(text) try {
} catch (e) { urls = markdownLinkExtractor(text)
logDebug('Error extracting urls', e) } catch (e) {
logDebug('Error extracting urls', e)
}
} }
let tokens = this.tokenizeTokens(text, { skipChs: true })
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Add whole words (aka "not tokens")
tokens = [...tokens, ...words]
// Add urls
if (urls.length) {
tokens = [...tokens, ...urls]
}
// Remove duplicates
tokens = [...new Set(tokens)]
return tokens
} catch (e) {
console.error('Error tokenizing text, skipping document', e)
return []
} }
let tokens = this.tokenizeTokens(text, { skipChs: true })
// Split hyphenated tokens
tokens = [...tokens, ...tokens.flatMap(splitHyphens)]
// Split camelCase tokens into "camel" and "case"
tokens = [...tokens, ...tokens.flatMap(splitCamelCase)]
// Add whole words (aka "not tokens")
tokens = [...tokens, ...words]
// Add urls
if (urls.length) {
tokens = [...tokens, ...urls]
}
// Remove duplicates
tokens = [...new Set(tokens)]
return tokens
} }
/** /**