diff --git a/src/search/query.ts b/src/search/query.ts index 2264832..f822d46 100644 --- a/src/search/query.ts +++ b/src/search/query.ts @@ -13,9 +13,9 @@ export class Query { } #inQuotes: string[] - constructor(text = '', options: { ignoreDiacritics: boolean }) { + constructor(text = '', options: { ignoreDiacritics: boolean, ignoreArabicDiacritics: boolean}) { if (options.ignoreDiacritics) { - text = removeDiacritics(text) + text = removeDiacritics(text, options.ignoreArabicDiacritics) } const parsed = parse(text.toLowerCase(), { tokenize: true, diff --git a/src/search/search-engine.ts b/src/search/search-engine.ts index dba9a80..90f495b 100644 --- a/src/search/search-engine.ts +++ b/src/search/search-engine.ts @@ -304,7 +304,12 @@ export class SearchEngine { const title = document?.path.toLowerCase() ?? '' const content = (document?.cleanedContent ?? '').toLowerCase() return exactTerms.every( - q => content.includes(q) || removeDiacritics(title).includes(q) + q => + content.includes(q) || + removeDiacritics( + title, + this.plugin.settings.ignoreArabicDiacritics + ).includes(q) ) }) } @@ -434,7 +439,7 @@ export class SearchEngine { }, processTerm: (term: string) => (this.plugin.settings.ignoreDiacritics - ? removeDiacritics(term) + ? 
removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics) : term ).toLowerCase(), idField: 'path', diff --git a/src/settings.ts b/src/settings.ts index a02503e..8f5fe31 100644 --- a/src/settings.ts +++ b/src/settings.ts @@ -32,6 +32,8 @@ export interface OmnisearchSettings extends WeightingSettings { downrankedFoldersFilters: string[] /** Ignore diacritics when indexing files */ ignoreDiacritics: boolean + ignoreArabicDiacritics: boolean + /** Extensions of plain text files to index, in addition to .md */ indexedFileTypes: string[] /** Enable PDF indexing */ @@ -661,6 +663,17 @@ export class SettingsTab extends PluginSettingTab { }) ) + new Setting(containerEl) + .setName('Ignore Arabic diacritics (beta)') + .setDesc(diacriticsDesc) + .addToggle(toggle => + toggle.setValue(settings.ignoreArabicDiacritics).onChange(async v => { + await database.clearCache() + settings.ignoreArabicDiacritics = v + await saveSettings(this.plugin) + }) + ) + // Disable Omnisearch const disableDesc = new DocumentFragment() disableDesc.createSpan({}, span => { @@ -720,6 +733,7 @@ export function getDefaultSettings(app: App): OmnisearchSettings { hideExcluded: false, downrankedFoldersFilters: [] as string[], ignoreDiacritics: true, + ignoreArabicDiacritics: false, indexedFileTypes: [] as string[], PDFIndexing: false, officeIndexing: false, diff --git a/src/tools/text-processing.ts b/src/tools/text-processing.ts index 64574f3..497c513 100644 --- a/src/tools/text-processing.ts +++ b/src/tools/text-processing.ts @@ -115,7 +115,7 @@ export class TextProcessor { const originalText = text // text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ') if (this.plugin.settings.ignoreDiacritics) { - text = removeDiacritics(text) + text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics) } const startTime = new Date().getTime() let match: RegExpExecArray | null = null diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 1c32558..74d7beb 100644 --- 
a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -109,15 +109,30 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] { /** * https://stackoverflow.com/a/37511463 */ -export function removeDiacritics(str: string): string { - // Japanese diacritics that should be distinguished - const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099'] - const regexpExclude: string = excludeDiacritics.join('|') - const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu') - +export function removeDiacritics(str: string, arabic = false): string { if (str === null || str === undefined) { return '' } + + // Japanese diacritics that should be distinguished + const japaneseDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099'] + const regexpExclude: string = japaneseDiacritics.join('|') + const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu') + + if (arabic) { + // Arabic diacritics + // https://stackoverflow.com/a/40959537 + str = str + .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '') + .replace(/(آ|إ|أ)/g, 'ا') + .replace(/(ة)/g, 'ه') + .replace(/(ئ|ؤ)/g, 'ء') + .replace(/(ى)/g, 'ي') + for (let i = 0; i < 10; i++) { + str = str.replaceAll(String.fromCharCode(0x660 + i), String.fromCharCode(48 + i)) + } + } + // Keep backticks for code blocks, because otherwise they are removed by the .normalize() function // https://stackoverflow.com/a/36100275 str = str.replaceAll('`', '[__omnisearch__backtick__]') @@ -223,7 +238,7 @@ export function warnDebug(...args: any[]): void { printDebug(console.warn, ...args) } -let printDebugEnabled= false +let printDebugEnabled = false export function enablePrintDebug(enable: boolean): void { printDebugEnabled = enable }