#373 - Arabic diacritics

This commit is contained in:
Simon Cambier
2024-06-27 21:00:18 +02:00
parent 85a23d8352
commit 87d2085fda
5 changed files with 46 additions and 12 deletions

View File

@@ -13,9 +13,9 @@ export class Query {
} }
#inQuotes: string[] #inQuotes: string[]
constructor(text = '', options: { ignoreDiacritics: boolean }) { constructor(text = '', options: { ignoreDiacritics: boolean, ignoreArabicDiacritics: boolean}) {
if (options.ignoreDiacritics) { if (options.ignoreDiacritics) {
text = removeDiacritics(text) text = removeDiacritics(text, options.ignoreArabicDiacritics)
} }
const parsed = parse(text.toLowerCase(), { const parsed = parse(text.toLowerCase(), {
tokenize: true, tokenize: true,

View File

@@ -304,7 +304,12 @@ export class SearchEngine {
const title = document?.path.toLowerCase() ?? '' const title = document?.path.toLowerCase() ?? ''
const content = (document?.cleanedContent ?? '').toLowerCase() const content = (document?.cleanedContent ?? '').toLowerCase()
return exactTerms.every( return exactTerms.every(
q => content.includes(q) || removeDiacritics(title).includes(q) q =>
content.includes(q) ||
removeDiacritics(
title,
this.plugin.settings.ignoreArabicDiacritics
).includes(q)
) )
}) })
} }
@@ -434,7 +439,7 @@ export class SearchEngine {
}, },
processTerm: (term: string) => processTerm: (term: string) =>
(this.plugin.settings.ignoreDiacritics (this.plugin.settings.ignoreDiacritics
? removeDiacritics(term) ? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
: term : term
).toLowerCase(), ).toLowerCase(),
idField: 'path', idField: 'path',

View File

@@ -32,6 +32,8 @@ export interface OmnisearchSettings extends WeightingSettings {
downrankedFoldersFilters: string[] downrankedFoldersFilters: string[]
/** Ignore diacritics when indexing files */ /** Ignore diacritics when indexing files */
ignoreDiacritics: boolean ignoreDiacritics: boolean
ignoreArabicDiacritics: boolean
/** Extensions of plain text files to index, in addition to .md */ /** Extensions of plain text files to index, in addition to .md */
indexedFileTypes: string[] indexedFileTypes: string[]
/** Enable PDF indexing */ /** Enable PDF indexing */
@@ -661,6 +663,17 @@ export class SettingsTab extends PluginSettingTab {
}) })
) )
new Setting(containerEl)
.setName('Ignore Arabic diacritics (beta)')
.setDesc(diacriticsDesc)
.addToggle(toggle =>
toggle.setValue(settings.ignoreArabicDiacritics).onChange(async v => {
await database.clearCache()
settings.ignoreArabicDiacritics = v
await saveSettings(this.plugin)
})
)
// Disable Omnisearch // Disable Omnisearch
const disableDesc = new DocumentFragment() const disableDesc = new DocumentFragment()
disableDesc.createSpan({}, span => { disableDesc.createSpan({}, span => {
@@ -720,6 +733,7 @@ export function getDefaultSettings(app: App): OmnisearchSettings {
hideExcluded: false, hideExcluded: false,
downrankedFoldersFilters: [] as string[], downrankedFoldersFilters: [] as string[],
ignoreDiacritics: true, ignoreDiacritics: true,
ignoreArabicDiacritics: false,
indexedFileTypes: [] as string[], indexedFileTypes: [] as string[],
PDFIndexing: false, PDFIndexing: false,
officeIndexing: false, officeIndexing: false,

View File

@@ -115,7 +115,7 @@ export class TextProcessor {
const originalText = text const originalText = text
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ') // text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
if (this.plugin.settings.ignoreDiacritics) { if (this.plugin.settings.ignoreDiacritics) {
text = removeDiacritics(text) text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics)
} }
const startTime = new Date().getTime() const startTime = new Date().getTime()
let match: RegExpExecArray | null = null let match: RegExpExecArray | null = null

View File

@@ -109,15 +109,30 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
/** /**
* https://stackoverflow.com/a/37511463 * https://stackoverflow.com/a/37511463
*/ */
export function removeDiacritics(str: string): string { export function removeDiacritics(str: string, arabic = false): string {
// Japanese diacritics that should be distinguished
const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
const regexpExclude: string = excludeDiacritics.join('|')
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
if (str === null || str === undefined) { if (str === null || str === undefined) {
return '' return ''
} }
// Japanese diacritics that should be distinguished
const japaneseDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
const regexpExclude: string = japaneseDiacritics.join('|')
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
if (arabic) {
// Arabic diacritics
// https://stackoverflow.com/a/40959537
str = str
.replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
.replace(/(آ|إ|أ)/g, 'ا')
.replace(/(ة)/g, 'ه')
.replace(/(ئ|ؤ)/g, 'ء')
.replace(/(ى)/g, 'ي')
// Convert Arabic-Indic digits (U+0660..U+0669) to ASCII '0'..'9'.
// String.prototype.replace returns a new string, so the result must be
// assigned back to `str`; the global character-class regex converts every
// occurrence in a single pass instead of only the first match per digit.
str = str.replace(/[\u0660-\u0669]/g, digit =>
  String.fromCharCode(digit.charCodeAt(0) - 0x660 + 48)
)
}
// Keep backticks for code blocks, because otherwise they are removed by the .normalize() function // Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
// https://stackoverflow.com/a/36100275 // https://stackoverflow.com/a/36100275
str = str.replaceAll('`', '[__omnisearch__backtick__]') str = str.replaceAll('`', '[__omnisearch__backtick__]')
@@ -223,7 +238,7 @@ export function warnDebug(...args: any[]): void {
printDebug(console.warn, ...args) printDebug(console.warn, ...args)
} }
let printDebugEnabled= false let printDebugEnabled = false
export function enablePrintDebug(enable: boolean): void { export function enablePrintDebug(enable: boolean): void {
printDebugEnabled = enable printDebugEnabled = enable
} }