#373 - Arabic diacritics

This commit is contained in:
Simon Cambier
2024-06-27 21:00:18 +02:00
parent 85a23d8352
commit 87d2085fda
5 changed files with 46 additions and 12 deletions

View File

@@ -13,9 +13,9 @@ export class Query {
}
#inQuotes: string[]
constructor(text = '', options: { ignoreDiacritics: boolean }) {
constructor(text = '', options: { ignoreDiacritics: boolean, ignoreArabicDiacritics: boolean}) {
if (options.ignoreDiacritics) {
text = removeDiacritics(text)
text = removeDiacritics(text, options.ignoreArabicDiacritics)
}
const parsed = parse(text.toLowerCase(), {
tokenize: true,

View File

@@ -304,7 +304,12 @@ export class SearchEngine {
const title = document?.path.toLowerCase() ?? ''
const content = (document?.cleanedContent ?? '').toLowerCase()
return exactTerms.every(
q => content.includes(q) || removeDiacritics(title).includes(q)
q =>
content.includes(q) ||
removeDiacritics(
title,
this.plugin.settings.ignoreArabicDiacritics
).includes(q)
)
})
}
@@ -434,7 +439,7 @@ export class SearchEngine {
},
processTerm: (term: string) =>
(this.plugin.settings.ignoreDiacritics
? removeDiacritics(term)
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
: term
).toLowerCase(),
idField: 'path',

View File

@@ -32,6 +32,8 @@ export interface OmnisearchSettings extends WeightingSettings {
downrankedFoldersFilters: string[]
/** Ignore diacritics when indexing files */
ignoreDiacritics: boolean
ignoreArabicDiacritics: boolean
/** Extensions of plain text files to index, in addition to .md */
indexedFileTypes: string[]
/** Enable PDF indexing */
@@ -661,6 +663,17 @@ export class SettingsTab extends PluginSettingTab {
})
)
new Setting(containerEl)
.setName('Ignore Arabic diacritics (beta)')
.setDesc(diacriticsDesc)
.addToggle(toggle =>
toggle.setValue(settings.ignoreArabicDiacritics).onChange(async v => {
await database.clearCache()
settings.ignoreArabicDiacritics = v
await saveSettings(this.plugin)
})
)
// Disable Omnisearch
const disableDesc = new DocumentFragment()
disableDesc.createSpan({}, span => {
@@ -720,6 +733,7 @@ export function getDefaultSettings(app: App): OmnisearchSettings {
hideExcluded: false,
downrankedFoldersFilters: [] as string[],
ignoreDiacritics: true,
ignoreArabicDiacritics: false,
indexedFileTypes: [] as string[],
PDFIndexing: false,
officeIndexing: false,

View File

@@ -115,7 +115,7 @@ export class TextProcessor {
const originalText = text
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
if (this.plugin.settings.ignoreDiacritics) {
text = removeDiacritics(text)
text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics)
}
const startTime = new Date().getTime()
let match: RegExpExecArray | null = null

View File

@@ -109,15 +109,30 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
/**
* https://stackoverflow.com/a/37511463
*/
export function removeDiacritics(str: string): string {
// Japanese diacritics that should be distinguished
const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
const regexpExclude: string = excludeDiacritics.join('|')
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
export function removeDiacritics(str: string, arabic = false): string {
if (str === null || str === undefined) {
return ''
}
// Japanese diacritics that should be distinguished
const japaneseDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
const regexpExclude: string = japaneseDiacritics.join('|')
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
if (arabic) {
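// Arabic normalization (broader than just diacritics):
// - drops every character that is not an Arabic letter (U+0621–U+063A, U+0641–U+064A),
//   an Arabic-Indic digit (U+0660–U+0669), an ASCII letter, an ASCII digit or a space
//   (this also removes punctuation, tatweel and the Arabic harakat)
// - folds alef variants to bare alef, teh marbuta to heh, hamza carriers to hamza,
//   and alef maksura to yeh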
// https://stackoverflow.com/a/40959537
str = str
.replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
.replace(/(آ|إ|أ)/g, 'ا')
.replace(/(ة)/g, 'ه')
.replace(/(ئ|ؤ)/g, 'ء')
.replace(/(ى)/g, 'ي')
// Convert Arabic-Indic digits (U+0660–U+0669) to their ASCII equivalents ('0'–'9').
// Strings are immutable, so the result has to be assigned back to `str`; the global
// regex converts every occurrence (a string pattern would only replace the first one).
str = str.replace(/[\u0660-\u0669]/g, digit =>
  String.fromCharCode(digit.charCodeAt(0) - 0x660 + 48)
)
}
// Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
// https://stackoverflow.com/a/36100275
str = str.replaceAll('`', '[__omnisearch__backtick__]')