#373 - Arabic diacritics
This commit is contained in:
@@ -13,9 +13,9 @@ export class Query {
|
||||
}
|
||||
#inQuotes: string[]
|
||||
|
||||
constructor(text = '', options: { ignoreDiacritics: boolean }) {
|
||||
constructor(text = '', options: { ignoreDiacritics: boolean, ignoreArabicDiacritics: boolean}) {
|
||||
if (options.ignoreDiacritics) {
|
||||
text = removeDiacritics(text)
|
||||
text = removeDiacritics(text, options.ignoreArabicDiacritics)
|
||||
}
|
||||
const parsed = parse(text.toLowerCase(), {
|
||||
tokenize: true,
|
||||
|
||||
@@ -304,7 +304,12 @@ export class SearchEngine {
|
||||
const title = document?.path.toLowerCase() ?? ''
|
||||
const content = (document?.cleanedContent ?? '').toLowerCase()
|
||||
return exactTerms.every(
|
||||
q => content.includes(q) || removeDiacritics(title).includes(q)
|
||||
q =>
|
||||
content.includes(q) ||
|
||||
removeDiacritics(
|
||||
title,
|
||||
this.plugin.settings.ignoreArabicDiacritics
|
||||
).includes(q)
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -434,7 +439,7 @@ export class SearchEngine {
|
||||
},
|
||||
processTerm: (term: string) =>
|
||||
(this.plugin.settings.ignoreDiacritics
|
||||
? removeDiacritics(term)
|
||||
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
|
||||
: term
|
||||
).toLowerCase(),
|
||||
idField: 'path',
|
||||
|
||||
@@ -32,6 +32,8 @@ export interface OmnisearchSettings extends WeightingSettings {
|
||||
downrankedFoldersFilters: string[]
|
||||
/** Ignore diacritics when indexing files */
|
||||
ignoreDiacritics: boolean
|
||||
ignoreArabicDiacritics: boolean
|
||||
|
||||
/** Extensions of plain text files to index, in addition to .md */
|
||||
indexedFileTypes: string[]
|
||||
/** Enable PDF indexing */
|
||||
@@ -661,6 +663,17 @@ export class SettingsTab extends PluginSettingTab {
|
||||
})
|
||||
)
|
||||
|
||||
new Setting(containerEl)
|
||||
.setName('Ignore Arabic diacritics (beta)')
|
||||
.setDesc(diacriticsDesc)
|
||||
.addToggle(toggle =>
|
||||
toggle.setValue(settings.ignoreArabicDiacritics).onChange(async v => {
|
||||
await database.clearCache()
|
||||
settings.ignoreArabicDiacritics = v
|
||||
await saveSettings(this.plugin)
|
||||
})
|
||||
)
|
||||
|
||||
// Disable Omnisearch
|
||||
const disableDesc = new DocumentFragment()
|
||||
disableDesc.createSpan({}, span => {
|
||||
@@ -720,6 +733,7 @@ export function getDefaultSettings(app: App): OmnisearchSettings {
|
||||
hideExcluded: false,
|
||||
downrankedFoldersFilters: [] as string[],
|
||||
ignoreDiacritics: true,
|
||||
ignoreArabicDiacritics: false,
|
||||
indexedFileTypes: [] as string[],
|
||||
PDFIndexing: false,
|
||||
officeIndexing: false,
|
||||
|
||||
@@ -115,7 +115,7 @@ export class TextProcessor {
|
||||
const originalText = text
|
||||
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
|
||||
if (this.plugin.settings.ignoreDiacritics) {
|
||||
text = removeDiacritics(text)
|
||||
text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics)
|
||||
}
|
||||
const startTime = new Date().getTime()
|
||||
let match: RegExpExecArray | null = null
|
||||
|
||||
@@ -109,15 +109,30 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
|
||||
/**
|
||||
* https://stackoverflow.com/a/37511463
|
||||
*/
|
||||
export function removeDiacritics(str: string): string {
|
||||
// Japanese diacritics that should be distinguished
|
||||
const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
|
||||
const regexpExclude: string = excludeDiacritics.join('|')
|
||||
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
|
||||
|
||||
export function removeDiacritics(str: string, arabic = false): string {
|
||||
if (str === null || str === undefined) {
|
||||
return ''
|
||||
}
|
||||
|
||||
// Japanese diacritics that should be distinguished
|
||||
const japaneseDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
|
||||
const regexpExclude: string = japaneseDiacritics.join('|')
|
||||
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
|
||||
|
||||
if (arabic) {
  // Arabic normalisation, adapted from https://stackoverflow.com/a/40959537
  //  1. Drop every character that is not an Arabic letter (U+0621-U+063A,
  //     U+0641-U+064A), an Arabic-Indic digit (U+0660-U+0669), an ASCII
  //     letter, an ASCII digit or a space. This strips the harakat
  //     (U+064B-U+0652) and the tatweel (U+0640), but also punctuation,
  //     line breaks and every other symbol.
  //     NOTE(review): removing line breaks joins adjacent lines - confirm
  //     this is intended for callers that pass whole documents.
  //  2. Fold letter variants onto a single base form (alef with madda/hamza
  //     -> alef, teh marbuta -> heh, hamza carriers -> hamza,
  //     alef maksura -> yeh).
  //  3. Convert Arabic-Indic digits to their ASCII equivalents.
  str = str
    .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
    .replace(/(آ|إ|أ)/g, 'ا')
    .replace(/(ة)/g, 'ه')
    .replace(/(ئ|ؤ)/g, 'ء')
    .replace(/(ى)/g, 'ي')
    // Strings are immutable: the former loop called `str.replace(...)` without
    // assigning the result, so digits were never converted. A global regex
    // also converts every occurrence, not only the first one.
    .replace(/[\u0660-\u0669]/g, digit =>
      String.fromCharCode(digit.charCodeAt(0) - 0x660 + 48)
    )
}
|
||||
|
||||
// Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
|
||||
// https://stackoverflow.com/a/36100275
|
||||
str = str.replaceAll('`', '[__omnisearch__backtick__]')
|
||||
|
||||
Reference in New Issue
Block a user