#373 - Arabic diacritics
This commit is contained in:
@@ -13,9 +13,9 @@ export class Query {
|
|||||||
}
|
}
|
||||||
#inQuotes: string[]
|
#inQuotes: string[]
|
||||||
|
|
||||||
constructor(text = '', options: { ignoreDiacritics: boolean }) {
|
constructor(text = '', options: { ignoreDiacritics: boolean, ignoreArabicDiacritics: boolean}) {
|
||||||
if (options.ignoreDiacritics) {
|
if (options.ignoreDiacritics) {
|
||||||
text = removeDiacritics(text)
|
text = removeDiacritics(text, options.ignoreArabicDiacritics)
|
||||||
}
|
}
|
||||||
const parsed = parse(text.toLowerCase(), {
|
const parsed = parse(text.toLowerCase(), {
|
||||||
tokenize: true,
|
tokenize: true,
|
||||||
|
|||||||
@@ -304,7 +304,12 @@ export class SearchEngine {
|
|||||||
const title = document?.path.toLowerCase() ?? ''
|
const title = document?.path.toLowerCase() ?? ''
|
||||||
const content = (document?.cleanedContent ?? '').toLowerCase()
|
const content = (document?.cleanedContent ?? '').toLowerCase()
|
||||||
return exactTerms.every(
|
return exactTerms.every(
|
||||||
q => content.includes(q) || removeDiacritics(title).includes(q)
|
q =>
|
||||||
|
content.includes(q) ||
|
||||||
|
removeDiacritics(
|
||||||
|
title,
|
||||||
|
this.plugin.settings.ignoreArabicDiacritics
|
||||||
|
).includes(q)
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -434,7 +439,7 @@ export class SearchEngine {
|
|||||||
},
|
},
|
||||||
processTerm: (term: string) =>
|
processTerm: (term: string) =>
|
||||||
(this.plugin.settings.ignoreDiacritics
|
(this.plugin.settings.ignoreDiacritics
|
||||||
? removeDiacritics(term)
|
? removeDiacritics(term, this.plugin.settings.ignoreArabicDiacritics)
|
||||||
: term
|
: term
|
||||||
).toLowerCase(),
|
).toLowerCase(),
|
||||||
idField: 'path',
|
idField: 'path',
|
||||||
|
|||||||
@@ -32,6 +32,8 @@ export interface OmnisearchSettings extends WeightingSettings {
|
|||||||
downrankedFoldersFilters: string[]
|
downrankedFoldersFilters: string[]
|
||||||
/** Ignore diacritics when indexing files */
|
/** Ignore diacritics when indexing files */
|
||||||
ignoreDiacritics: boolean
|
ignoreDiacritics: boolean
|
||||||
|
ignoreArabicDiacritics: boolean
|
||||||
|
|
||||||
/** Extensions of plain text files to index, in addition to .md */
|
/** Extensions of plain text files to index, in addition to .md */
|
||||||
indexedFileTypes: string[]
|
indexedFileTypes: string[]
|
||||||
/** Enable PDF indexing */
|
/** Enable PDF indexing */
|
||||||
@@ -661,6 +663,17 @@ export class SettingsTab extends PluginSettingTab {
|
|||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
new Setting(containerEl)
|
||||||
|
.setName('Ignore Arabic diacritics (beta)')
|
||||||
|
.setDesc(diacriticsDesc)
|
||||||
|
.addToggle(toggle =>
|
||||||
|
toggle.setValue(settings.ignoreArabicDiacritics).onChange(async v => {
|
||||||
|
await database.clearCache()
|
||||||
|
settings.ignoreArabicDiacritics = v
|
||||||
|
await saveSettings(this.plugin)
|
||||||
|
})
|
||||||
|
)
|
||||||
|
|
||||||
// Disable Omnisearch
|
// Disable Omnisearch
|
||||||
const disableDesc = new DocumentFragment()
|
const disableDesc = new DocumentFragment()
|
||||||
disableDesc.createSpan({}, span => {
|
disableDesc.createSpan({}, span => {
|
||||||
@@ -720,6 +733,7 @@ export function getDefaultSettings(app: App): OmnisearchSettings {
|
|||||||
hideExcluded: false,
|
hideExcluded: false,
|
||||||
downrankedFoldersFilters: [] as string[],
|
downrankedFoldersFilters: [] as string[],
|
||||||
ignoreDiacritics: true,
|
ignoreDiacritics: true,
|
||||||
|
ignoreArabicDiacritics: false,
|
||||||
indexedFileTypes: [] as string[],
|
indexedFileTypes: [] as string[],
|
||||||
PDFIndexing: false,
|
PDFIndexing: false,
|
||||||
officeIndexing: false,
|
officeIndexing: false,
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ export class TextProcessor {
|
|||||||
const originalText = text
|
const originalText = text
|
||||||
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
|
// text = text.toLowerCase().replace(new RegExp(SEPARATORS, 'gu'), ' ')
|
||||||
if (this.plugin.settings.ignoreDiacritics) {
|
if (this.plugin.settings.ignoreDiacritics) {
|
||||||
text = removeDiacritics(text)
|
text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics)
|
||||||
}
|
}
|
||||||
const startTime = new Date().getTime()
|
const startTime = new Date().getTime()
|
||||||
let match: RegExpExecArray | null = null
|
let match: RegExpExecArray | null = null
|
||||||
|
|||||||
@@ -109,15 +109,30 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
|
|||||||
/**
|
/**
|
||||||
* https://stackoverflow.com/a/37511463
|
* https://stackoverflow.com/a/37511463
|
||||||
*/
|
*/
|
||||||
export function removeDiacritics(str: string): string {
|
export function removeDiacritics(str: string, arabic = false): string {
|
||||||
// Japanese diacritics that should be distinguished
|
|
||||||
const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
|
|
||||||
const regexpExclude: string = excludeDiacritics.join('|')
|
|
||||||
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
|
|
||||||
|
|
||||||
if (str === null || str === undefined) {
|
if (str === null || str === undefined) {
|
||||||
return ''
|
return ''
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Japanese diacritics that should be distinguished
|
||||||
|
const japaneseDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
|
||||||
|
const regexpExclude: string = japaneseDiacritics.join('|')
|
||||||
|
const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
|
||||||
|
|
||||||
|
if (arabic) {
|
||||||
|
// Arabic diacritics
|
||||||
|
// https://stackoverflow.com/a/40959537
|
||||||
|
str = str
|
||||||
|
.replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
|
||||||
|
.replace(/(آ|إ|أ)/g, 'ا')
|
||||||
|
.replace(/(ة)/g, 'ه')
|
||||||
|
.replace(/(ئ|ؤ)/g, 'ء')
|
||||||
|
.replace(/(ى)/g, 'ي')
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
str.replace(String.fromCharCode(0x660 + i), String.fromCharCode(48 + i))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
|
// Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
|
||||||
// https://stackoverflow.com/a/36100275
|
// https://stackoverflow.com/a/36100275
|
||||||
str = str.replaceAll('`', '[__omnisearch__backtick__]')
|
str = str.replaceAll('`', '[__omnisearch__backtick__]')
|
||||||
|
|||||||
Reference in New Issue
Block a user