Fix: issue#190 (#317)

* Fixed search results with diacritics

- Caches are now stored with diacritics regardless of settings.ignoreDiacritics
- Modified getMatches() behavior to return results with correct form
- Modified ResultItemVault.svelte

* Fixed highlighting words with comma and period

- remove commas and periods from matches

* Fixed highlighting of Cyrillic characters

- changed highlight regexp determination to be based on character type

* Fixed highlighting problem of Japanese and Korean

- excluded some Japanese diacritics from removal
- added NFC normalization to keep the correct form of Korean characters

* Fixed highlighting of words with punctuation

- deleted space/punctuation list from stringsToRegex()
- it seems to be working correctly with words with punctuation and hyphenated words AFAIK

* Deleted some unused imports

* Modified the comment

* Added comment

* Fixed highlighting issue with comma and period

* Fixed highlighting issue with caret and other symbols

- Added `^` to separators
- Changed regex to use separators
- Added escape of `^` from diacritics removal
This commit is contained in:
Hajime Nagisa
2023-11-29 15:32:25 +09:00
committed by GitHub
parent 115886af78
commit 9cac7c0581
5 changed files with 44 additions and 39 deletions

View File

@@ -115,14 +115,23 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
* https://stackoverflow.com/a/37511463
*/
export function removeDiacritics(str: string): string {
  // Strips diacritical marks from `str` and returns the result in NFC form.
  // Returns '' for null/undefined input (callers may pass untyped values).
  if (str === null || str === undefined) {
    return ''
  }

  // Backtick (U+0060) and caret (U+005E) carry the Unicode `Diacritic`
  // property, so they would be deleted by the \p{Diacritic} replacement below.
  // They are swapped for placeholders first and restored afterwards so that
  // code blocks and caret-containing text survive.
  // https://stackoverflow.com/a/36100275
  const backtickPlaceholder = '[__omnisearch__backtick__]'
  const caretPlaceholder = '[__omnisearch__caret__]'
  const swap = (text: string, from: string, to: string): string =>
    text.split(from).join(to)

  // Japanese marks that must be kept because they distinguish characters:
  // U+30FC (prolonged sound mark), U+309A (combining handakuten),
  // U+3099 (combining dakuten). The negative lookahead skips them.
  const removable = /(?![\u30FC\u309A\u3099])\p{Diacritic}/gu

  let result = swap(str, '`', backtickPlaceholder)
  result = swap(result, '^', caretPlaceholder)

  // NFD splits characters into base + combining marks so the marks can be
  // removed; the final NFC pass recomposes what is left, which is required to
  // keep Korean syllables (and the preserved Japanese marks) in composed form.
  result = result.normalize('NFD').replace(removable, '').normalize('NFC')

  result = swap(result, backtickPlaceholder, '`')
  result = swap(result, caretPlaceholder, '^')
  return result
}