Fix: issue#190 (#317)

* Fixed search results with diacritics
  - Caches are now stored with diacritics regardless of settings.ignoreDiacritics
  - Modified getMatches() to return results in the correct form
  - Modified ResultItemVault.svelte
* Fixed highlighting of words followed by a comma or period
  - Commas and periods are now removed from matches
* Fixed highlighting of Cyrillic characters
  - The highlight regexp is now chosen based on character type
* Fixed highlighting problems with Japanese and Korean
  - Marked some Japanese diacritics so that they escape removal
  - Added NFC normalization to keep Korean characters in their correct form
* Fixed highlighting of words with punctuation
  - Deleted the space/punctuation list from stringsToRegex()
  - As far as I know, it works correctly with words containing punctuation and with hyphenated words
* Deleted some unused imports
* Modified the comment
* Added a comment
* Fixed a highlighting issue with commas and periods
* Fixed a highlighting issue with the caret and other symbols
  - Added `^` to the separators
  - Changed the regex to use the separators
  - Made `^` escape diacritics removal
2023-11-29 15:32:25 +09:00
parent 115886af78
commit 9cac7c0581
5 changed files with 44 additions and 39 deletions
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -115,14 +115,23 @@ export function getTagsFromMetadata(metadata: CachedMetadata | null): string[] {
 * https://stackoverflow.com/a/37511463
 */
 export function removeDiacritics(str: string): string {
+  // Japanese marks that must survive diacritic removal: prolonged sound mark (U+30FC) and the combining semi-voiced (U+309A) / voiced (U+3099) sound marks
+  const excludeDiacritics: string[] = ['\\u30FC', '\\u309A', '\\u3099']
+  const regexpExclude: string = excludeDiacritics.join('|')
+  const regexp: RegExp = new RegExp(`(?!${regexpExclude})\\p{Diacritic}`, 'gu')
+
  if (str === null || str === undefined) {
    return ''
  }
  // Keep backticks for code blocks, because otherwise they are removed by the .normalize() function
  // https://stackoverflow.com/a/36100275
  str = str.replaceAll('`', '[__omnisearch__backtick__]')
-  str = str.normalize('NFD').replace(/\p{Diacritic}/gu, '')
+  // Protect carets with a placeholder, the same way as backticks above, so the diacritic-stripping step below leaves them intact
+  str = str.replaceAll('^', '[__omnisearch__caret__]')
+  // Decompose (NFD), strip the diacritics, then recompose (NFC); the final NFC step is needed to keep Korean characters in their correct form
+  str = str.normalize('NFD').replace(regexp, '').normalize('NFC')
  str = str.replaceAll('[__omnisearch__backtick__]', '`')
+  str = str.replaceAll('[__omnisearch__caret__]', '^')
  return str
 }