Fixed tokenization bug with Arabic diacritics

This commit is contained in:
Simon Cambier
2025-07-11 19:17:01 +02:00
parent d12312a43b
commit c25cc8e62b

View File

@@ -124,7 +124,7 @@ export function removeDiacritics(str: string, arabic = false): string {
// Arabic diacritics
// https://stackoverflow.com/a/40959537
str = str
-.replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
+// .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
.replace(/(آ|إ|أ)/g, 'ا')
.replace(/(ة)/g, 'ه')
.replace(/(ئ|ؤ)/g, 'ء')