Fixed tokenization bug with Arabic diacritics
This commit is contained in:
@@ -124,7 +124,7 @@ export function removeDiacritics(str: string, arabic = false): string {
|
||||
// Arabic diacritics
|
||||
// https://stackoverflow.com/a/40959537
|
||||
str = str
|
||||
- .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
|
||||
+ // .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
|
||||
.replace(/(آ|إ|أ)/g, 'ا')
|
||||
.replace(/(ة)/g, 'ه')
|
||||
.replace(/(ئ|ؤ)/g, 'ء')
|
||||
|
||||
Reference in New Issue
Block a user