From c25cc8e62b9318fd0f698d3bfb743e54f81e9617 Mon Sep 17 00:00:00 2001 From: Simon Cambier Date: Fri, 11 Jul 2025 19:17:01 +0200 Subject: [PATCH] Fixed tokenization bug with Arabic diacritics --- src/tools/utils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/utils.ts b/src/tools/utils.ts index 65cb5e2..61f7db6 100644 --- a/src/tools/utils.ts +++ b/src/tools/utils.ts @@ -124,7 +124,7 @@ export function removeDiacritics(str: string, arabic = false): string { // Arabic diacritics // https://stackoverflow.com/a/40959537 str = str - .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '') + // .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '') .replace(/(آ|إ|أ)/g, 'ا') .replace(/(ة)/g, 'ه') .replace(/(ئ|ؤ)/g, 'ء')