Add NFD normalizer (#1211)

xenova · web-flow · commit 161237b096ab · 2025-02-26T16:41:36.000+02:00
* Add NFD normalizer

* Update test model ID
diff --git a/src/tokenizers.js b/src/tokenizers.js
@@ -995,6 +995,8 @@ class Normalizer extends Callable {
                 return new Replace(config);
             case 'NFC':
                 return new NFC(config);
+            case 'NFD':
+                return new NFD(config);
             case 'NFKC':
                 return new NFKC(config);
             case 'NFKD':
@@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
 }
 
 /**
- * A normalizer that applies Unicode normalization form C (NFC) to the input text.
+ * A normalizer that applies Unicode normalization to the input text.
  * @extends Normalizer
+ * @abstract
  */
-class NFC extends Normalizer {
+class UnicodeNormalizer extends Normalizer {
+    /**
+     * @type {string} The Unicode normalization form to apply; each concrete subclass overrides this field.
+     * Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'. While left `undefined`, `String.prototype.normalize` falls back to 'NFC'.
+     */
+    form = undefined;
+
     /**
-     * Normalize the input text by applying Unicode normalization form C (NFC).
+     * Normalize the input text by applying the Unicode normalization form stored in `this.form`.
      * @param {string} text The input text to be normalized.
      * @returns {string} The normalized text.
      */
     normalize(text) {
-        text = text.normalize('NFC')
+        text = text.normalize(this.form)
         return text;
     }
 }
 
 /**
- * NFKC Normalizer.
- * @extends Normalizer
+ * A normalizer that applies Unicode normalization form C (NFC) to the input text.
+ * Canonical Decomposition, followed by Canonical Composition (e.g. U+006E U+0303 becomes U+00F1).
+ * @extends UnicodeNormalizer
  */
-class NFKC extends Normalizer {
-    /**
-     * Normalize text using NFKC normalization.
-     * @param {string} text The text to be normalized.
-     * @returns {string} The normalized text.
-     */
-    normalize(text) {
-        text = text.normalize('NFKC')
-        return text;
-    }
+class NFC extends UnicodeNormalizer {
+    form = 'NFC';
 }
+
 /**
- * NFKD Normalizer.
- * @extends Normalizer
+ * A normalizer that applies Unicode normalization form D (NFD) to the input text.
+ * Canonical Decomposition (e.g. U+00F1 becomes U+006E U+0303); compatibility characters such as U+FB00 are left as-is.
+ * @extends UnicodeNormalizer
  */
-class NFKD extends Normalizer {
-    /**
-     * Normalize text using NFKD normalization.
-     * @param {string} text The text to be normalized.
-     * @returns {string} The normalized text.
-     */
-    normalize(text) {
-        text = text.normalize('NFKD')
-        return text;
-    }
+class NFD extends UnicodeNormalizer {
+    form = 'NFD';
+}
+
+/**
+ * A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
+ * Compatibility Decomposition, followed by Canonical Composition (e.g. the ligature U+FB00 becomes "ff").
+ * @extends UnicodeNormalizer
+ */
+class NFKC extends UnicodeNormalizer {
+    form = 'NFKC';
+}
+
+/**
+ * A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
+ * Compatibility Decomposition (e.g. U+FB00 becomes "ff", and U+00F1 becomes U+006E U+0303).
+ * @extends UnicodeNormalizer
+ */
+class NFKD extends UnicodeNormalizer {
+    form = 'NFKD';
 }
 
 /**
diff --git a/tests/models/bert/test_tokenization_bert.js b/tests/models/bert/test_tokenization_bert.js
@@ -1,5 +1,5 @@
 import { BertTokenizer } from "../../../src/tokenizers.js";
-import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
+import { BASE_TEST_STRINGS, BERT_TEST_STRINGS, NORMALIZATION_TEST_STRINGS } from "../test_strings.js";
 
 export const TOKENIZER_CLASS = BertTokenizer;
 export const TEST_CONFIG = {
@@ -1341,4 +1341,31 @@ export const TEST_CONFIG = {
       decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
     },
   },
+  // NFD normalizer
+  "onnx-community/language_detection-ONNX": {
+    DEFAULT_EXAMPLE: {
+      text: NORMALIZATION_TEST_STRINGS.DEFAULT_EXAMPLE,
+      tokens: ["ame", "##lie", "|", "ame", "##lie"],
+      ids: [1, 21947, 31933, 70, 21947, 31933, 2],
+      decoded: "[CLS] amelie | amelie [SEP]",
+    },
+    CANONICAL_EQUIVALENCE_NORMALIZATION: {
+      text: NORMALIZATION_TEST_STRINGS.CANONICAL_EQUIVALENCE_NORMALIZATION,
+      tokens: ["n", "|", "n"],
+      ids: [1, 56, 70, 56, 2],
+      decoded: "[CLS] n | n [SEP]",
+    },
+    COMPATIBILITY_NORMALIZATION: {
+      text: NORMALIZATION_TEST_STRINGS.COMPATIBILITY_NORMALIZATION,
+      tokens: ["[UNK]", "|", "ff"],
+      ids: [1, 0, 70, 40133, 2],
+      decoded: "[CLS] [UNK] | ff [SEP]",
+    },
+    COMBINED_EXAMPLE: {
+      text: NORMALIZATION_TEST_STRINGS.COMBINED_EXAMPLE,
+      tokens: ["ſ", "|", "ſ", "|", "ſ", "|", "s", "|", "s"],
+      ids: [1, 121, 70, 121, 70, 121, 70, 61, 70, 61, 2],
+      decoded: "[CLS] ſ | ſ | ſ | s | s [SEP]",
+    },
+  },
 };
diff --git a/tests/models/test_strings.js b/tests/models/test_strings.js
@@ -113,3 +113,13 @@ export const M2M_100_TEST_STRINGS = {
   HIDNI_TEXT: "जीवन एक चॉकलेट बॉक्स की तरह है।",
   CHINESE_TEXT: "生活就像一盒巧克力。",
 };
+
+// Test strings adapted from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
+export const NORMALIZATION_TEST_STRINGS = {
+  DEFAULT_EXAMPLE: "\u0041\u006d\u00e9\u006c\u0069\u0065 | \u0041\u006d\u0065\u0301\u006c\u0069\u0065",
+  CANONICAL_EQUIVALENCE_NORMALIZATION: "\u00F1 | \u006E\u0303",
+  COMPATIBILITY_NORMALIZATION: "\uFB00 | \u0066\u0066",
+
+  // Five " | "-separated segments: Original | NFC | NFD | NFKC | NFKD forms of U+1E9B U+0323
+  COMBINED_EXAMPLE: "\u1E9B\u0323 | \u1E9B\u0323 | \u017F\u0323\u0307 | \u1E69 | \u0073\u0323\u0307",
+};