Skip to content

Commit 161237b

Browse files
authored
Add NFD normalizer (#1211)
* Add NFD normalizer
* Update test model ID
1 parent 591a112 commit 161237b

File tree

3 files changed

+80
-29
lines changed

3 files changed

+80
-29
lines changed

src/tokenizers.js

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,8 @@ class Normalizer extends Callable {
995995
return new Replace(config);
996996
case 'NFC':
997997
return new NFC(config);
998+
case 'NFD':
999+
return new NFD(config);
9981000
case 'NFKC':
9991001
return new NFKC(config);
10001002
case 'NFKD':
@@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
10531055
}
10541056

10551057
/**
1056-
* A normalizer that applies Unicode normalization form C (NFC) to the input text.
1058+
* A normalizer that applies Unicode normalization to the input text.
10571059
* @extends Normalizer
1060+
* @abstract
10581061
*/
1059-
class NFC extends Normalizer {
1062+
class UnicodeNormalizer extends Normalizer {
1063+
/**
1064+
* @type {string} The Unicode normalization form to apply.
1065+
* Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'.
1066+
*/
1067+
form = undefined;
1068+
10601069
/**
1061-
* Normalize the input text by applying Unicode normalization form C (NFC).
1070+
* Normalize the input text by applying Unicode normalization.
10621071
* @param {string} text The input text to be normalized.
10631072
* @returns {string} The normalized text.
10641073
*/
10651074
normalize(text) {
1066-
text = text.normalize('NFC')
1075+
text = text.normalize(this.form)
10671076
return text;
10681077
}
10691078
}
10701079

10711080
/**
1072-
* NFKC Normalizer.
1073-
* @extends Normalizer
1081+
* A normalizer that applies Unicode normalization form C (NFC) to the input text.
1082+
* Canonical Decomposition, followed by Canonical Composition.
1083+
* @extends UnicodeNormalizer
10741084
*/
1075-
class NFKC extends Normalizer {
1076-
/**
1077-
* Normalize text using NFKC normalization.
1078-
* @param {string} text The text to be normalized.
1079-
* @returns {string} The normalized text.
1080-
*/
1081-
normalize(text) {
1082-
text = text.normalize('NFKC')
1083-
return text;
1084-
}
1085+
class NFC extends UnicodeNormalizer {
1086+
form = 'NFC';
10851087
}
1088+
10861089
/**
1087-
* NFKD Normalizer.
1088-
* @extends Normalizer
1090+
* A normalizer that applies Unicode normalization form D (NFD) to the input text.
1091+
* Canonical Decomposition.
1092+
* @extends UnicodeNormalizer
10891093
*/
1090-
class NFKD extends Normalizer {
1091-
/**
1092-
* Normalize text using NFKD normalization.
1093-
* @param {string} text The text to be normalized.
1094-
* @returns {string} The normalized text.
1095-
*/
1096-
normalize(text) {
1097-
text = text.normalize('NFKD')
1098-
return text;
1099-
}
1094+
class NFD extends UnicodeNormalizer {
1095+
form = 'NFD';
1096+
}
1097+
1098+
/**
1099+
* A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
1100+
* Compatibility Decomposition, followed by Canonical Composition.
1101+
* @extends UnicodeNormalizer
1102+
*/
1103+
class NFKC extends UnicodeNormalizer {
1104+
form = 'NFKC';
1105+
}
1106+
1107+
/**
1108+
* A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
1109+
* Compatibility Decomposition.
1110+
* @extends UnicodeNormalizer
1111+
*/
1112+
class NFKD extends UnicodeNormalizer {
1113+
form = 'NFKD';
11001114
}
11011115

11021116
/**

tests/models/bert/test_tokenization_bert.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { BertTokenizer } from "../../../src/tokenizers.js";
2-
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
2+
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS, NORMALIZATION_TEST_STRINGS } from "../test_strings.js";
33

44
export const TOKENIZER_CLASS = BertTokenizer;
55
export const TEST_CONFIG = {
@@ -1341,4 +1341,31 @@ export const TEST_CONFIG = {
13411341
decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
13421342
},
13431343
},
1344+
// NFD normalizer
1345+
"onnx-community/language_detection-ONNX": {
1346+
DEFAULT_EXAMPLE: {
1347+
text: NORMALIZATION_TEST_STRINGS.DEFAULT_EXAMPLE,
1348+
tokens: ["ame", "##lie", "|", "ame", "##lie"],
1349+
ids: [1, 21947, 31933, 70, 21947, 31933, 2],
1350+
decoded: "[CLS] amelie | amelie [SEP]",
1351+
},
1352+
CANONICAL_EQUIVALENCE_NORMALIZATION: {
1353+
text: NORMALIZATION_TEST_STRINGS.CANONICAL_EQUIVALENCE_NORMALIZATION,
1354+
tokens: ["n", "|", "n"],
1355+
ids: [1, 56, 70, 56, 2],
1356+
decoded: "[CLS] n | n [SEP]",
1357+
},
1358+
COMPATIBILITY_NORMALIZATION: {
1359+
text: NORMALIZATION_TEST_STRINGS.COMPATIBILITY_NORMALIZATION,
1360+
tokens: ["[UNK]", "|", "ff"],
1361+
ids: [1, 0, 70, 40133, 2],
1362+
decoded: "[CLS] [UNK] | ff [SEP]",
1363+
},
1364+
COMBINED_EXAMPLE: {
1365+
text: NORMALIZATION_TEST_STRINGS.COMBINED_EXAMPLE,
1366+
tokens: ["ſ", "|", "ſ", "|", "ſ", "|", "s", "|", "s"],
1367+
ids: [1, 121, 70, 121, 70, 121, 70, 61, 70, 61, 2],
1368+
decoded: "[CLS] ſ | ſ | ſ | s | s [SEP]",
1369+
},
1370+
},
13441371
};

tests/models/test_strings.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,13 @@ export const M2M_100_TEST_STRINGS = {
113113
HIDNI_TEXT: "जीवन एक चॉकलेट बॉक्स की तरह है।",
114114
CHINESE_TEXT: "生活就像一盒巧克力。",
115115
};
116+
117+
// Test strings adapted from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
118+
export const NORMALIZATION_TEST_STRINGS = {
119+
DEFAULT_EXAMPLE: "\u0041\u006d\u00e9\u006c\u0069\u0065 | \u0041\u006d\u0065\u0301\u006c\u0069\u0065",
120+
CANONICAL_EQUIVALENCE_NORMALIZATION: "\u00F1 | \u006E\u0303",
121+
COMPATIBILITY_NORMALIZATION: "\uFB00 | \u0066\u0066",
122+
123+
// Original | NFC | NFD | NFKC | NFKD
124+
COMBINED_EXAMPLE: "\u1E9B\u0323 | \u1E9B\u0323 | \u017F\u0323\u0307 | \u1E69 | \u0073\u0323\u0307",
125+
};

0 commit comments

Comments
 (0)