Commit 825b192

Fix lowercase bug in wordpiece tokenizer (#1543)
* Fix lowercase bug
* Add a comment to explain
* Change mask builder
* Revert "Change mask builder". This reverts commit 5c9f61e.
1 parent 4b6970c commit 825b192

File tree: 2 files changed, +27 −2 lines changed

keras_nlp/tokenizers/word_piece_tokenizer.py

Lines changed: 12 additions & 2 deletions
@@ -151,8 +151,6 @@ def pretokenize(
         text = tf.expand_dims(text, 0)
     if split_on_cjk and split:
         text = tf.strings.regex_replace(text, CJK_REGEX, r" \0 ")
-    if lowercase:
-        text = tf_text.case_fold_utf8(text)
     if strip_accents:
         # Normalize unicode to NFD, which splits out accent mark characters.
         text = tf_text.normalize_utf8(text, "NFD")
@@ -187,6 +185,18 @@ def pretokenize(
             delim_regex_pattern=split_pattern,
             keep_delim_regex_pattern=keep_split_pattern,
         )
+    if lowercase:
+        if special_tokens_pattern is not None:
+            # Do not lowercase special tokens in string space. They often
+            # contain capital letters, e.g. `"[CLS]"`.
+            mask = (
+                tf.strings.regex_replace(text, special_tokens_pattern, "६")
+                == "६"
+            )
+            text = tf.where(mask, text, tf_text.case_fold_utf8(text))
+        else:
+            text = tf_text.case_fold_utf8(text)
+
     return text

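The core of the fix is the mask trick added above: each pretokenized piece is run through tf.strings.regex_replace with the special-token pattern, and only pieces that collapse entirely to the sentinel character "६" are treated as special and left untouched; everything else is case-folded. A minimal standalone sketch of the same idea, using an illustrative pattern and inputs rather than the tokenizer's internals:

import tensorflow as tf
import tensorflow_text as tf_text

# Illustrative pattern; the tokenizer builds its own from the configured
# special tokens (e.g. "[CLS]", "[SEP]", "[UNK]").
special_tokens_pattern = r"\[CLS\]|\[SEP\]|\[UNK\]"
text = tf.constant(["[CLS]", "THE", "QUICK", "[SEP]"])

# A piece counts as special if replacing every pattern match with the
# sentinel leaves nothing but the sentinel itself.
mask = tf.strings.regex_replace(text, special_tokens_pattern, "६") == "६"

# Case-fold only the non-special pieces; special tokens keep their casing.
text = tf.where(mask, text, tf_text.case_fold_utf8(text))
# -> [b"[CLS]", b"the", b"quick", b"[SEP]"]

The sentinel is simply a character that is very unlikely to be the sole content of a piece, so the equality check is a cheap way to ask "was this piece just a special token?".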
keras_nlp/tokenizers/word_piece_tokenizer_test.py

Lines changed: 15 additions & 0 deletions
@@ -110,6 +110,21 @@ def test_special_tokens_int_dtype(self):
         output = tokenizer(input_data)
         self.assertAllEqual(output, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])
 
+    def test_special_tokens_with_lowercase(self):
+        input_data = ["[UNK] [MASK] [SEP] [PAD] [CLS] THE QUICK BROWN FOX."]
+        special_tokens = ["[UNK]", "[MASK]", "[SEP]", "[PAD]", "[CLS]"]
+        vocab_data = ["the", "qu", "##ick", "br", "##own", "fox", "."]
+        vocab_data = [*special_tokens, *vocab_data]
+
+        tokenizer = WordPieceTokenizer(
+            vocabulary=vocab_data,
+            lowercase=True,
+            special_tokens=special_tokens,
+            special_tokens_in_strings=True,
+        )
+        output = tokenizer(input_data)
+        self.assertAllEqual(output, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])
+
     def test_cjk_tokens(self):
         input_data = ["ah半推zz"]
         vocab_data = ["[UNK]", "推", "敐", "乐", "半", "偷", "匕", "ah", "zz"]
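The new test covers the regression the commit fixes: with lowercase=True, special tokens appearing in the input string used to be case-folded along with everything else ("[CLS]" becoming "[cls]"), so they no longer matched their vocabulary entries. A sketch of the same scenario from a user's perspective, reusing the test's vocabulary and input (it assumes the public keras_nlp.tokenizers.WordPieceTokenizer API that the test exercises; the expected ids follow the vocabulary order shown above):

from keras_nlp.tokenizers import WordPieceTokenizer

special_tokens = ["[UNK]", "[MASK]", "[SEP]", "[PAD]", "[CLS]"]
vocab = [*special_tokens, "the", "qu", "##ick", "br", "##own", "fox", "."]

tokenizer = WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
    special_tokens=special_tokens,
    special_tokens_in_strings=True,
)

# Special tokens keep their capitalization and map to ids 0-4; the rest of
# the sentence is lowercased before wordpiece matching.
ids = tokenizer(["[UNK] [MASK] [SEP] [PAD] [CLS] THE QUICK BROWN FOX."])
# Expected, per the test above: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]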
