huggingface
diff --git a/‎examples/research_projects/anytext/anytext.py‎
Lines changed: 65 additions & 2 deletions b/‎examples/research_projects/anytext/anytext.py‎
Lines changed: 65 additions & 2 deletions
@@ -33,7 +33,6 @@
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from bert_tokenizer import BasicTokenizer
 from easydict import EasyDict as edict
 from frozen_clip_embedder_t3 import FrozenCLIPEmbedderT3
 from huggingface_hub import hf_hub_download
@@ -71,7 +70,71 @@
 from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
 
 
-checker = BasicTokenizer()
+class Checker:
+    """Character-level helpers: a CJK code point test, control/whitespace tests, and text cleanup."""
+
+    def __init__(self):
+        pass  # nothing to set up: no attributes are assigned here
+
+    def _is_chinese_char(self, cp):
+        """Return True if the integer code point `cp` lies in one of the CJK ranges listed below."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as are Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Return `text` with NUL, U+FFFD and control characters dropped and each whitespace char replaced by a space."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or self._is_control(char):
+                continue
+            if self._is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_control(self, char):
+        """Return True if `char` has Unicode category Cc or Cf, except tab, newline and carriage return."""
+        # These are technically control characters but we count them as whitespace characters.
+        if char == "\t" or char == "\n" or char == "\r":
+            return False
+        cat = unicodedata.category(char)
+        if cat in ("Cc", "Cf"):
+            return True
+        return False
+
+    def _is_whitespace(self, char):
+        """Return True if `char` is a space, tab, newline, carriage return, or has Unicode category Zs."""
+        # \t, \n, and \r are technically control characters, but we treat them as whitespace.
+        if char == " " or char == "\t" or char == "\n" or char == "\r":
+            return True
+        cat = unicodedata.category(char)
+        if cat == "Zs":
+            return True
+        return False
+
+
+checker = Checker()  # module-level instance of the helper class defined above
 
 
 PLACE_HOLDER = "*"