Skip to content

Commit 61693a5

Browse files
committed
[UPDATE] Refactor BasicTokenizer usage to a new Checker class for text processing
1 parent 7f87755 commit 61693a5

File tree

2 files changed

+65
-432
lines changed

2 files changed

+65
-432
lines changed

examples/research_projects/anytext/anytext.py

Lines changed: 65 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,6 @@
3333
import PIL.Image
3434
import torch
3535
import torch.nn.functional as F
36-
from bert_tokenizer import BasicTokenizer
3736
from easydict import EasyDict as edict
3837
from frozen_clip_embedder_t3 import FrozenCLIPEmbedderT3
3938
from huggingface_hub import hf_hub_download
@@ -71,7 +70,71 @@
7170
from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
7271

7372

74-
checker = BasicTokenizer()
73+
class Checker:
    """Character-level text checks: CJK detection and control/whitespace cleanup.

    The methods read no instance state, so a single shared instance is enough.
    """

    # Inclusive code point ranges treated as "Chinese characters".
    #
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as are Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and are handled
    # like all of the other languages.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0x3400, 0x4DBF),  # Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )

    def _is_chinese_char(self, cp):
        """Check whether `cp` is the code point of a CJK character.

        Args:
            cp: Integer Unicode code point (e.g. the result of `ord(char)`).

        Returns:
            `True` if `cp` falls inside one of `_CJK_RANGES`, else `False`.
        """
        return any(low <= cp <= high for low, high in self._CJK_RANGES)

    def _clean_text(self, text):
        """Remove invalid characters from `text` and normalize its whitespace.

        NUL (U+0000), the replacement character (U+FFFD) and control characters
        are dropped; every whitespace character becomes a single ASCII space.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        cleaned = []
        for char in text:
            if ord(char) in (0, 0xFFFD) or self._is_control(char):
                continue
            cleaned.append(" " if self._is_whitespace(char) else char)
        return "".join(cleaned)

    def _is_control(self, char):
        """Check whether `char` is a control character.

        Args:
            char: A single-character string.

        Returns:
            `True` for Unicode categories "Cc" and "Cf", except tab, newline
            and carriage return; `False` otherwise.
        """
        # Imported locally so the class does not depend on a module-level
        # `import unicodedata` being present in the surrounding file.
        import unicodedata

        # These are technically control characters but we count them as
        # whitespace characters.
        if char in ("\t", "\n", "\r"):
            return False
        return unicodedata.category(char) in ("Cc", "Cf")

    def _is_whitespace(self, char):
        """Check whether `char` is a whitespace character.

        Args:
            char: A single-character string.

        Returns:
            `True` for space, tab, newline, carriage return and any character
            in Unicode category "Zs"; `False` otherwise.
        """
        # Imported locally so the class does not depend on a module-level
        # `import unicodedata` being present in the surrounding file.
        import unicodedata

        # \t, \n, and \r are technically control characters but we treat them
        # as whitespace since they are generally considered as such.
        if char in (" ", "\t", "\n", "\r"):
            return True
        return unicodedata.category(char) == "Zs"
135+
136+
137+
# Module-level Checker instance (takes the place of the former BasicTokenizer() instance).
checker = Checker()
75138

76139

77140
# Placeholder character constant. NOTE(review): its consumers are outside this view — confirm usage.
PLACE_HOLDER = "*"

0 commit comments

Comments
 (0)