CodeWithKyrian
diff --git a/‎src/Normalizers/BertNormalizer.php‎
Lines changed: 76 additions & 33 deletions b/‎src/Normalizers/BertNormalizer.php‎
Lines changed: 76 additions & 33 deletions
diff --git a/‎src/Normalizers/StripAccents.php‎
Lines changed: 6 additions & 5 deletions b/‎src/Normalizers/StripAccents.php‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/PreTrainedTokenizers/PreTrainedTokenizer.php‎
Lines changed: 43 additions & 26 deletions b/‎src/PreTrainedTokenizers/PreTrainedTokenizer.php‎
Lines changed: 43 additions & 26 deletions
@@ -2,16 +2,57 @@
 
 declare(strict_types=1);
 
-
 namespace Codewithkyrian\Transformers\Normalizers;
 
-use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
-
 /**
  * A class representing a normalizer used in BERT tokenization.
  */
 class BertNormalizer extends Normalizer
 {
+    /**
+     * Performs invalid character removal and whitespace cleanup on text.
+     *
+     * NUL (U+0000), the replacement character (U+FFFD) and every character that
+     * isControl() reports as a control character are dropped. Each remaining
+     * whitespace character is replaced by a single ASCII space; all other
+     * characters are copied through unchanged.
+     *
+     * @param string $text The text to clean.
+     * @return string The cleaned text.
+     */
+    public function cleanText(string $text): string
+    {
+        $output = '';
+        // mb_str_split() walks the string once; indexing with mb_substr() inside a
+        // loop re-scans the string from the start on every iteration.
+        foreach (mb_str_split($text) as $char) {
+            $cp = mb_ord($char);
+            if ($cp === 0 || $cp === 0xFFFD || $this->isControl($char)) {
+                continue;
+            }
+            // The `u` modifier is required here: without it the pattern is matched
+            // byte-wise, so multi-byte whitespace (e.g. U+00A0, U+3000) never
+            // satisfies `^\s$` and would be left untouched.
+            $output .= preg_match('/^\s$/u', $char) === 1 ? ' ' : $char;
+        }
+        return $output;
+    }
+
+    /**
+     * Normalizes text according to the BERT normalization options in the config.
+     *
+     * Steps run in this order, each only when enabled: `clean_text`,
+     * `handle_chinese_chars`, `lowercase`, `strip_accents`. When `strip_accents`
+     * is missing or null it follows the `lowercase` setting, which is how the
+     * reference BERT normalizer resolves an unspecified value; an explicit
+     * true/false always takes precedence.
+     *
+     * @param string $text The text to normalize.
+     * @return string The normalized text.
+     */
+    public function normalize(string $text): string
+    {
+        if ($this->config['clean_text'] ?? false) {
+            $text = $this->cleanText($text);
+        }
+
+        if ($this->config['handle_chinese_chars'] ?? false) {
+            $text = $this->tokenizeChineseChars($text);
+        }
+
+        $lowercase = (bool) ($this->config['lowercase'] ?? false);
+        if ($lowercase) {
+            $text = mb_strtolower($text);
+        }
+
+        // Unspecified `strip_accents` inherits `lowercase`; tokenizer configs for
+        // uncased models commonly ship it as null and still expect accents removed.
+        if ($this->config['strip_accents'] ?? $lowercase) {
+            $text = $this->stripAccents($text);
+        }
+
+        return $text;
+    }
 
     /**
      * Strips accents from the given text.
@@ -43,47 +84,49 @@ protected function isControl(string $char): bool
     }
 
     /**
-     * Performs invalid character removal and whitespace cleanup on text.
-     * @param string $text The text to clean.
-     * @return string The cleaned text.
-     * @private
+     * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character.
+     *
+     * A "chinese character" is defined as anything in the CJK Unicode block.
+     *
+     * @param int $cp The Unicode codepoint to check.
+     *
+     * @return bool True if the codepoint represents a CJK character, false otherwise.
      */
-    function cleanText(string $text): string
+    protected function isChineseChar(int $cp): bool
+    {
+        // Inclusive [first, last] code point ranges that count as "Chinese characters".
+        $ranges = [
+            [0x4E00, 0x9FFF],
+            [0x3400, 0x4DBF],
+            [0x20000, 0x2A6DF],
+            [0x2A700, 0x2B73F],
+            [0x2B740, 0x2B81F],
+            [0x2B820, 0x2CEAF],
+            [0xF900, 0xFAFF],
+            [0x2F800, 0x2FA1F],
+        ];
+
+        // Report a match as soon as the code point falls inside any one range.
+        foreach ($ranges as [$first, $last]) {
+            if ($cp >= $first && $cp <= $last) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Adds whitespace around any CJK (Chinese, Japanese, or Korean) character in the input text.
+     *
+     * The text is walked one multibyte character at a time. A character whose code
+     * point isChineseChar() accepts is emitted with one space before it and one
+     * space after it; every other character is emitted unchanged. The inserted
+     * spaces are not collapsed, so two adjacent CJK characters end up separated
+     * by two spaces.
+     *
+     * @param string $text The input text to tokenize.
+     *
+     * @return string The tokenized text with whitespace added around CJK characters.
+     */
+    public function tokenizeChineseChars(string $text): string
     {
         $output = [];
         for ($i = 0; $i < mb_strlen($text); ++$i) {
             $char = mb_substr($text, $i, 1);
             $cp = mb_ord($char);
-            if ($cp === 0 || $cp === 0xFFFD || $this->isControl($char)) {
-                continue;
-            }
-            if (preg_match('/^\s$/', $char)) { // is whitespace
+            if ($this->isChineseChar($cp)) {
+                $output[] = " ";
+                $output[] = $char;
                 $output[] = " ";
             } else {
                 $output[] = $char;
             }
         }
         return implode("", $output);
     }
-
-    public function normalize(string $text): string
-    {
-        if ($this->config['clean_text'] ?? false) {
-            $text = $this->cleanText($text);
-        }
-
-        if ($this->config['handle_chinese_chars'] ?? false) {
-            $text = TokenizerModel::tokenizeChineseChars($text);
-        }
-
-        if ($this->config['lowercase'] ?? false) {
-            $text = mb_strtolower($text);
-        }
-
-        if ($this->config['strip_accents'] ?? false) {
-            $text = $this->stripAccents($text);
-        }
-
-        return $text;
-    }
 }
@@ -2,19 +2,20 @@
 
 declare(strict_types=1);
 
-
 namespace Codewithkyrian\Transformers\Normalizers;
 
-use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
-
 /**
  * StripAccents normalizer removes all accents from the text.
  */
 class StripAccents extends Normalizer
 {
-
+    /**
+     * Removes accents from the text.
+     *
+     * Only combining diacritical marks in U+0300..U+036F are removed, so
+     * precomposed letters (e.g. U+00E9) pass through unchanged unless the
+     * caller has decomposed the text beforehand.
+     *
+     * @param string $text The text to remove accents from.
+     * @return string The text with accents removed, or the input unchanged
+     *                when the pattern cannot be applied (e.g. invalid UTF-8).
+     */
     public function normalize(string $text): string
     {
-        return TokenizerModel::removeAccents($text);
+        // preg_replace() yields null on a PCRE error (such as malformed UTF-8 under
+        // the `u` modifier); returning that would violate the string return type.
+        return preg_replace('/[\x{0300}-\x{036f}]/u', '', $text) ?? $text;
     }
 }
@@ -103,10 +103,10 @@ public function __construct(protected array $tokenizerJSON, protected ?array $to
             $addedTokensPatterns = array_map(function ($x) {
                 $lstrip = $x->lStrip ? '\s*' : '';
                 $rstrip = $x->rStrip ? '\s*' : '';
-                return $lstrip.'('.preg_quote($x->content, '/').')'.$rstrip;
+                return $lstrip . '(' . preg_quote($x->content, '/') . ')' . $rstrip;
             }, $this->addedTokens);
 
-            $this->addedTokensRegex = '/'.implode('|', $addedTokensPatterns).'/';
+            $this->addedTokensRegex = '/' . implode('|', $addedTokensPatterns) . '/';
         }
 
         // Set mask token if present
@@ -160,7 +160,7 @@ protected function getToken(string ...$keys): ?string
                 if ($item['__type'] == 'AddedToken') {
                     return $item['content'];
                 } else {
-                    throw new Exception("Unknown token: ".json_encode($item));
+                    throw new Exception("Unknown token: " . json_encode($item));
                 }
             } else {
                 return $item;
@@ -184,9 +184,8 @@ public static function fromPretrained(
         string  $modelNameOrPath,
         ?string $cacheDir = null,
         string  $revision = 'main',
-                $legacy = null,
-    ): PreTrainedTokenizer
-    {
+        $legacy = null,
+    ): PreTrainedTokenizer {
         ['tokenizerJson' => $tokenizerJson, 'tokenizerConfig' => $tokenizerConfig] =
             TokenizerModel::load($modelNameOrPath, $cacheDir, $revision, $legacy);
 
@@ -213,8 +212,7 @@ public function tokenize(
         bool              $truncation = false,
         ?int              $maxLength = null,
         bool              $returnTensor = true
-    ): array
-    {
+    ): array {
         return $this->__invoke($text, $textPair, $padding, $addSpecialTokens, $truncation, $maxLength, $returnTensor);
     }
 
@@ -239,8 +237,7 @@ public function __invoke(
         bool              $truncation = false,
         ?int              $maxLength = null,
         bool              $returnTensor = true
-    ): array
-    {
+    ): array {
         $isBatched = is_array($text);
 
         $encodedTokens = [];
@@ -258,13 +255,13 @@ public function __invoke(
                 }
 
                 $encodedTokens = array_map(
-                    fn ($t, $i) => $this->encodePlus($t, $textPair[$i], $addSpecialTokens),
+                    fn($t, $i) => $this->encodePlus($t, $textPair[$i], $addSpecialTokens),
                     $text,
                     array_keys($text)
                 );
             } else {
                 $encodedTokens = array_map(
-                    fn ($x) => $this->encodePlus($x, addSpecialTokens: $addSpecialTokens),
+                    fn($x) => $this->encodePlus($x, addSpecialTokens: $addSpecialTokens),
                     $text
                 );
             }
@@ -285,7 +282,7 @@ public function __invoke(
                 $maxLength = $this->modelMaxLength;
             } else {
                 // Calculate max length from sequences
-                $maxLength = max(array_map(fn ($x) => count($x['input_ids']), $encodedTokens));
+                $maxLength = max(array_map(fn($x) => count($x['input_ids']), $encodedTokens));
             }
         } else {
             if (!$truncation) {
@@ -314,7 +311,7 @@ public function __invoke(
                         $this->padHelper(
                             $token,
                             $maxLength,
-                            fn ($key) => $key === 'input_ids' ? $this->padTokenId : 0,
+                            fn($key) => $key === 'input_ids' ? $this->padTokenId : 0,
                             $this->paddingSide
                         );
                     }
@@ -353,15 +350,15 @@ public function __invoke(
                     continue;
                 }
 
-                $array = array_map(fn ($x) => $x[$key], $encodedTokens);
+                $array = array_map(fn($x) => $x[$key], $encodedTokens);
 
                 $result[$key] = new Tensor($array, Tensor::int64, $shape);
             }
         } else {
             $result = [];
 
             foreach ($encodedTokens[0] as $key => $value) {
-                $result[$key] = array_map(fn ($x) => $x[$key], $encodedTokens);
+                $result[$key] = array_map(fn($x) => $x[$key], $encodedTokens);
             }
 
             // If not returning a tensor, we match the input type
@@ -388,8 +385,7 @@ public function encodePlus(
         string|null $text,
         string|null $textPair = null,
         bool        $addSpecialTokens = true
-    ): array
-    {
+    ): array {
         // Function called by users to encode possibly multiple texts
         $tokens = $this->encodeText($text);
 
@@ -443,9 +439,8 @@ protected function encodeText(?string $text): ?array
                     $x = preg_replace('/\s+/', ' ', trim($x));
                 }
 
-
                 if ($this->doLowerCaseAndRemoveAccent) {
-                    $x = TokenizerModel::lowerCaseAndRemoveAccents($x);
+                    $x = $this->lowerCaseAndRemoveAccents($x);
                 }
 
                 if ($this->normalizer !== null) {
@@ -522,7 +517,7 @@ protected function padHelper(array &$item, int $length, Closure $value_fn, strin
      *
      * @return array
      */
-    public function encode(string $text, string $textPair = null, bool $addSpecialTokens = true): array
+    public function encode(string $text, ?string $textPair = null, bool $addSpecialTokens = true): array
     {
         return $this->encodePlus($text, $textPair, $addSpecialTokens)['input_ids'];
     }
@@ -539,7 +534,7 @@ public function encode(string $text, string $textPair = null, bool $addSpecialTo
     public function batchDecode(array|Tensor $batch, bool $skipSpecialTokens = false, ?bool $cleanUpTokenizationSpaces = null): array
     {
         if ($batch instanceof Tensor) $batch = $batch->toArray();
-        return array_map(fn ($x) => $this->decode($x, $skipSpecialTokens, $cleanUpTokenizationSpaces), $batch);
+        return array_map(fn($x) => $this->decode($x, $skipSpecialTokens, $cleanUpTokenizationSpaces), $batch);
     }
 
     /**
@@ -574,7 +569,7 @@ private function decodeSingle(array $tokenIds, bool $skipSpecialTokens = false,
         $tokens = $this->model->convertIdsToTokens($tokenIds);
 
         if ($skipSpecialTokens) {
-            $tokens = array_values(array_filter($tokens, fn ($x) => !in_array($x, $this->specialTokens)));
+            $tokens = array_values(array_filter($tokens, fn($x) => !in_array($x, $this->specialTokens)));
         }
 
         // If `this.decoder` is null, we just join tokens with a space:
@@ -592,7 +587,6 @@ private function decodeSingle(array $tokenIds, bool $skipSpecialTokens = false,
             }
         }
 
-
         if ($cleanUpTokenizationSpaces ?? $this->cleanUpTokenizationSpaces) {
             $decoded = TokenizerModel::cleanUpTokenization($decoded);
         }
@@ -644,8 +638,7 @@ public function applyChatTemplate(
         bool    $truncation = false,
         ?int    $maxLength = null,
         bool    $returnTensor = true
-    ): string|array
-    {
+    ): string|array {
         $chatTemplate ??= $this->chatTemplate ?? $this->getDefaultChatTemplate();
 
         // Compilation function uses a cache to avoid recompiling the same template
@@ -693,4 +686,28 @@ protected function getDefaultChatTemplate(): string
 
         return $this->defaultChatTemplate;
     }
+
+    /**
+     * Helper function to lowercase a string and remove accents.
+     *
+     * Lowercasing runs first because case mapping can itself produce combining
+     * marks (for example U+0130 lowercases to "i" followed by U+0307); stripping
+     * accents afterwards removes those as well.
+     *
+     * @param string $text The text to lowercase and remove accents from.
+     *
+     * @return string The text with accents removed and lowercased.
+     */
+    protected function lowerCaseAndRemoveAccents(string $text): string
+    {
+        return $this->removeAccents(mb_strtolower($text));
+    }
+
+    /**
+     * Helper function to remove accents from a string.
+     *
+     * Only combining diacritical marks in U+0300..U+036F are removed; precomposed
+     * letters are left as they are.
+     *
+     * @param string $text The text to remove accents from.
+     *
+     * @return string The text with accents removed, or the input unchanged when
+     *                the pattern cannot be applied (e.g. invalid UTF-8).
+     */
+    protected function removeAccents(string $text): string
+    {
+        // preg_replace() yields null on a PCRE error (such as malformed UTF-8 under
+        // the `u` modifier); returning that would violate the string return type.
+        return preg_replace('/[\x{0300}-\x{036f}]/u', '', $text) ?? $text;
+    }
 }
Original file line number	Diff line number	Diff line change
`@@ -2,19 +2,20 @@`
`2`	`2`
`3`	`3`	`declare(strict_types=1);`
`4`	`4`
`5`		`-`
`6`	`5`	`namespace Codewithkyrian\Transformers\Normalizers;`
`7`	`6`
`8`		`-use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;`
`9`		`-`
`10`	`7`	`/**`
`11`	`8`	`* StripAccents normalizer removes all accents from the text.`
`12`	`9`	`*/`
`13`	`10`	`class StripAccents extends Normalizer`
`14`	`11`	`{`
`15`		`-`
	`12`	`+ /**`
	`13`	`+ * Removes accents from the text.`
	`14`	`+ * @param string $text The text to remove accents from.`
	`15`	`+ * @return string The text with accents removed.`
	`16`	`+ */`
`16`	`17`	`public function normalize(string $text): string`
`17`	`18`	`{`
`18`		`- return TokenizerModel::removeAccents($text);`
	`19`	`+ return preg_replace('/[\x{0300}-\x{036f}]/u', '', $text);`
`19`	`20`	`}`
`20`	`21`	`}`