@@ -103,10 +103,10 @@ public function __construct(protected array $tokenizerJSON, protected ?array $to
103103 $ addedTokensPatterns = array_map (function ($ x ) {
104104 $ lstrip = $ x ->lStrip ? '\s* ' : '' ;
105105 $ rstrip = $ x ->rStrip ? '\s* ' : '' ;
106- return $ lstrip. '( ' . preg_quote ($ x ->content , '/ ' ). ') ' . $ rstrip ;
106+ return $ lstrip . '( ' . preg_quote ($ x ->content , '/ ' ) . ') ' . $ rstrip ;
107107 }, $ this ->addedTokens );
108108
109- $ this ->addedTokensRegex = '/ ' . implode ('| ' , $ addedTokensPatterns ). '/ ' ;
109+ $ this ->addedTokensRegex = '/ ' . implode ('| ' , $ addedTokensPatterns ) . '/ ' ;
110110 }
111111
112112 // Set mask token if present
@@ -160,7 +160,7 @@ protected function getToken(string ...$keys): ?string
160160 if ($ item ['__type ' ] == 'AddedToken ' ) {
161161 return $ item ['content ' ];
162162 } else {
163- throw new Exception ("Unknown token: " . json_encode ($ item ));
163+ throw new Exception ("Unknown token: " . json_encode ($ item ));
164164 }
165165 } else {
166166 return $ item ;
@@ -184,9 +184,8 @@ public static function fromPretrained(
184184 string $ modelNameOrPath ,
185185 ?string $ cacheDir = null ,
186186 string $ revision = 'main ' ,
187- $ legacy = null ,
188- ): PreTrainedTokenizer
189- {
187+ $ legacy = null ,
188+ ): PreTrainedTokenizer {
190189 ['tokenizerJson ' => $ tokenizerJson , 'tokenizerConfig ' => $ tokenizerConfig ] =
191190 TokenizerModel::load ($ modelNameOrPath , $ cacheDir , $ revision , $ legacy );
192191
@@ -213,8 +212,7 @@ public function tokenize(
213212 bool $ truncation = false ,
214213 ?int $ maxLength = null ,
215214 bool $ returnTensor = true
216- ): array
217- {
215+ ): array {
218216 return $ this ->__invoke ($ text , $ textPair , $ padding , $ addSpecialTokens , $ truncation , $ maxLength , $ returnTensor );
219217 }
220218
@@ -239,8 +237,7 @@ public function __invoke(
239237 bool $ truncation = false ,
240238 ?int $ maxLength = null ,
241239 bool $ returnTensor = true
242- ): array
243- {
240+ ): array {
244241 $ isBatched = is_array ($ text );
245242
246243 $ encodedTokens = [];
@@ -258,13 +255,13 @@ public function __invoke(
258255 }
259256
260257 $ encodedTokens = array_map (
261- fn ($ t , $ i ) => $ this ->encodePlus ($ t , $ textPair [$ i ], $ addSpecialTokens ),
258+ fn ($ t , $ i ) => $ this ->encodePlus ($ t , $ textPair [$ i ], $ addSpecialTokens ),
262259 $ text ,
263260 array_keys ($ text )
264261 );
265262 } else {
266263 $ encodedTokens = array_map (
267- fn ($ x ) => $ this ->encodePlus ($ x , addSpecialTokens: $ addSpecialTokens ),
264+ fn ($ x ) => $ this ->encodePlus ($ x , addSpecialTokens: $ addSpecialTokens ),
268265 $ text
269266 );
270267 }
@@ -285,7 +282,7 @@ public function __invoke(
285282 $ maxLength = $ this ->modelMaxLength ;
286283 } else {
287284 // Calculate max length from sequences
288- $ maxLength = max (array_map (fn ($ x ) => count ($ x ['input_ids ' ]), $ encodedTokens ));
285+ $ maxLength = max (array_map (fn ($ x ) => count ($ x ['input_ids ' ]), $ encodedTokens ));
289286 }
290287 } else {
291288 if (!$ truncation ) {
@@ -314,7 +311,7 @@ public function __invoke(
314311 $ this ->padHelper (
315312 $ token ,
316313 $ maxLength ,
317- fn ($ key ) => $ key === 'input_ids ' ? $ this ->padTokenId : 0 ,
314+ fn ($ key ) => $ key === 'input_ids ' ? $ this ->padTokenId : 0 ,
318315 $ this ->paddingSide
319316 );
320317 }
@@ -353,15 +350,15 @@ public function __invoke(
353350 continue ;
354351 }
355352
356- $ array = array_map (fn ($ x ) => $ x [$ key ], $ encodedTokens );
353+ $ array = array_map (fn ($ x ) => $ x [$ key ], $ encodedTokens );
357354
358355 $ result [$ key ] = new Tensor ($ array , Tensor::int64, $ shape );
359356 }
360357 } else {
361358 $ result = [];
362359
363360 foreach ($ encodedTokens [0 ] as $ key => $ value ) {
364- $ result [$ key ] = array_map (fn ($ x ) => $ x [$ key ], $ encodedTokens );
361+ $ result [$ key ] = array_map (fn ($ x ) => $ x [$ key ], $ encodedTokens );
365362 }
366363
367364 // If not returning a tensor, we match the input type
@@ -388,8 +385,7 @@ public function encodePlus(
388385 string |null $ text ,
389386 string |null $ textPair = null ,
390387 bool $ addSpecialTokens = true
391- ): array
392- {
388+ ): array {
393389 // Function called by users to encode possibly multiple texts
394390 $ tokens = $ this ->encodeText ($ text );
395391
@@ -443,9 +439,8 @@ protected function encodeText(?string $text): ?array
443439 $ x = preg_replace ('/\s+/ ' , ' ' , trim ($ x ));
444440 }
445441
446-
447442 if ($ this ->doLowerCaseAndRemoveAccent ) {
448- $ x = TokenizerModel:: lowerCaseAndRemoveAccents ($ x );
443+ $ x = $ this -> lowerCaseAndRemoveAccents ($ x );
449444 }
450445
451446 if ($ this ->normalizer !== null ) {
@@ -522,7 +517,7 @@ protected function padHelper(array &$item, int $length, Closure $value_fn, strin
522517 *
523518 * @return array
524519 */
public function encode(string $text, ?string $textPair = null, bool $addSpecialTokens = true): array
{
    // Run the full encoding step, then keep only the token id list from its result.
    $encoding = $this->encodePlus($text, $textPair, $addSpecialTokens);

    return $encoding['input_ids'];
}
@@ -539,7 +534,7 @@ public function encode(string $text, string $textPair = null, bool $addSpecialTo
public function batchDecode(array|Tensor $batch, bool $skipSpecialTokens = false, ?bool $cleanUpTokenizationSpaces = null): array
{
    // A Tensor is unpacked into a plain array first; arrays are used as given.
    $sequences = $batch instanceof Tensor ? $batch->toArray() : $batch;

    // Decode each entry in order, keeping the keys of the incoming batch.
    $decoded = [];
    foreach ($sequences as $index => $tokenIds) {
        $decoded[$index] = $this->decode($tokenIds, $skipSpecialTokens, $cleanUpTokenizationSpaces);
    }

    return $decoded;
}
544539
545540 /**
@@ -574,7 +569,7 @@ private function decodeSingle(array $tokenIds, bool $skipSpecialTokens = false,
574569 $ tokens = $ this ->model ->convertIdsToTokens ($ tokenIds );
575570
576571 if ($ skipSpecialTokens ) {
577- $ tokens = array_values (array_filter ($ tokens , fn ($ x ) => !in_array ($ x , $ this ->specialTokens )));
572+ $ tokens = array_values (array_filter ($ tokens , fn ($ x ) => !in_array ($ x , $ this ->specialTokens )));
578573 }
579574
580575 // If `this.decoder` is null, we just join tokens with a space:
@@ -592,7 +587,6 @@ private function decodeSingle(array $tokenIds, bool $skipSpecialTokens = false,
592587 }
593588 }
594589
595-
596590 if ($ cleanUpTokenizationSpaces ?? $ this ->cleanUpTokenizationSpaces ) {
597591 $ decoded = TokenizerModel::cleanUpTokenization ($ decoded );
598592 }
@@ -644,8 +638,7 @@ public function applyChatTemplate(
644638 bool $ truncation = false ,
645639 ?int $ maxLength = null ,
646640 bool $ returnTensor = true
647- ): string |array
648- {
641+ ): string |array {
649642 $ chatTemplate ??= $ this ->chatTemplate ?? $ this ->getDefaultChatTemplate ();
650643
651644 // Compilation function uses a cache to avoid recompiling the same template
@@ -693,4 +686,28 @@ protected function getDefaultChatTemplate(): string
693686
694687 return $ this ->defaultChatTemplate ;
695688 }
689+
/**
 * Strip accents from a string and convert it to lowercase.
 *
 * Accent removal is delegated to removeAccents() and runs first; the
 * result is then lowercased with multibyte-aware case mapping.
 *
 * @param string $text The text to normalize, handled as UTF-8.
 *
 * @return string The accent-stripped, lowercased text.
 */
protected function lowerCaseAndRemoveAccents(string $text): string
{
    $withoutAccents = $this->removeAccents($text);

    // Pin the encoding instead of relying on the ambient mbstring internal
    // encoding: the accent-stripping step matches with the /u flag, so the
    // text is already being treated as UTF-8.
    return mb_strtolower($withoutAccents, 'UTF-8');
}
701+
/**
 * Remove combining diacritical marks from a string.
 *
 * Only code points U+0300 through U+036F are deleted. Precomposed accented
 * letters (for example U+00E9) are not decomposed here, so they pass
 * through unchanged.
 *
 * @param string $text The text to strip, interpreted as UTF-8.
 *
 * @return string The text without combining marks. When the pattern cannot
 *                be applied (e.g. the input is not valid UTF-8) the input
 *                is returned unchanged.
 */
protected function removeAccents(string $text): string
{
    // preg_replace() returns null on failure; with the /u flag that includes
    // malformed UTF-8 subjects. Returning null would violate the declared
    // string return type, so fall back to the untouched input instead.
    return preg_replace('/[\x{0300}-\x{036f}]/u', '', $text) ?? $text;
}
696713}
0 commit comments