Merge pull request #2 from CodeWithKyrian/chore/php-8-2-and-readonly-classes

CodeWithKyrian · web-flow · commit e7d79fea686b · 2026-02-04T12:07:47.000+01:00
Require PHP 8.2 and make core tokenization types readonly
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        php: ['8.1', '8.2', '8.3', '8.4']
+        php: ['8.2', '8.3', '8.4']
 
     steps:
       - name: Checkout
diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php
@@ -1,7 +1,7 @@
 <?php
 
 $finder = (new PhpCsFixer\Finder())
-    ->in(__DIR__);
+    ->in(__DIR__)->exclude('tests');
 
 return (new PhpCsFixer\Config())
     ->setParallelConfig(PhpCsFixer\Runner\Parallel\ParallelConfigFactory::detect())
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 - **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads.
 - **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files.
 - **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more.
-- **Modern PHP** — Built for PHP 8.1+ with strict types, readonly properties, and clean interfaces.
+- **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces.
 
 ## Installation
 
diff --git a/composer.json b/composer.json
@@ -31,7 +31,7 @@
         }
     },
     "require": {
-        "php": "^8.1",
+        "php": "^8.2",
         "psr/http-client": "^1.0",
         "psr/http-factory": "^1.0",
         "php-http/discovery": "^1.19"
diff --git a/examples/document_chunking_pipeline.php b/examples/document_chunking_pipeline.php
@@ -232,7 +232,7 @@ function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $o
     $originalWords = array_slice(explode(' ', $chunk), 0, 5);
     $decodedWords = explode(' ', $decoded);
 
-    $matchCount = count(array_filter($originalWords, fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
+    $matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
 
     if ($matchCount < 3) {
         echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n";
diff --git a/examples/text_classification_preprocessing.php b/examples/text_classification_preprocessing.php
@@ -138,8 +138,8 @@
     echo "  Structure: [CLS] premise [SEP] hypothesis [SEP]\n";
 
     // Type IDs distinguish between premise (0) and hypothesis (1)
-    $segment0Count = count(array_filter($encoding->typeIds, fn ($t) => 0 === $t));
-    $segment1Count = count(array_filter($encoding->typeIds, fn ($t) => 1 === $t));
+    $segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t));
+    $segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t));
 
     echo "  Segment A (premise) tokens: {$segment0Count}\n";
     echo "  Segment B (hypothesis) tokens: {$segment1Count}\n";
diff --git a/src/DataStructures/AddedToken.php b/src/DataStructures/AddedToken.php
@@ -10,7 +10,7 @@
  *   - Whether they should only match single words
  *   - Whether to include any whitespace on its left or right.
  */
-class AddedToken
+readonly class AddedToken
 {
     public function __construct(
         /**
diff --git a/src/DataStructures/TokenLattice.php b/src/DataStructures/TokenLattice.php
@@ -117,6 +117,6 @@ public function tokenIds(): array
     {
         $nodes = $this->viterbi();
 
-        return array_map(fn ($x) => $x->tokenId, $nodes);
+        return array_map(static fn ($x) => $x->tokenId, $nodes);
     }
 }
diff --git a/src/Decoders/ByteLevelDecoder.php b/src/Decoders/ByteLevelDecoder.php
@@ -302,7 +302,7 @@ protected function convertTokensToString(array $tokens): string
     {
         $text = implode('', $tokens);
         $textArray = preg_split('//u', $text, -1, \PREG_SPLIT_NO_EMPTY);
-        $byteArray = array_map(fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
+        $byteArray = array_map(static fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
         $binaryString = pack('C*', ...$byteArray);
 
         return mb_convert_encoding($binaryString, 'UTF-8');
diff --git a/src/Decoders/DecoderSequence.php b/src/Decoders/DecoderSequence.php
@@ -15,7 +15,7 @@ protected function processTokens(array $tokens): array
     {
         return array_reduce(
             $this->decoders,
-            fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
+            static fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
             $tokens
         );
     }
diff --git a/src/Encoding.php b/src/Encoding.php
@@ -7,16 +7,16 @@
 /**
  * Represents the output of tokenization.
  */
-class Encoding
+readonly class Encoding
 {
     /**
      * @param int[]    $ids     The list of token IDs
      * @param string[] $tokens  The list of tokens
      * @param int[]    $typeIds The list of type IDs
      */
     public function __construct(
-        public readonly array $ids,
-        public readonly array $tokens,
-        public readonly array $typeIds = [],
+        public array $ids,
+        public array $tokens,
+        public array $typeIds = [],
     ) {}
 }
diff --git a/src/Factories/NormalizerFactory.php b/src/Factories/NormalizerFactory.php
@@ -35,7 +35,7 @@ public static function create(array $config): NormalizerInterface
             ),
             'Lowercase' => new LowercaseNormalizer(),
             'Sequence' => new NormalizerSequence(
-                array_map(fn ($c) => self::create($c), $config['normalizers'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['normalizers'] ?? [])
             ),
             'Strip' => new StripNormalizer(
                 stripLeft: $config['strip_left'] ?? true,
diff --git a/src/Factories/PostProcessorFactory.php b/src/Factories/PostProcessorFactory.php
@@ -45,7 +45,7 @@ public static function create(array $config): ?PostProcessorInterface
                 pair: $config['pair']
             ),
             'Sequence' => new PostProcessorSequence(
-                array_map(fn ($c) => self::create($c), $config['processors'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['processors'] ?? [])
             ),
             default => throw new \Exception("Unknown post-processor type: {$type}"),
         };
diff --git a/src/Factories/PreTokenizerFactory.php b/src/Factories/PreTokenizerFactory.php
@@ -55,7 +55,7 @@ public static function create(array $config): PreTokenizerInterface
                 invert: $config['invert'] ?? true
             ),
             'Sequence' => new PreTokenizerSequence(
-                array_map(fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
+                array_map(static fn ($c) => self::create($c), $config['pretokenizers'] ?? [])
             ),
             default => throw new \Exception("Unknown pre-tokenizer type: {$type}"),
         };
diff --git a/src/Models/AbstractModel.php b/src/Models/AbstractModel.php
@@ -45,7 +45,7 @@ public function decode(array $ids): array
             $tokens[] = $this->vocab[$id] ?? $this->unkToken ?? null;
         }
 
-        return array_filter($tokens, fn ($t) => null !== $t);
+        return array_filter($tokens, static fn ($t) => null !== $t);
     }
 
     /**
@@ -65,7 +65,7 @@ public function encode(array $tokens): array
         }
 
         // Removing nulls in case unkTokenId is null (though should exist)
-        return array_filter($ids, fn ($id) => null !== $id);
+        return array_filter($ids, static fn ($id) => null !== $id);
     }
 
     /**
diff --git a/src/Models/BPEModel.php b/src/Models/BPEModel.php
@@ -59,7 +59,7 @@ public function __construct(
             $this->merges = $merges;
         } else {
             $this->merges = array_map(
-                fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
+                static fn ($merge) => \is_string($merge) ? explode(' ', $merge, 2) : $merge,
                 $merges
             );
         }
diff --git a/src/Normalizers/NormalizerSequence.php b/src/Normalizers/NormalizerSequence.php
@@ -17,7 +17,7 @@ public function normalize(string $text): string
     {
         return array_reduce(
             $this->normalizers,
-            fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
+            static fn (string $text, NormalizerInterface $normalizer) => $normalizer->normalize($text),
             $text
         );
     }
diff --git a/src/PreTokenizers/ByteLevelPreTokenizer.php b/src/PreTokenizers/ByteLevelPreTokenizer.php
@@ -303,9 +303,9 @@ public function preTokenize(array|string $text, array $options = []): array
         }
 
         // Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-        return array_map(function ($token) {
+        return array_map(static function ($token) {
             $utf8Bytes = mb_convert_encoding($token, 'UTF-8');
-            $bytes = array_map(fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));
+            $bytes = array_map(static fn ($byte) => self::BYTES_TO_UNICODE[$byte], unpack('C*', $utf8Bytes));
 
             return implode('', $bytes);
         }, $tokens);
diff --git a/src/PreTokenizers/PreTokenizerSequence.php b/src/PreTokenizers/PreTokenizerSequence.php
@@ -17,7 +17,7 @@ public function preTokenize(array|string $text, array $options = []): array
     {
         return array_reduce(
             $this->preTokenizers,
-            fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
+            static fn ($text, PreTokenizerInterface $preTokenizer) => $preTokenizer->preTokenize($text, $options),
             \is_array($text) ? $text : [$text]
         );
     }
diff --git a/src/Tokenizer.php b/src/Tokenizer.php
@@ -25,12 +25,12 @@
 use Codewithkyrian\Tokenizers\Utils\DecoderUtils;
 use Codewithkyrian\Tokenizers\Utils\NormalizerUtils;
 
-class Tokenizer
+readonly class Tokenizer
 {
     /**
      * The model's maximum sequence length (convenience accessor for config).
      */
-    public readonly ?int $modelMaxLength;
+    public ?int $modelMaxLength;
     protected DictionarySplitter $addedTokensSplitter;
 
     /**
@@ -44,11 +44,11 @@ class Tokenizer
      * @param array<string, mixed>      $config        Additional configuration options
      */
     public function __construct(
-        protected ModelInterface $model,
-        protected NormalizerInterface $normalizer,
-        protected PreTokenizerInterface $preTokenizer,
-        protected PostProcessorInterface $postProcessor,
-        protected DecoderInterface $decoder,
+        public ModelInterface $model,
+        public NormalizerInterface $normalizer,
+        public PreTokenizerInterface $preTokenizer,
+        public PostProcessorInterface $postProcessor,
+        public DecoderInterface $decoder,
         protected array $specialTokens = [],
         protected array $addedTokens = [],
         protected array $config = []
diff --git a/tests/Pest.php b/tests/Pest.php
@@ -9,7 +9,7 @@
  */
 function modelTokenizationDataset(string $datasetClass, bool $withTextPair = false): Closure
 {
-    return function () use ($datasetClass, $withTextPair) {
+    return static function () use ($datasetClass, $withTextPair) {
         if (!class_exists($datasetClass) || !method_exists($datasetClass, 'data')) {
             return;
         }

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@`
`31`	`31`	`}`
`32`	`32`	`},`
`33`	`33`	`"require": {`
`34`		`- "php": "^8.1",`
	`34`	`+ "php": "^8.2",`
`35`	`35`	`"psr/http-client": "^1.0",`
`36`	`36`	`"psr/http-factory": "^1.0",`
`37`	`37`	`"php-http/discovery": "^1.19"`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`	`* - Whether they should only match single words`
`11`	`11`	`* - Whether to include any whitespace on its left or right.`
`12`	`12`	`*/`
`13`		`-class AddedToken`
	`13`	`+readonly class AddedToken`
`14`	`14`	`{`
`15`	`15`	`public function __construct(`
`16`	`16`	`/**`
Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,6 @@ public function tokenIds(): array`
`117`	`117`	`{`
`118`	`118`	`$nodes = $this->viterbi();`
`119`	`119`
`120`		`- return array_map(fn ($x) => $x->tokenId, $nodes);`
	`120`	`+ return array_map(static fn ($x) => $x->tokenId, $nodes);`
`121`	`121`	`}`
`122`	`122`	`}`
Original file line number	Diff line number	Diff line change
`@@ -302,7 +302,7 @@ protected function convertTokensToString(array $tokens): string`
`302`	`302`	`{`
`303`	`303`	`$text = implode('', $tokens);`
`304`	`304`	`$textArray = preg_split('//u', $text, -1, \PREG_SPLIT_NO_EMPTY);`
`305`		`- $byteArray = array_map(fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);`
	`305`	`+ $byteArray = array_map(static fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);`
`306`	`306`	`$binaryString = pack('C*', ...$byteArray);`
`307`	`307`
`308`	`308`	`return mb_convert_encoding($binaryString, 'UTF-8');`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ protected function processTokens(array $tokens): array`
`15`	`15`	`{`
`16`	`16`	`return array_reduce(`
`17`	`17`	`$this->decoders,`
`18`		`- fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),`
	`18`	`+ static fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),`
`19`	`19`	`$tokens`
`20`	`20`	`);`
`21`	`21`	`}`