Skip to content

Commit e7d79fe

Browse files
Merge pull request #2 from CodeWithKyrian/chore/php-8-2-and-readonly-classes
Require PHP 8.2 and make core tokenization types readonly
2 parents 4c6bf1a + ef84875 commit e7d79fe

21 files changed

Lines changed: 33 additions & 33 deletions

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
fail-fast: false
1313
matrix:
14-
php: ['8.1', '8.2', '8.3', '8.4']
14+
php: ['8.2', '8.3', '8.4']
1515

1616
steps:
1717
- name: Checkout

.php-cs-fixer.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
<?php
22

33
$finder = (new PhpCsFixer\Finder())
4-
->in(__DIR__);
4+
->in(__DIR__)->exclude('tests');
55

66
return (new PhpCsFixer\Config())
77
->setParallelConfig(PhpCsFixer\Runner\Parallel\ParallelConfigFactory::detect())

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
- **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads.
1919
- **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files.
2020
- **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more.
21-
- **Modern PHP** — Built for PHP 8.1+ with strict types, readonly properties, and clean interfaces.
21+
- **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces.
2222

2323
## Installation
2424

composer.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@
3131
}
3232
},
3333
"require": {
34-
"php": "^8.1",
34+
"php": "^8.2",
3535
"psr/http-client": "^1.0",
3636
"psr/http-factory": "^1.0",
3737
"php-http/discovery": "^1.19"

examples/document_chunking_pipeline.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -232,7 +232,7 @@ function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $o
232232
$originalWords = array_slice(explode(' ', $chunk), 0, 5);
233233
$decodedWords = explode(' ', $decoded);
234234

235-
$matchCount = count(array_filter($originalWords, fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
235+
$matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
236236

237237
if ($matchCount < 3) {
238238
echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n";

examples/text_classification_preprocessing.php

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -138,8 +138,8 @@
138138
echo " Structure: [CLS] premise [SEP] hypothesis [SEP]\n";
139139

140140
// Type IDs distinguish between premise (0) and hypothesis (1)
141-
$segment0Count = count(array_filter($encoding->typeIds, fn ($t) => 0 === $t));
142-
$segment1Count = count(array_filter($encoding->typeIds, fn ($t) => 1 === $t));
141+
$segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t));
142+
$segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t));
143143

144144
echo " Segment A (premise) tokens: {$segment0Count}\n";
145145
echo " Segment B (hypothesis) tokens: {$segment1Count}\n";

src/DataStructures/AddedToken.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
* - Whether they should only match single words
1111
* - Whether to include any whitespace on its left or right.
1212
*/
13-
class AddedToken
13+
readonly class AddedToken
1414
{
1515
public function __construct(
1616
/**

src/DataStructures/TokenLattice.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,6 @@ public function tokenIds(): array
117117
{
118118
$nodes = $this->viterbi();
119119

120-
return array_map(fn ($x) => $x->tokenId, $nodes);
120+
return array_map(static fn ($x) => $x->tokenId, $nodes);
121121
}
122122
}

src/Decoders/ByteLevelDecoder.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -302,7 +302,7 @@ protected function convertTokensToString(array $tokens): string
302302
{
303303
$text = implode('', $tokens);
304304
$textArray = preg_split('//u', $text, -1, \PREG_SPLIT_NO_EMPTY);
305-
$byteArray = array_map(fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
305+
$byteArray = array_map(static fn ($x) => self::UNICODE_TO_BYTES[$x] ?? \ord($x), $textArray);
306306
$binaryString = pack('C*', ...$byteArray);
307307

308308
return mb_convert_encoding($binaryString, 'UTF-8');

src/Decoders/DecoderSequence.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ protected function processTokens(array $tokens): array
1515
{
1616
return array_reduce(
1717
$this->decoders,
18-
fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
18+
static fn (array $tokens, BaseDecoder $decoder) => $decoder->processTokens($tokens),
1919
$tokens
2020
);
2121
}

0 commit comments

Comments
 (0)