Skip to content

Commit 41d8af8

Browse files
committed
Mark Python tests that need network access
1 parent f7db48f commit 41d8af8

File tree

13 files changed

+44
-0
lines changed

13 files changed

+44
-0
lines changed

bindings/python/pytest.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    network: mark a test that requires network access.

bindings/python/tests/bindings/test_encoding.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from ..utils import bert_files, data_dir
66

77

8+
@pytest.mark.network
89
class TestEncoding:
910
@pytest.fixture(scope="class")
1011
def encodings(self, bert_files):

bindings/python/tests/bindings/test_models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88

99
class TestBPE:
10+
@pytest.mark.network
1011
def test_instantiate(self, roberta_files):
1112
assert isinstance(BPE(), Model)
1213
assert isinstance(BPE(), BPE)
@@ -75,6 +76,7 @@ def test_dropout_zero(self):
7576

7677

7778
class TestWordPiece:
79+
@pytest.mark.network
7880
def test_instantiate(self, bert_files):
7981
assert isinstance(WordPiece(), Model)
8082
assert isinstance(WordPiece(), WordPiece)
@@ -112,6 +114,7 @@ def test_can_modify(self):
112114

113115

114116
class TestWordLevel:
117+
@pytest.mark.network
115118
def test_instantiate(self, roberta_files):
116119
assert isinstance(WordLevel(), Model)
117120
assert isinstance(WordLevel(), WordLevel)

bindings/python/tests/bindings/test_processors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def test_instantiate(self):
7171
assert isinstance(ByteLevel(), ByteLevel)
7272
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
7373

74+
@pytest.mark.network
7475
def test_processing(self, roberta_files):
7576
# Deprecated in 0.9
7677
with pytest.deprecated_call():

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def test_encode(self):
154154
output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
155155
assert len(output) == 2
156156

157+
@pytest.mark.network
157158
def test_encode_formats(self, bert_files):
158159
with pytest.deprecated_call():
159160
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
@@ -286,6 +287,7 @@ def test_pair(input, is_pretokenized=False):
286287
with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
287288
tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
288289

290+
@pytest.mark.network
289291
def test_encode_add_special_tokens(self, roberta_files):
290292
with pytest.deprecated_call():
291293
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
@@ -376,6 +378,7 @@ def test_decode(self):
376378
stream = DecodeStream(ids=[0, 1, 2])
377379
assert stream.step(tokenizer, 3) == " john"
378380

381+
@pytest.mark.network
379382
def test_decode_stream_fallback(self):
380383
tokenizer = Tokenizer.from_pretrained("gpt2")
381384
# tokenizer.decode([255]) fails because its a fallback
@@ -408,6 +411,7 @@ def test_decode_stream_fallback(self):
408411
out = stream.step(tokenizer, [109])
409412
assert out == "อั"
410413

414+
@pytest.mark.network
411415
def test_decode_skip_special_tokens(self):
412416
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/Llama-3.1-8B-Instruct")
413417

@@ -557,11 +561,13 @@ def test_multiprocessing_with_parallelism(self):
557561
multiprocessing_with_parallelism(tokenizer, False)
558562
multiprocessing_with_parallelism(tokenizer, True)
559563

564+
@pytest.mark.network
560565
def test_from_pretrained(self):
561566
tokenizer = Tokenizer.from_pretrained("bert-base-cased")
562567
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
563568
assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
564569

570+
@pytest.mark.network
565571
def test_from_pretrained_revision(self):
566572
tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test")
567573
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
@@ -597,6 +603,7 @@ def test_unigram_byte_fallback(self):
597603
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
598604
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
599605

606+
@pytest.mark.network
600607
def test_encode_special_tokens(self):
601608
tokenizer = Tokenizer.from_pretrained("t5-base")
602609
tokenizer.add_tokens(["<eot>"])
@@ -628,6 +635,7 @@ def test_encode_special_tokens(self):
628635
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
629636
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
630637

638+
@pytest.mark.network
631639
def test_splitting(self):
632640
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-new-metaspace")
633641
tokenizer.pre_tokenizer.split = False
@@ -724,6 +732,7 @@ def test_repr_complete(self):
724732
)
725733

726734

735+
@pytest.mark.network
727736
class TestAsyncTokenizer:
728737
"""Tests for async methods of the Tokenizer class."""
729738

bindings/python/tests/bindings/test_trainers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ def test_can_pickle(self):
150150

151151

152152
class TestUnigram:
153+
@pytest.mark.network
153154
def test_train(self, train_files):
154155
tokenizer = SentencePieceUnigramTokenizer()
155156
tokenizer.train(train_files["small"], show_progress=False)
@@ -158,6 +159,7 @@ def test_train(self, train_files):
158159
tokenizer.save(filename)
159160
os.remove(filename)
160161

162+
@pytest.mark.network
161163
def test_train_parallelism_with_custom_pretokenizer(self, train_files):
162164
class GoodCustomPretok:
163165
def split(self, n, normalized):
@@ -184,6 +186,7 @@ def test_can_pickle(self):
184186

185187
def test_train_with_special_tokens(self):
186188
filename = "tests/data/dummy-unigram-special_tokens-train.txt"
189+
os.makedirs("tests/data", exist_ok=True)
187190
with open(filename, "w") as f:
188191
f.write(
189192
"""
@@ -287,6 +290,7 @@ def test_can_modify(self):
287290
trainer.initial_alphabet = ["d", "z"]
288291
assert sorted(trainer.initial_alphabet) == ["d", "z"]
289292

293+
@pytest.mark.network
290294
def test_continuing_prefix_trainer_mismatch(self, train_files):
291295
UNK = "[UNK]"
292296
special_tokens = [UNK]

bindings/python/tests/documentation/test_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pytest
12
from tokenizers import Tokenizer
23

34
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
@@ -12,6 +13,7 @@ def print(*args, **kwargs):
1213

1314

1415
class TestPipeline:
16+
@pytest.mark.network
1517
def test_pipeline(self, doc_wiki_tokenizer):
1618
try:
1719
# START reload_tokenizer
@@ -143,6 +145,7 @@ def slow_train():
143145
bert_tokenizer.save("data/bert-wiki.json")
144146
# END bert_train_tokenizer
145147

148+
@pytest.mark.network
146149
def test_bert_example(self, doc_pipeline_bert_tokenizer):
147150
try:
148151
bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")

bindings/python/tests/documentation/test_quicktour.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pytest
12
from tokenizers import Tokenizer
23
from ..utils import data_dir, doc_wiki_tokenizer
34

@@ -45,6 +46,7 @@ def get_tokenizer_trainer():
4546
# END init_pretok
4647
return tokenizer, trainer
4748

49+
@pytest.mark.network
4850
def test_quicktour(self, doc_wiki_tokenizer):
4951
def print(*args, **kwargs):
5052
pass

bindings/python/tests/documentation/test_tutorial_train_from_iterators.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def test_train_basic(self):
6262
tokenizer.train_from_iterator(data, trainer=trainer)
6363
# END train_basic
6464

65+
@pytest.mark.network
6566
def test_datasets(self):
6667
tokenizer, trainer = self.get_tokenizer_trainer()
6768

@@ -82,6 +83,7 @@ def batch_iterator(batch_size=1000):
8283
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset)) # type: ignore[arg-type]
8384
# END train_datasets
8485

86+
@pytest.mark.network
8587
def test_gzip(self, setup_gzip_files):
8688
tokenizer, trainer = self.get_tokenizer_trainer()
8789

bindings/python/tests/implementations/test_bert_wordpiece.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import pytest
12
from tokenizers import BertWordPieceTokenizer
23

34
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
45

56

67
class TestBertWordPieceTokenizer:
8+
@pytest.mark.network
79
def test_basic_encode(self, bert_files):
810
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
911

@@ -39,6 +41,7 @@ def test_basic_encode(self, bert_files):
3941
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
4042
assert output.type_ids == [0, 0, 0, 0, 1]
4143

44+
@pytest.mark.network
4245
def test_multiprocessing_with_parallelism(self, bert_files):
4346
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
4447
multiprocessing_with_parallelism(tokenizer, False)

0 commit comments

Comments (0)