@@ -154,6 +154,7 @@ def test_encode(self):
154154 output = tokenizer .encode_batch (["my name is john" , ("my name is john" , "pair" )])
155155 assert len (output ) == 2
156156
157+ @pytest .mark .network
157158 def test_encode_formats (self , bert_files ):
158159 with pytest .deprecated_call ():
159160 tokenizer = BertWordPieceTokenizer (bert_files ["vocab" ])
@@ -286,6 +287,7 @@ def test_pair(input, is_pretokenized=False):
286287 with pytest .raises (TypeError , match = "InputSequence must be Union[List[str]" ):
287288 tokenizer .encode (["My" , "name" , "is" , "John" ], "pair" , is_pretokenized = True )
288289
290+ @pytest .mark .network
289291 def test_encode_add_special_tokens (self , roberta_files ):
290292 with pytest .deprecated_call ():
291293 tokenizer = Tokenizer (BPE (roberta_files ["vocab" ], roberta_files ["merges" ]))
@@ -376,6 +378,7 @@ def test_decode(self):
376378 stream = DecodeStream (ids = [0 , 1 , 2 ])
377379 assert stream .step (tokenizer , 3 ) == " john"
378380
381+ @pytest .mark .network
379382 def test_decode_stream_fallback (self ):
380383 tokenizer = Tokenizer .from_pretrained ("gpt2" )
381384 # tokenizer.decode([255]) fails because its a fallback
@@ -408,6 +411,7 @@ def test_decode_stream_fallback(self):
408411 out = stream .step (tokenizer , [109 ])
409412 assert out == "อั"
410413
414+ @pytest .mark .network
411415 def test_decode_skip_special_tokens (self ):
412416 tokenizer = Tokenizer .from_pretrained ("hf-internal-testing/Llama-3.1-8B-Instruct" )
413417
@@ -557,11 +561,13 @@ def test_multiprocessing_with_parallelism(self):
557561 multiprocessing_with_parallelism (tokenizer , False )
558562 multiprocessing_with_parallelism (tokenizer , True )
559563
564+ @pytest .mark .network
560565 def test_from_pretrained (self ):
561566 tokenizer = Tokenizer .from_pretrained ("bert-base-cased" )
562567 output = tokenizer .encode ("Hey there dear friend!" , add_special_tokens = False )
563568 assert output .tokens == ["Hey" , "there" , "dear" , "friend" , "!" ]
564569
570+ @pytest .mark .network
565571 def test_from_pretrained_revision (self ):
566572 tokenizer = Tokenizer .from_pretrained ("anthony/tokenizers-test" )
567573 output = tokenizer .encode ("Hey there dear friend!" , add_special_tokens = False )
@@ -597,6 +603,7 @@ def test_unigram_byte_fallback(self):
597603 assert output .ids == [1 , 10 , 2 , 3 , 4 , 5 , 10 , 6 , 7 , 8 , 9 ]
598604 assert output .tokens == ["A" , " " , "sen" , "te" , "n" , "ce" , " " , "<0xF0>" , "<0x9F>" , "<0xA4>" , "<0x97>" ]
599605
606+ @pytest .mark .network
600607 def test_encode_special_tokens (self ):
601608 tokenizer = Tokenizer .from_pretrained ("t5-base" )
602609 tokenizer .add_tokens (["<eot>" ])
@@ -628,6 +635,7 @@ def test_encode_special_tokens(self):
628635 output = tokenizer .encode ("Hey there<end_of_text> dear<eot>friend!" , add_special_tokens = False )
629636 assert output .tokens == ["▁Hey" , "▁there" , "<" , "end" , "_" , "of_text>" , "▁dear" , "<eot>" , "▁friend" , "!" ]
630637
638+ @pytest .mark .network
631639 def test_splitting (self ):
632640 tokenizer = Tokenizer .from_pretrained ("hf-internal-testing/llama-new-metaspace" )
633641 tokenizer .pre_tokenizer .split = False
@@ -724,6 +732,7 @@ def test_repr_complete(self):
724732 )
725733
726734
735+ @pytest .mark .network
727736class TestAsyncTokenizer :
728737 """Tests for async methods of the Tokenizer class."""
729738