@@ -599,25 +599,62 @@ def test_staccato_tokenizer_abbreviations():
599599 text_1 = "The firm is U.S.A. Inc. and i.e. in the U.S. we use e.g. to give examples."
600600 sentence_1 = Sentence (text_1 , use_tokenizer = tokenizer )
601601 expected_tokens_1 = [
602- "The" , "firm" , "is" , "U.S.A." , "Inc" , "." , "and" , "i.e." , "in" , "the" ,
603- "U.S." , "we" , "use" , "e.g." , "to" , "give" , "examples" , "." ,
602+ "The" ,
603+ "firm" ,
604+ "is" ,
605+ "U.S.A." ,
606+ "Inc" ,
607+ "." ,
608+ "and" ,
609+ "i.e." ,
610+ "in" ,
611+ "the" ,
612+ "U.S." ,
613+ "we" ,
614+ "use" ,
615+ "e.g." ,
616+ "to" ,
617+ "give" ,
618+ "examples" ,
619+ "." ,
604620 ]
605621 assert [token .text for token in sentence_1 .tokens ] == expected_tokens_1
606622
607623 # Case 2: Single letter/short word with a dot at sentence end should be split
608624 text_2 = "He wrote on X. Then Dr. Smith arrived."
609625 sentence_2 = Sentence (text_2 , use_tokenizer = tokenizer )
610626 expected_tokens_2 = [
611- "He" , "wrote" , "on" , "X" , "." , "Then" , "Dr" , "." , "Smith" , "arrived" , "." ,
627+ "He" ,
628+ "wrote" ,
629+ "on" ,
630+ "X" ,
631+ "." ,
632+ "Then" ,
633+ "Dr" ,
634+ "." ,
635+ "Smith" ,
636+ "arrived" ,
637+ "." ,
612638 ]
613639 assert [token .text for token in sentence_2 .tokens ] == expected_tokens_2
614640
615641 # Case 3: A mix of cases
616642 text_3 = "The item is from the U.K. (i.e. not the U.S.A.)."
617643 sentence_3 = Sentence (text_3 , use_tokenizer = tokenizer )
618644 expected_tokens_3 = [
619- "The" , "item" , "is" , "from" , "the" , "U.K." , "(" , "i.e." ,
620- "not" , "the" , "U.S.A." , ")" , "." ,
645+ "The" ,
646+ "item" ,
647+ "is" ,
648+ "from" ,
649+ "the" ,
650+ "U.K." ,
651+ "(" ,
652+ "i.e." ,
653+ "not" ,
654+ "the" ,
655+ "U.S.A." ,
656+ ")" ,
657+ "." ,
621658 ]
622659 assert [token .text for token in sentence_3 .tokens ] == expected_tokens_3
623660
0 commit comments