@@ -75,6 +75,24 @@ def test_newlines_tabs(self):
7575 result = count_tokens ("line1\n line2\t line3\r \n line4" )
7676 assert result .count > 0
7777
78+ def test_large_text_exact (self ):
79+ large_text = "word " * 500_000
80+ result = count_tokens (large_text )
81+ assert result .count > 0
82+ assert result .is_exact is True
83+
84+ def test_exact_count_matches_direct_encode (self ):
85+ from treemapper .tokens import _get_encoder
86+
87+ encoder = _get_encoder ("o200k_base" )
88+ if encoder is None :
89+ return
90+
91+ text = "word " * 5_000
92+ exact_count = len (encoder .encode (text ))
93+ result = count_tokens (text )
94+ assert result .count == exact_count
95+
7896
7997class TestPrintTokenSummary :
8098 def test_prints_to_stderr (self ):
@@ -139,67 +157,3 @@ def test_different_encodings_cached_separately(self):
139157 r1 = count_tokens ("test" , encoding = "o200k_base" )
140158 r2 = count_tokens ("test" , encoding = "cl100k_base" )
141159 assert r1 .encoding != r2 .encoding or r1 .encoding == "approximation"
142-
143-
class TestChunkedCounting:
    """Behaviour of the chunked token-counting path for large inputs."""

    def test_chunked_counting_for_large_text(self):
        from treemapper.tokens import CHUNK_THRESHOLD

        text = "word " * (CHUNK_THRESHOLD // 5 + 1000)
        outcome = count_tokens(text)
        assert outcome.count > 0
        # BPE is context-sensitive, so chunked counting cannot be exact:
        # either is_exact=False with a real encoding, or the approximation
        # fallback was used.
        assert outcome.is_exact is False

    def test_chunked_count_close_to_exact(self, monkeypatch):
        import treemapper.tokens as tokens_module
        from treemapper.tokens import _get_encoder

        enc = _get_encoder("o200k_base")
        if enc is None:
            # Encoder unavailable — skip the comparison.
            return

        sample = "word " * 5_000
        true_count = len(enc.encode(sample))

        # Force the chunked path by shrinking the threshold.
        monkeypatch.setattr(tokens_module, "CHUNK_THRESHOLD", 1_000)
        approx = count_tokens(sample)

        # The chunked estimate must land within 5% of the exact figure.
        assert abs(approx.count - true_count) / true_count < 0.05

    def test_small_text_not_chunked(self):
        outcome = count_tokens("hello world")
        assert outcome.count > 0
176-
class TestSampledCounting:
    """Behaviour of the sampling-based estimator for very large inputs."""

    def test_sampling_threshold_is_reasonable(self):
        from treemapper.tokens import SAMPLE_CHAR_THRESHOLD

        assert SAMPLE_CHAR_THRESHOLD >= 1_000_000

    def test_very_large_text_uses_sampling(self, monkeypatch):
        import treemapper.tokens as tokens_module
        from treemapper.tokens import _count_tokens_sampled, _get_encoder

        enc = _get_encoder("o200k_base")
        if enc is None:
            # Encoder unavailable — skip.
            return

        # Shrink the threshold so a modest input triggers sampling.
        monkeypatch.setattr(tokens_module, "SAMPLE_CHAR_THRESHOLD", 10_000)
        text = "x" * 15_000
        outcome = _count_tokens_sampled(text, len(text), enc, "o200k_base")
        assert outcome.is_exact is False
        assert outcome.count > 0

    def test_sampled_result_is_approximate(self, monkeypatch):
        import treemapper.tokens as tokens_module

        monkeypatch.setattr(tokens_module, "SAMPLE_CHAR_THRESHOLD", 10_000)
        monkeypatch.setattr(tokens_module, "CHUNK_THRESHOLD", 1_000)
        text = "word " * 5_000
        outcome = count_tokens(text)
        # The approximation fallback reports its own encoding name; only the
        # real-encoding path is required to flag the result as inexact.
        if outcome.encoding != "approximation":
            assert outcome.is_exact is False
0 commit comments