openscilab · M-Mahdi-Razmjoo · Oct 11, 2025 · Oct 11, 2025 · Oct 11, 2025 · Nov 14, 2025
diff --git a/README.md b/README.md
@@ -72,33 +72,32 @@ ToCount is a lightweight and extensible Python library for estimating token coun
 
 ### Rule-Based
 
-
-| Model Name                 |   MAE   |     MSE     |   R²   |
-|----------------------------|---------|-------------|--------|
-| `RULE_BASED.UNIVERSAL`     | 106.70  | 381,647.81  | 0.8175 |
-| `RULE_BASED.GPT_4`         | 152.34  | 571,795.89  | 0.7266 |
-| `RULE_BASED.GPT_3_5`       | 161.93  | 652,923.59  | 0.6878 |
+| Model Name                 |   R²   |   MAE    |      MSE      |
+|----------------------------|--------|----------|---------------|
+| `RULE_BASED.UNIVERSAL`     | 0.8175 | 106.70   | 381,647.81    |
+| `RULE_BASED.GPT_4`         | 0.7266 | 152.34   | 571,795.89    |
+| `RULE_BASED.GPT_3_5`       | 0.6878 | 161.93   | 652,923.59    |
 
 ### Tiktoken R50K
 
-| Model Name                          |   MAE   |     MSE     |   R²   |
-|-------------------------------------|---------|-------------|--------|
-| `TIKTOKEN_R50K.LINEAR_ALL`          |  71.38  |  183897.01  | 0.8941 |
-| `TIKTOKEN_R50K.LINEAR_ENGLISH`      |  23.35  |  14127.92   | 0.9887 |
+| Model Name                     |   R²   |   MAE    |      MSE      | Median Abs Error | D² Abs Score |
+|--------------------------------|--------|----------|---------------|------------------|--------------|
+| `TIKTOKEN_R50K.LINEAR_ALL`     | 0.7334 | 152.39   | 537,877.36    |      28.55       |    0.4826    |
+| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 0.8703 |  62.76   | 258,271.50    |       8.87       |    0.7287    |
 
 ### Tiktoken CL100K
 
-| Model Name                          |   MAE   |     MSE     |   R²   |
-|-------------------------------------|---------|-------------|--------|
-| `TIKTOKEN_CL100K.LINEAR_ALL`        |  41.85  |  47949.48   | 0.9545 |
-| `TIKTOKEN_CL100K.LINEAR_ENGLISH`    |  21.12  |  17597.20   | 0.9839 |
+| Model Name                       |   R²   |  MAE  |     MSE     | Median Abs Error | D² Abs Score |
+|----------------------------------|--------|-------|-------------|------------------|--------------|
+| `TIKTOKEN_CL100K.LINEAR_ALL`     | 0.9127 | 64.09 |  88,814.15  |      15.73       |    0.6804    |
+| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 0.9711 | 27.43 |  34,249.15  |       6.34       |    0.8527    |
 
 ### Tiktoken O200K
 
-| Model Name                          |   MAE   |     MSE     |   R²   |
-|-------------------------------------|---------|-------------|--------|
-| `TIKTOKEN_O200K.LINEAR_ALL`         |  25.53  |  20195.32   | 0.9777 |
-| `TIKTOKEN_O200K.LINEAR_ENGLISH`     |  20.24  |  15887.99   | 0.9859 |
+| Model Name                       |   R²   |  MAE  |     MSE     | Median Abs Error | D² Abs Score |
+|----------------------------------|--------|-------|-------------|------------------|--------------|
+| `TIKTOKEN_O200K.LINEAR_ALL`      | 0.9563 | 38.23 |  38,872.42  |       9.70       |    0.7818    |
+| `TIKTOKEN_O200K.LINEAR_ENGLISH`  | 0.9730 | 26.00 |  31,519.73  |       5.96       |    0.8581    |
 
 
 ℹ️ The training and testing dataset is taken from Lmsys-chat-1m [1] and Wildchat [2].

diff --git a/tests/test_tiktoken_cl100k.py b/tests/test_tiktoken_cl100k.py
@@ -5,21 +5,21 @@
 def test_linear_english_text_with_simple_prompt():
     message = "You are the text completion model"  # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=2)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=5)
 
 
 def test_linear_english_text_with_contractions():
     # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0
     message = "I’m refining a foolproof method for reality shifting"
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=2)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=5)
 
 
 def test_linear_english_text_with_prefixes_and_suffixes():
     # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10
     message = "reflecting the hardships of the preparation process"
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=4)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=6)
 
 
 def test_linear_english_code_with_keywords():
@@ -30,7 +30,7 @@ def test_linear_english_code_with_keywords():
 
     message2 = "class QueryPlanner:"  # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=0)
+    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=3)
 
     message3 = """
     for op in operations:
@@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords():
 def test_linear_english_code_with_variable_names():
     message = "table_name = ast.table_name"  # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=1)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=3)
 
 
 def test_linear_english_text_empty_and_whitespace():
     message1 = ""
     assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=1)
+    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=4)
 
     message2 = " \t \n "
     assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=1)
+    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=2)
 
 
 def test_linear_english_text_with_long_word():
     message = "This is a verylongwordwithoutspaces and should be counted properly."
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=1)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=4)
 
 
 def test_linear_english_text_with_rare_character():
@@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character():
 def test_linear_all_text_with_simple_prompt():
     message = "You are the text completion model"  # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=6)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=16)
 
 
 def test_linear_all_text_with_contractions():
     # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0
     message = "I’m refining a foolproof method for reality shifting"
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=6)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=16)
 
 
 def test_linear_all_text_with_prefixes_and_suffixes():
     # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10
     message = "reflecting the hardships of the preparation process"
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=8)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=18)
 
 
 def test_linear_all_code_with_keywords():
     # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     message1 = "def __init__(self, schema):"
     assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=3)
+    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=12)
 
     message2 = "class QueryPlanner:"  # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4)
+    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13)
 
     message3 = """
     for op in operations:
         if op.type == "SELECT":
     """  # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=3)
+    assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=12)
 
 
 def test_linear_all_code_with_variable_names():
     message = "table_name = ast.table_name"  # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=5)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=14)
 
 
 def test_linear_all_text_empty_and_whitespace():
     message1 = ""
     assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=5)
+    assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=14)
 
     message2 = " \t \n "
     assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=3)
+    assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=12)
 
 
 def test_linear_all_text_non_english_with_special_chars():
     message = "versión británica"  # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13)
 
 
 def test_linear_all_text_non_english():
@@ -138,11 +138,11 @@ def test_linear_all_text_non_english():
 def test_linear_all_text_with_long_word():
     message = "This is a verylongwordwithoutspaces and should be counted properly."
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=6)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=15)
 
 
 def test_linear_all_text_with_rare_character():
     # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18
     message = "What is the smallest possible value for P[A ∩ B ∩ C]?"
     assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
-    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=1)
+    assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=10)