From 4c6ca1adf8f5ed076505ea0acd862a62bb3025ba Mon Sep 17 00:00:00 2001 From: M-Mahdi-Razmjoo Date: Sat, 11 Oct 2025 10:08:59 +0330 Subject: [PATCH 1/5] add cl100k linear model tests --- tests/test_tiktoken_cl100k.py | 148 ++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 tests/test_tiktoken_cl100k.py diff --git a/tests/test_tiktoken_cl100k.py b/tests/test_tiktoken_cl100k.py new file mode 100644 index 0000000..b007cbf --- /dev/null +++ b/tests/test_tiktoken_cl100k.py @@ -0,0 +1,148 @@ +import pytest +from tocount import estimate_text_tokens, TextEstimator + + +def test_linear_english_text_with_simple_prompt(): + message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + + +def test_linear_english_text_with_contractions(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 + message = "I’m refining a foolproof method for reality shifting" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=2) + + +def test_linear_english_text_with_prefixes_and_suffixes(): + # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 + message = "reflecting the hardships of the preparation process" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + + +def test_linear_english_code_with_keywords(): + # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + message1 = "def __init__(self, schema):" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=1) + + message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=0) + + message3 = """ + for op in operations: + if op.type == "SELECT": + """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(16, abs=2) + + +def test_linear_english_code_with_variable_names(): + message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=1) + + +def test_linear_english_text_empty_and_whitespace(): + message1 = "" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=1) + + message2 = " \t \n " + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=1) + + +def test_linear_english_text_with_long_word(): + message = "This is a verylongwordwithoutspaces and should be counted properly." + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=1) + + +def test_linear_english_text_with_rare_character(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 + message = "What is the smallest possible value for P[A ∩ B ∩ C]?" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(16, abs=4) + + +def test_linear_all_text_with_simple_prompt(): + message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=6) + + +def test_linear_all_text_with_contractions(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 + message = "I’m refining a foolproof method for reality shifting" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=6) + + +def test_linear_all_text_with_prefixes_and_suffixes(): + # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 + message = "reflecting the hardships of the preparation process" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=8) + + +def test_linear_all_code_with_keywords(): + # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + message1 = "def __init__(self, schema):" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=3) + + message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + + message3 = """ + for op in operations: + if op.type == "SELECT": + """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=3) + + +def test_linear_all_code_with_variable_names(): + message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=5) + + +def test_linear_all_text_empty_and_whitespace(): + message1 = "" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=5) + + message2 = " \t \n " + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=3) + + +def test_linear_all_text_non_english_with_special_chars(): + message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + + +def test_linear_all_text_non_english(): + message = "如何在sd上无错误进行模型训练" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=20 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(15, abs=7) + + +def test_linear_all_text_with_long_word(): + message = "This is a verylongwordwithoutspaces and should be counted properly." + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=6) + + +def test_linear_all_text_with_rare_character(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 + message = "What is the smallest possible value for P[A ∩ B ∩ C]?" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=1) \ No newline at end of file From 3598a227cadbdd3685758bd010e67f020f5ef641 Mon Sep 17 00:00:00 2001 From: M-Mahdi-Razmjoo Date: Sat, 11 Oct 2025 10:15:45 +0330 Subject: [PATCH 2/5] add o200k linear model tests --- tests/test_tiktoken_o200k.py | 148 +++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 tests/test_tiktoken_o200k.py diff --git a/tests/test_tiktoken_o200k.py b/tests/test_tiktoken_o200k.py new file mode 100644 index 0000000..4035fb9 --- /dev/null +++ b/tests/test_tiktoken_o200k.py @@ -0,0 +1,148 @@ +import pytest +from tocount import estimate_text_tokens, TextEstimator + + +def test_linear_english_text_with_simple_prompt(): + message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) + + +def test_linear_english_text_with_contractions(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 + message = "I’m refining a foolproof method for reality shifting" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(10, abs=3) + + +def test_linear_english_text_with_prefixes_and_suffixes(): + # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 + message = "reflecting the hardships of the preparation process" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + + +def test_linear_english_code_with_keywords(): + # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + message1 = "def __init__(self, schema):" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=0) + + message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(4, abs=2) + + message3 = """ + for op in operations: + if op.type == "SELECT": + """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(16, abs=1) + + +def test_linear_english_code_with_variable_names(): + message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + + +def test_linear_english_text_empty_and_whitespace(): + message1 = "" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(0, abs=2) + + message2 = " \t \n " + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(3, abs=0) + + +def test_linear_english_text_with_long_word(): + message = "This is a verylongwordwithoutspaces and should be counted properly." + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(14, abs=2) + + +def test_linear_english_text_with_rare_character(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 + message = "What is the smallest possible value for P[A ∩ B ∩ C]?" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(16, abs=3) + + +def test_linear_all_text_with_simple_prompt(): + message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=3) + + +def test_linear_all_text_with_contractions(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 + message = "I’m refining a foolproof method for reality shifting" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(10, abs=3) + + +def test_linear_all_text_with_prefixes_and_suffixes(): + # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 + message = "reflecting the hardships of the preparation process" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=5) + + +def test_linear_all_code_with_keywords(): + # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + message1 = "def __init__(self, schema):" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=0) + + message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=2) + + message3 = """ + for op in operations: + if op.type == "SELECT": + """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=1) + + +def test_linear_all_code_with_variable_names(): + message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=2) + + +def test_linear_all_text_empty_and_whitespace(): + message1 = "" + assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(0, abs=2) + + message2 = " \t \n " + assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(3, abs=0) + + +def test_linear_all_text_non_english_with_special_chars(): + message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=1) + + +def test_linear_all_text_non_english(): + message = "如何在sd上无错误进行模型训练" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=20 + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(12, abs=7) + + +def test_linear_all_text_with_long_word(): + message = "This is a verylongwordwithoutspaces and should be counted properly." + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(14, abs=2) + + +def test_linear_all_text_with_rare_character(): + # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 + message = "What is the smallest possible value for P[A ∩ B ∩ C]?" + assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=3) \ No newline at end of file From 1f8e3614d16c371316fe28f3272108ca4c9ad86a Mon Sep 17 00:00:00 2001 From: M-Mahdi-Razmjoo Date: Sat, 11 Oct 2025 10:17:52 +0330 Subject: [PATCH 3/5] modify r50k tests --- tests/test_tiktoken_r50.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tiktoken_r50.py b/tests/test_tiktoken_r50.py index d518a66..fff6f30 100644 --- a/tests/test_tiktoken_r50.py +++ b/tests/test_tiktoken_r50.py @@ -37,7 +37,7 @@ def test_linear_english_code_with_keywords(): if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(21, abs=6) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(30, abs=15) def test_linear_english_code_with_variable_names(): @@ -104,7 +104,7 @@ def test_linear_all_code_with_keywords(): if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(21, abs=1) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(30, abs=10) def test_linear_all_code_with_variable_names(): From f1e06fd4eebfe0fe54aa97b034aa47d716f7f9e7 Mon Sep 17 00:00:00 2001 From: M-Mahdi-Razmjoo Date: Sat, 15 Nov 2025 00:13:42 +0330 Subject: [PATCH 4/5] modify params and readme --- README.md | 35 +++++++++++++++++------------------ tocount/params.py | 36 ++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 1522b7d..58bee24 100644 --- a/README.md +++ b/README.md @@ -65,33 +65,32 @@ ToCount is a lightweight and extensible Python library for estimating token coun ### Rule-Based - -| Model Name | MAE | MSE | R² | -|----------------------------|---------|-------------|--------| -| `RULE_BASED.UNIVERSAL` | 106.70 | 381,647.81 | 0.8175 | -| `RULE_BASED.GPT_4` | 152.34 | 571,795.89 | 0.7266 | -| `RULE_BASED.GPT_3_5` | 161.93 | 652,923.59 | 0.6878 | +| Model Name | R² | MAE | MSE | +|----------------------------|--------|----------|---------------| +| `RULE_BASED.UNIVERSAL` | 0.8175 | 106.70 | 381,647.81 | +| `RULE_BASED.GPT_4` | 0.7266 | 152.34 | 571,795.89 | +| `RULE_BASED.GPT_3_5` | 0.6878 | 161.93 | 652,923.59 | ### Tiktoken R50K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_R50K.LINEAR_ALL` | 71.38 | 183897.01 | 0.8941 | -| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 23.35 | 14127.92 | 0.9887 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|--------------------------------|--------|----------|---------------|------------------|--------------| +| `TIKTOKEN_R50K.LINEAR_ALL` | 0.7334 | 152.39 | 537,877.36 | 28.55 | 0.4826 | +| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 0.8703 | 62.76 | 258,271.50 | 8.87 | 0.7287 | ### Tiktoken CL100K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_CL100K.LINEAR_ALL` | 41.85 | 47949.48 | 0.9545 | -| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 21.12 | 17597.20 | 0.9839 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|----------------------------------|--------|-------|-------------|------------------|--------------| +| `TIKTOKEN_CL100K.LINEAR_ALL` | 0.9127 | 64.09 | 88,814.15 | 15.73 | 0.6804 | +| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 0.9711 | 27.43 | 34,249.15 | 6.34 | 0.8527 | ### Tiktoken O200K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_O200K.LINEAR_ALL` | 25.53 | 20195.32 | 0.9777 | -| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 20.24 | 15887.99 | 0.9859 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|----------------------------------|--------|-------|-------------|------------------|--------------| +| `TIKTOKEN_O200K.LINEAR_ALL` | 0.9563 | 38.23 | 38,872.42 | 9.70 | 0.7818 | +| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 0.9730 | 26.00 | 31,519.73 | 5.96 | 0.8581 | ℹ️ The training and testing dataset is taken from Lmsys-chat-1m [1] and Wildchat [2]. diff --git a/tocount/params.py b/tocount/params.py index 5778714..1e8bbe1 100644 --- a/tocount/params.py +++ b/tocount/params.py @@ -14,39 +14,39 @@ TIKTOKEN_R50K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.22027472695240083, "b": 1.30984549875905421}, - "input_scaler": {"mean": 847.18595335180884, "scale": 4824.54596296361160}, - "output_scaler": {"mean": 191.91873679585714, "scale": 1122.03854916642285} + "coefficient": {"a": 0.24220021827364216, "b": -1.52512159607669773}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 254.89806628634627828, "scale": 1446.92651162795687014} }, "all": { - "coefficient": {"a": 0.24897308965467127, "b": 4.54308265105588305}, - "input_scaler": {"mean": 863.91052735502114, "scale": 4579.14607319174774}, - "output_scaler": {"mean": 250.55580827419274, "scale": 1317.83991440127875} + "coefficient": {"a": 0.26949633800191791, "b": 17.71983908874145186}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 320.20259580719863379, "scale": 1446.23196906494854375} } } TIKTOKEN_CL100K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.20632774595922751, "b": 1.31582377652722826}, - "input_scaler": {"mean": 928.01351455812346, "scale": 4839.45514713105058}, - "output_scaler": {"mean": 198.34363306972855, "scale": 1087.61891525056103} + "coefficient": {"a": 0.21207829974544795, "b": 3.61015453257535057}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 208.97662180989638614, "scale": 1160.24193688094055688} }, "all": { - "coefficient": {"a": 0.22359382657517404, "b": 4.81058433875418601}, - "input_scaler": {"mean": 874.16460544630535, "scale": 4486.74238683014846}, - "output_scaler": {"mean": 213.81428929203110, "scale": 1078.26297169722625} + "coefficient": {"a": 0.22389270545161979, "b": 14.24559780994757219}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 221.90881687060880267, "scale": 1055.65522548552621629} } } TIKTOKEN_O200K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.20354485735834993, "b": 2.08764234347103361}, - "input_scaler": {"mean": 923.75157809972700, "scale": 4843.14030162006293}, - "output_scaler": {"mean": 194.41034579748791, "scale": 1073.00614112992844} + "coefficient": {"a": 0.20934150948723654, "b": 3.23697987353031991}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 205.52642979710270765, "scale": 1144.67974411186628458} }, "all": { - "coefficient": {"a": 0.21511955690162138, "b": 1.71656955330649552}, - "input_scaler": {"mean": 859.61614585211419, "scale": 4397.61706792694440}, - "output_scaler": {"mean": 191.96748588283666, "scale": 1006.41246761102514} + "coefficient": {"a": 0.21634871429041430, "b": 8.52848758076195246}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 194.13328834356752850, "scale": 993.46453791503881803} } } From a939e8c176fad35a8f30f20a3f1da908723b910f Mon Sep 17 00:00:00 2001 From: M-Mahdi-Razmjoo Date: Sat, 15 Nov 2025 00:50:13 +0330 Subject: [PATCH 5/5] update tests --- tests/test_tiktoken_cl100k.py | 40 ++++++++++++++++----------------- tests/test_tiktoken_o200k.py | 42 +++++++++++++++++------------------ tests/test_tiktoken_r50.py | 34 ++++++++++++++-------------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/tests/test_tiktoken_cl100k.py b/tests/test_tiktoken_cl100k.py index b007cbf..c56b817 100644 --- a/tests/test_tiktoken_cl100k.py +++ b/tests/test_tiktoken_cl100k.py @@ -5,21 +5,21 @@ def test_linear_english_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=5) def test_linear_english_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=5) def test_linear_english_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=6) def test_linear_english_code_with_keywords(): @@ -30,7 +30,7 @@ def test_linear_english_code_with_keywords(): message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=3) message3 = """ for op in operations: @@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords(): def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) def test_linear_english_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=1) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=4) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=2) def test_linear_english_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=4) def test_linear_english_text_with_rare_character(): @@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character(): def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=16) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=16) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=8) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=18) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=3) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=12) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=3) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=12) def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=5) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=14) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=5) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=14) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=3) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=12) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=15) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=1) \ No newline at end of file + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=10) \ No newline at end of file diff --git a/tests/test_tiktoken_o200k.py b/tests/test_tiktoken_o200k.py index 4035fb9..4608c1c 100644 --- a/tests/test_tiktoken_o200k.py +++ b/tests/test_tiktoken_o200k.py @@ -5,32 +5,32 @@ def test_linear_english_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=4) def test_linear_english_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(10, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(10, abs=4) def test_linear_english_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=6) def test_linear_english_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=0) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=1) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(4, abs=2) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(4, abs=3) message3 = """ for op in operations: @@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords(): def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) def test_linear_english_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(0, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(0, abs=3) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(3, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(3, abs=1) def test_linear_english_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(14, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(14, abs=3) def test_linear_english_text_with_rare_character(): @@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character(): def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=10) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(10, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(10, abs=10) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=5) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=12) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=0) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=6) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=2) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=9) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=1) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=6) def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=8) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(0, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(0, abs=9) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(3, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(3, abs=7) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=8) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(14, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(14, abs=9) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=3) \ No newline at end of file + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=4) \ No newline at end of file diff --git a/tests/test_tiktoken_r50.py b/tests/test_tiktoken_r50.py index fff6f30..bbb4fd1 100644 --- a/tests/test_tiktoken_r50.py +++ b/tests/test_tiktoken_r50.py @@ -26,24 +26,24 @@ def test_linear_english_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=4) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=2) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(30, abs=15) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(30, abs=16) def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=4) def test_linear_english_text_empty_and_whitespace(): @@ -53,7 +53,7 @@ def test_linear_english_text_empty_and_whitespace(): message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=3) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=5) def test_linear_english_text_with_long_word(): @@ -66,38 +66,38 @@ def test_linear_english_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(18, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(18, abs=7) def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(6, abs=7) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(6, abs=21) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(12, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(12, abs=20) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(8, abs=10) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(8, abs=23) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=3) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=16) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=5) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=18) message3 = """ for op in operations: @@ -110,23 +110,23 @@ def test_linear_all_code_with_keywords(): def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=16) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(0, abs=5) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(0, abs=18) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=14) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(7, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(7, abs=15) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(15, abs=7) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(15, abs=21) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(18, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(18, abs=14)