diff --git a/README.md b/README.md index 2c46c0b..b5c06eb 100644 --- a/README.md +++ b/README.md @@ -72,33 +72,32 @@ ToCount is a lightweight and extensible Python library for estimating token coun ### Rule-Based - -| Model Name | MAE | MSE | R² | -|----------------------------|---------|-------------|--------| -| `RULE_BASED.UNIVERSAL` | 106.70 | 381,647.81 | 0.8175 | -| `RULE_BASED.GPT_4` | 152.34 | 571,795.89 | 0.7266 | -| `RULE_BASED.GPT_3_5` | 161.93 | 652,923.59 | 0.6878 | +| Model Name | R² | MAE | MSE | +|----------------------------|--------|----------|---------------| +| `RULE_BASED.UNIVERSAL` | 0.8175 | 106.70 | 381,647.81 | +| `RULE_BASED.GPT_4` | 0.7266 | 152.34 | 571,795.89 | +| `RULE_BASED.GPT_3_5` | 0.6878 | 161.93 | 652,923.59 | ### Tiktoken R50K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_R50K.LINEAR_ALL` | 71.38 | 183897.01 | 0.8941 | -| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 23.35 | 14127.92 | 0.9887 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|--------------------------------|--------|----------|---------------|------------------|--------------| +| `TIKTOKEN_R50K.LINEAR_ALL` | 0.7334 | 152.39 | 537,877.36 | 28.55 | 0.4826 | +| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 0.8703 | 62.76 | 258,271.50 | 8.87 | 0.7287 | ### Tiktoken CL100K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_CL100K.LINEAR_ALL` | 41.85 | 47949.48 | 0.9545 | -| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 21.12 | 17597.20 | 0.9839 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|----------------------------------|--------|-------|-------------|------------------|--------------| +| `TIKTOKEN_CL100K.LINEAR_ALL` | 0.9127 | 64.09 | 88,814.15 | 15.73 | 0.6804 | +| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 0.9711 | 27.43 | 34,249.15 | 6.34 | 0.8527 | ### Tiktoken O200K -| Model Name | MAE | MSE | R² | -|-------------------------------------|---------|-------------|--------| -| `TIKTOKEN_O200K.LINEAR_ALL` | 25.53 | 20195.32 | 0.9777 | -| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 20.24 | 15887.99 | 0.9859 | +| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score | +|----------------------------------|--------|-------|-------------|------------------|--------------| +| `TIKTOKEN_O200K.LINEAR_ALL` | 0.9563 | 38.23 | 38,872.42 | 9.70 | 0.7818 | +| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 0.9730 | 26.00 | 31,519.73 | 5.96 | 0.8581 | ℹ️ The training and testing dataset is taken from Lmsys-chat-1m [1] and Wildchat [2]. diff --git a/tests/test_tiktoken_cl100k.py b/tests/test_tiktoken_cl100k.py index b007cbf..978a4d2 100644 --- a/tests/test_tiktoken_cl100k.py +++ b/tests/test_tiktoken_cl100k.py @@ -5,21 +5,21 @@ def test_linear_english_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=5) def test_linear_english_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=5) def test_linear_english_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=6) def test_linear_english_code_with_keywords(): @@ -30,7 +30,7 @@ def test_linear_english_code_with_keywords(): message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=3) message3 = """ for op in operations: @@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords(): def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) def test_linear_english_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=1) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=4) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=2) def test_linear_english_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=4) def test_linear_english_text_with_rare_character(): @@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character(): def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=16) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=16) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=8) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=18) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=3) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=12) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=3) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=12) def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=5) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=14) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=5) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=14) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=3) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=12) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=15) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=1) \ No newline at end of file + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=10) diff --git a/tests/test_tiktoken_o200k.py b/tests/test_tiktoken_o200k.py index 4035fb9..1bc0c39 100644 --- a/tests/test_tiktoken_o200k.py +++ b/tests/test_tiktoken_o200k.py @@ -5,32 +5,32 @@ def test_linear_english_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=4) def test_linear_english_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(10, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(10, abs=4) def test_linear_english_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=4) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=6) def test_linear_english_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=0) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(8, abs=1) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(4, abs=2) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(4, abs=3) message3 = """ for op in operations: @@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords(): def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(6, abs=3) def test_linear_english_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(0, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(0, abs=3) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(3, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(3, abs=1) def test_linear_english_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(14, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ENGLISH) == pytest.approx(14, abs=3) def test_linear_english_text_with_rare_character(): @@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character(): def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=10) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(10, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(10, abs=10) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=5) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=12) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=0) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(8, abs=6) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=2) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=9) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=1) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=6) def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(6, abs=8) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(0, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(0, abs=9) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(3, abs=0) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(3, abs=7) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=1) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(4, abs=8) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(14, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(14, abs=9) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=3) \ No newline at end of file + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_O200K.LINEAR_ALL) == pytest.approx(16, abs=4) diff --git a/tests/test_tiktoken_r50.py b/tests/test_tiktoken_r50.py index fff6f30..bbb4fd1 100644 --- a/tests/test_tiktoken_r50.py +++ b/tests/test_tiktoken_r50.py @@ -26,24 +26,24 @@ def test_linear_english_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=2) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=4) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=2) message3 = """ for op in operations: if op.type == "SELECT": """ # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(30, abs=15) + assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(30, abs=16) def test_linear_english_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(9, abs=4) def test_linear_english_text_empty_and_whitespace(): @@ -53,7 +53,7 @@ def test_linear_english_text_empty_and_whitespace(): message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=3) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(5, abs=5) def test_linear_english_text_with_long_word(): @@ -66,38 +66,38 @@ def test_linear_english_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(18, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ENGLISH) == pytest.approx(18, abs=7) def test_linear_all_text_with_simple_prompt(): message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(6, abs=7) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(6, abs=21) def test_linear_all_text_with_contractions(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0 message = "I’m refining a foolproof method for reality shifting" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(12, abs=6) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(12, abs=20) def test_linear_all_text_with_prefixes_and_suffixes(): # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10 message = "reflecting the hardships of the preparation process" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(8, abs=10) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(8, abs=23) def test_linear_all_code_with_keywords(): # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 message1 = "def __init__(self, schema):" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=3) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=16) message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=5) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=18) message3 = """ for op in operations: @@ -110,23 +110,23 @@ def test_linear_all_code_with_keywords(): def test_linear_all_code_with_variable_names(): message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=3) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(9, abs=16) def test_linear_all_text_empty_and_whitespace(): message1 = "" assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(0, abs=5) + assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(0, abs=18) message2 = " \t \n " assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=1) + assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(5, abs=14) def test_linear_all_text_non_english_with_special_chars(): message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13 assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(7, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(7, abs=15) def test_linear_all_text_non_english(): @@ -138,11 +138,11 @@ def test_linear_all_text_non_english(): def test_linear_all_text_with_long_word(): message = "This is a verylongwordwithoutspaces and should be counted properly." assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(15, abs=7) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(15, abs=21) def test_linear_all_text_with_rare_character(): # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18 message = "What is the smallest possible value for P[A ∩ B ∩ C]?" assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL), int) - assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(18, abs=2) + assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_R50K.LINEAR_ALL) == pytest.approx(18, abs=14) diff --git a/tocount/params.py b/tocount/params.py index 5e9ad0b..20babdf 100644 --- a/tocount/params.py +++ b/tocount/params.py @@ -14,39 +14,39 @@ TIKTOKEN_R50K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.22027472695240083, "b": 1.30984549875905421}, - "input_scaler": {"mean": 847.18595335180884, "scale": 4824.54596296361160}, - "output_scaler": {"mean": 191.91873679585714, "scale": 1122.03854916642285} + "coefficient": {"a": 0.24220021827364216, "b": -1.52512159607669773}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 254.89806628634627828, "scale": 1446.92651162795687014} }, "all": { - "coefficient": {"a": 0.24897308965467127, "b": 4.54308265105588305}, - "input_scaler": {"mean": 863.91052735502114, "scale": 4579.14607319174774}, - "output_scaler": {"mean": 250.55580827419274, "scale": 1317.83991440127875} + "coefficient": {"a": 0.26949633800191791, "b": 17.71983908874145186}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 320.20259580719863379, "scale": 1446.23196906494854375} } } TIKTOKEN_CL100K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.20632774595922751, "b": 1.31582377652722826}, - "input_scaler": {"mean": 928.01351455812346, "scale": 4839.45514713105058}, - "output_scaler": {"mean": 198.34363306972855, "scale": 1087.61891525056103} + "coefficient": {"a": 0.21207829974544795, "b": 3.61015453257535057}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 208.97662180989638614, "scale": 1160.24193688094055688} }, "all": { - "coefficient": {"a": 0.22359382657517404, "b": 4.81058433875418601}, - "input_scaler": {"mean": 874.16460544630535, "scale": 4486.74238683014846}, - "output_scaler": {"mean": 213.81428929203110, "scale": 1078.26297169722625} + "coefficient": {"a": 0.22389270545161979, "b": 14.24559780994757219}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 221.90881687060880267, "scale": 1055.65522548552621629} } } TIKTOKEN_O200K_LINEAR_MODELS = { "english": { - "coefficient": {"a": 0.20354485735834993, "b": 2.08764234347103361}, - "input_scaler": {"mean": 923.75157809972700, "scale": 4843.14030162006293}, - "output_scaler": {"mean": 194.41034579748791, "scale": 1073.00614112992844} + "coefficient": {"a": 0.20934150948723654, "b": 3.23697987353031991}, + "input_scaler": {"mean": 944.83131738824351942, "scale": 5021.64260895033748966}, + "output_scaler": {"mean": 205.52642979710270765, "scale": 1144.67974411186628458} }, "all": { - "coefficient": {"a": 0.21511955690162138, "b": 1.71656955330649552}, - "input_scaler": {"mean": 859.61614585211419, "scale": 4397.61706792694440}, - "output_scaler": {"mean": 191.96748588283666, "scale": 1006.41246761102514} + "coefficient": {"a": 0.21634871429041430, "b": 8.52848758076195246}, + "input_scaler": {"mean": 807.95457802727003127, "scale": 4239.81308570276723913}, + "output_scaler": {"mean": 194.13328834356752850, "scale": 993.46453791503881803} } }