Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 17 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,33 +72,32 @@ ToCount is a lightweight and extensible Python library for estimating token coun

### Rule-Based


| Model Name | MAE | MSE | R² |
|----------------------------|---------|-------------|--------|
| `RULE_BASED.UNIVERSAL` | 106.70 | 381,647.81 | 0.8175 |
| `RULE_BASED.GPT_4` | 152.34 | 571,795.89 | 0.7266 |
| `RULE_BASED.GPT_3_5` | 161.93 | 652,923.59 | 0.6878 |
| Model Name | R² | MAE | MSE |
|----------------------------|--------|----------|---------------|
| `RULE_BASED.UNIVERSAL` | 0.8175 | 106.70 | 381,647.81 |
| `RULE_BASED.GPT_4` | 0.7266 | 152.34 | 571,795.89 |
| `RULE_BASED.GPT_3_5` | 0.6878 | 161.93 | 652,923.59 |

### Tiktoken R50K

| Model Name | MAE | MSE | R² |
|-------------------------------------|---------|-------------|--------|
| `TIKTOKEN_R50K.LINEAR_ALL` | 71.38 | 183897.01 | 0.8941 |
| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 23.35 | 14127.92 | 0.9887 |
| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score |
|--------------------------------|--------|----------|---------------|------------------|--------------|
| `TIKTOKEN_R50K.LINEAR_ALL` | 0.7334 | 152.39 | 537,877.36 | 28.55 | 0.4826 |
| `TIKTOKEN_R50K.LINEAR_ENGLISH` | 0.8703 | 62.76 | 258,271.50 | 8.87 | 0.7287 |

### Tiktoken CL100K

| Model Name | MAE | MSE | R² |
|-------------------------------------|---------|-------------|--------|
| `TIKTOKEN_CL100K.LINEAR_ALL` | 41.85 | 47949.48 | 0.9545 |
| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 21.12 | 17597.20 | 0.9839 |
| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score |
|----------------------------------|--------|-------|-------------|------------------|--------------|
| `TIKTOKEN_CL100K.LINEAR_ALL` | 0.9127 | 64.09 | 88,814.15 | 15.73 | 0.6804 |
| `TIKTOKEN_CL100K.LINEAR_ENGLISH` | 0.9711 | 27.43 | 34,249.15 | 6.34 | 0.8527 |

### Tiktoken O200K

| Model Name | MAE | MSE | R² |
|-------------------------------------|---------|-------------|--------|
| `TIKTOKEN_O200K.LINEAR_ALL` | 25.53 | 20195.32 | 0.9777 |
| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 20.24 | 15887.99 | 0.9859 |
| Model Name | R² | MAE | MSE | Median Abs Error | D² Abs Score |
|----------------------------------|--------|-------|-------------|------------------|--------------|
| `TIKTOKEN_O200K.LINEAR_ALL` | 0.9563 | 38.23 | 38,872.42 | 9.70 | 0.7818 |
| `TIKTOKEN_O200K.LINEAR_ENGLISH` | 0.9730 | 26.00 | 31,519.73 | 5.96 | 0.8581 |


ℹ️ The training and testing datasets are taken from LMSYS-Chat-1M [1] and WildChat [2].
Expand Down
40 changes: 20 additions & 20 deletions tests/test_tiktoken_cl100k.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
def test_linear_english_text_with_simple_prompt():
message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=2)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=5)


def test_linear_english_text_with_contractions():
# https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0
message = "I’m refining a foolproof method for reality shifting"
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=2)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(10, abs=5)


def test_linear_english_text_with_prefixes_and_suffixes():
# https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10
message = "reflecting the hardships of the preparation process"
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=4)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(8, abs=6)


def test_linear_english_code_with_keywords():
Expand All @@ -30,7 +30,7 @@ def test_linear_english_code_with_keywords():

message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=0)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(5, abs=3)

message3 = """
for op in operations:
Expand All @@ -43,23 +43,23 @@ def test_linear_english_code_with_keywords():
def test_linear_english_code_with_variable_names():
message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=1)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(6, abs=3)


def test_linear_english_text_empty_and_whitespace():
message1 = ""
assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=1)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(0, abs=4)

message2 = " \t \n "
assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=1)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(3, abs=2)


def test_linear_english_text_with_long_word():
message = "This is a verylongwordwithoutspaces and should be counted properly."
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=1)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ENGLISH) == pytest.approx(14, abs=4)


def test_linear_english_text_with_rare_character():
Expand All @@ -72,61 +72,61 @@ def test_linear_english_text_with_rare_character():
def test_linear_all_text_with_simple_prompt():
message = "You are the text completion model" # https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=2
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=6)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=16)


def test_linear_all_text_with_contractions():
# https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=0
message = "I’m refining a foolproof method for reality shifting"
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=6)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(10, abs=16)


def test_linear_all_text_with_prefixes_and_suffixes():
# https://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=10
message = "reflecting the hardships of the preparation process"
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=8)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=18)


def test_linear_all_code_with_keywords():
# http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
message1 = "def __init__(self, schema):"
assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=3)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(8, abs=12)

message2 = "class QueryPlanner:" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13)

message3 = """
for op in operations:
if op.type == "SELECT":
""" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
assert isinstance(estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=3)
assert estimate_text_tokens(message3, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=12)


def test_linear_all_code_with_variable_names():
message = "table_name = ast.table_name" # http://huggingface.co/datasets/lmsys/lmsys-chat-1m?conversation-viewer=19
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=5)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(6, abs=14)


def test_linear_all_text_empty_and_whitespace():
message1 = ""
assert isinstance(estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=5)
assert estimate_text_tokens(message1, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(0, abs=14)

message2 = " \t \n "
assert isinstance(estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=3)
assert estimate_text_tokens(message2, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(3, abs=12)


def test_linear_all_text_non_english_with_special_chars():
message = "versión británica" # https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=13
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=4)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(5, abs=13)


def test_linear_all_text_non_english():
Expand All @@ -138,11 +138,11 @@ def test_linear_all_text_non_english():
def test_linear_all_text_with_long_word():
message = "This is a verylongwordwithoutspaces and should be counted properly."
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=6)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(14, abs=15)


def test_linear_all_text_with_rare_character():
# https://huggingface.co/datasets/allenai/WildChat-1M?conversation-viewer=18
message = "What is the smallest possible value for P[A ∩ B ∩ C]?"
assert isinstance(estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL), int)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=1)
assert estimate_text_tokens(message, TextEstimator.TIKTOKEN_CL100K.LINEAR_ALL) == pytest.approx(16, abs=10)
Loading