
Commit a9823db

Add TokenCount metric (#74)

* Add TokenCount metric
* token count docs
* update readme + bump version

Co-authored-by: yisz

1 parent: 96de984

7 files changed: +99 -21 lines


README.md

Lines changed: 8 additions & 12 deletions
@@ -18,14 +18,14 @@
 </div>

 <h2 align="center">
-<p>Production-Grade Evaluation for LLM-Powered Applications</p>
+<p>Data-Driven Evaluation for LLM-Powered Applications</p>
 </h2>



 ## Overview

-`continuous-eval` is an open-source package created for granular and rigorous evaluation of LLM-powered application.
+`continuous-eval` is an open-source package created for data-driven evaluation of LLM-powered applications.

 <h1 align="center">
 <img
@@ -63,7 +63,7 @@ To run LLM-based metrics, the code requires at least one of the LLM API keys in
 ## Run a single metric

 Here's how you run a single metric on a datum.
-Check all available metrics here: [link](https://docs.relari.ai/)
+Check all available metrics here: [link](https://continuous-eval.docs.relari.ai/)

 ```python
 from continuous_eval.metrics.retrieval import PrecisionRecallF1
@@ -95,7 +95,7 @@ print(metric(**datum))
 <tr>
 <td rowspan="2">Retrieval</td>
 <td>Deterministic</td>
-<td>PrecisionRecallF1, RankedRetrievalMetrics</td>
+<td>PrecisionRecallF1, RankedRetrievalMetrics, TokenCount</td>
 </tr>
 <tr>
 <td>LLM-based</td>
@@ -222,21 +222,17 @@ metrics = evalrunner.evaluate(dataset)
 ## Synthetic Data Generation

 Ground truth data, or reference data, is important for evaluation as it can offer a comprehensive and consistent measurement of system performance. However, it is often costly and time-consuming to manually curate such a golden dataset.
-We have created a synthetic data pipeline that can custom generate user interaction data for a variety of use cases such as RAG, agents, copilots. They can serve a starting point for a golden dataset for evaluation or for other training purposes. Below is an example for Coding Agents.
+We have created a synthetic data pipeline that can generate custom user interaction data for a variety of use cases such as RAG, agents, and copilots. These datasets can serve as a starting point for a golden evaluation dataset or for other training purposes.

-<h1 align="center">
-<img
-src="docs/public/synthetic-data-demo.png"
->
-</h1>
+To generate custom synthetic data, please visit [Relari](https://www.relari.ai/) to create a free account; you can then generate custom synthetic golden datasets through the Relari Cloud.

 ## 💡 Contributing

 Interested in contributing? See our [Contribution Guide](CONTRIBUTING.md) for more details.

 ## Resources

-- **Docs:** [link](https://docs.relari.ai/)
+- **Docs:** [link](https://continuous-eval.docs.relari.ai/)
 - **Examples Repo**: [end-to-end example repo](https://github.com/relari-ai/examples)
 - **Blog Posts:**
   - Practical Guide to RAG Pipeline Evaluation: [Part 1: Retrieval](https://medium.com/relari/a-practical-guide-to-rag-pipeline-evaluation-part-1-27a472b09893), [Part 2: Generation](https://medium.com/relari/a-practical-guide-to-rag-evaluation-part-2-generation-c79b1bde0f5d)
@@ -246,7 +242,7 @@ Interested in contributing? See our [Contribution Guide](CONTRIBUTING.md) for mo
   - How to Make the Most Out of LLM Production Data: Simulated User Feedback [(link)](https://medium.com/towards-data-science/how-to-make-the-most-out-of-llm-production-data-simulated-user-feedback-843c444febc7)
   - Generate Synthetic Data to Test LLM Applications [(link)](https://medium.com/relari/generate-synthetic-data-to-test-llm-applications-4bffeb51b80e)
 - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
-- **Reach out to founders:** [Email](mailto:[email protected]) or [Schedule a chat](https://cal.com/pasquale/continuous-eval)
+- **Reach out to founders:** [Email](mailto:[email protected]) or [Schedule a chat](https://cal.com/relari/demo)

 ## License
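The "Run a single metric" hunk above ends right after the import in the README code block. For orientation, here is a minimal sketch of how such a metric is typically invoked; the matching strategy (`RougeChunkMatch`) and the exact datum fields are assumptions based on the classes this package exports, not a verbatim copy of the README example.

```python
from continuous_eval.metrics.retrieval import PrecisionRecallF1, RougeChunkMatch

datum = {
    "retrieved_context": [
        "Paris is the capital of France and also the largest city in the country.",
    ],
    "ground_truth_context": ["Paris is the capital of France."],
}

# Assumed usage: the metric is parameterized by a chunk-matching strategy
# and returns a dict of precision, recall, and F1 scores.
metric = PrecisionRecallF1(RougeChunkMatch())
print(metric(**datum))
```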
continuous_eval/metrics/retrieval/__init__.py

Lines changed: 7 additions & 6 deletions
@@ -1,12 +1,13 @@
-from continuous_eval.metrics.retrieval.precision_recall_f1 import PrecisionRecallF1
-from continuous_eval.metrics.retrieval.ranked import RankedRetrievalMetrics
+from continuous_eval.metrics.retrieval.llm_based import (
+    LLMBasedContextCoverage,
+    LLMBasedContextPrecision,
+)
 from continuous_eval.metrics.retrieval.matching_strategy import (
     ExactChunkMatch,
     ExactSentenceMatch,
     RougeChunkMatch,
     RougeSentenceMatch,
 )
-from continuous_eval.metrics.retrieval.llm_based import (
-    LLMBasedContextCoverage,
-    LLMBasedContextPrecision,
-)
+from continuous_eval.metrics.retrieval.precision_recall_f1 import PrecisionRecallF1
+from continuous_eval.metrics.retrieval.ranked import RankedRetrievalMetrics
+from continuous_eval.metrics.retrieval.tokens import TokenCount
continuous_eval/metrics/retrieval/tokens.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+import tiktoken
+
+from continuous_eval.metrics.base import Metric
+
+_CHARACTERS_PER_TOKEN = 4.0
+
+
+class TokenCount(Metric):
+    def __init__(self, encoder_name: str) -> None:
+        super().__init__()
+        if encoder_name == "approx":
+            self._encoder = None
+        else:
+            try:
+                self._encoder = tiktoken.get_encoding(encoder_name)
+            except ValueError:
+                raise ValueError(f"Invalid encoder name: {encoder_name}")
+
+    def __call__(self, retrieved_context, **kwargs):
+        ctx = "\n".join(retrieved_context)
+        if self._encoder is None:
+            num_tokens = int(len(ctx) / _CHARACTERS_PER_TOKEN)
+        else:
+            num_tokens = len(self._encoder.encode(ctx))
+        return {"num_tokens": num_tokens}
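For reference, a minimal usage sketch of the class added above, assuming the package and `tiktoken` are installed and the `cl100k_base` encoding can be loaded; the example sentence is illustrative only.

```python
from continuous_eval.metrics.retrieval import TokenCount

context = ["The quick brown fox jumps over the lazy dog."]  # 44 characters

# Exact count via a tiktoken encoding (any valid encoding name works).
exact = TokenCount(encoder_name="cl100k_base")
print(exact(retrieved_context=context))   # {'num_tokens': ...}, encoder-dependent

# Approximate count: 1 token per 4 characters, no tokenizer needed.
approx = TokenCount(encoder_name="approx")
print(approx(retrieved_context=context))  # {'num_tokens': 11}, i.e. int(44 / 4)

# An unrecognized encoder name raises ValueError:
# TokenCount(encoder_name="not-a-real-encoding")
```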
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+---
+title: Token Count
+---
+
+### Definitions
+
+Token Count calculates the number of tokens used in the retrieved context.
+
+A required input for the metric is the `encoder_name` for tiktoken.
+
+For example, for the most recent OpenAI models you would use `cl100k_base` as the encoder. For other models, look up the specific tokenizer used; alternatively, you can use `approx` to get an approximate token count that assumes 1 token per 4 characters.
+
+:::tip
+**Tokens in `retrieved_context` often account for the majority of LLM token usage in a RAG application.**
+Token count is useful to track if you are concerned about LLM cost, context window limits, or performance issues caused by low context precision (such as "needle-in-a-haystack" problems).
+:::
+
+Required data items: `retrieved_context`
+
+```python
+from continuous_eval.metrics.retrieval import TokenCount
+
+datum = {
+    "retrieved_context": [
+        "Lyon is a major city in France.",
+        "Paris is the capital of France and also the largest city in the country.",
+    ],
+    "ground_truth_context": ["Paris is the capital of France."],
+}
+
+metric = TokenCount(encoder_name="cl100k_base")
+print(metric(**datum))
+```
+
+### Example Output
+
+```JSON
+{
+    'num_tokens': 24,
+}
+```
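As a complement to the example above, the `approx` mode can be run on the same datum; under the 1-token-per-4-characters heuristic the joined context (104 characters) yields 26 tokens. A brief sketch:

```python
from continuous_eval.metrics.retrieval import TokenCount

datum = {
    "retrieved_context": [
        "Lyon is a major city in France.",
        "Paris is the capital of France and also the largest city in the country.",
    ],
}

# Joined context is 104 characters; int(104 / 4) -> 26 tokens under the heuristic.
metric = TokenCount(encoder_name="approx")
print(metric(**datum))  # {'num_tokens': 26}
```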

docs/src/content/docs/metrics/overview.md

Lines changed: 5 additions & 1 deletion
@@ -35,7 +35,7 @@ Below is the list of metrics available:
 <tr>
 <td rowspan="2">Retrieval</td>
 <td>Deterministic</td>
-<td>PrecisionRecallF1, RankedRetrievalMetrics</td>
+<td>PrecisionRecallF1, RankedRetrievalMetrics, TokenCount</td>
 </tr>
 <tr>
 <td>LLM-based</td>
@@ -93,6 +93,10 @@ Below is the list of metrics available:
 - **Definition:** Rank-aware metrics including Mean Average Precision (MAP), Mean Reciprocal Rank (MRR), and NDCG (Normalized Discounted Cumulative Gain) of retrieved contexts
 - **Inputs:** `retrieved_context`, `ground_truth_context`

+**`TokenCount`**
+- **Definition:** Counts the number of tokens in the retrieved context.
+- **Inputs:** `retrieved_context`
+
 ##### LLM-based

 **`LLMBasedContextPrecision`**

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "continuous-eval"
-version = "0.3.12"
+version = "0.3.13"
 description = "Open-Source Evaluation for GenAI Application Pipelines."
 authors = ["Yi Zhang <[email protected]>", "Pasquale Antonante <[email protected]>"]
 readme = "README.md"

tests/retrieval_metrics_test.py

Lines changed: 12 additions & 1 deletion
@@ -9,9 +9,10 @@
     RankedRetrievalMetrics,
     RougeChunkMatch,
     RougeSentenceMatch,
+    TokenCount,
 )
 from tests.helpers import example_datum
-from tests.helpers.utils import all_close, in_zero_one, list_of_dicts_to_dict_of_lists
+from tests.helpers.utils import all_close, in_zero_one


 def test_precision_recall_exact_chunk_match():
@@ -75,3 +76,13 @@ def test_llm_based_context_coverage_openai():

     metric = LLMBasedContextCoverage(model=LLMFactory("gpt-3.5-turbo-1106"))
     assert all(in_zero_one(metric(**datum)["LLM_based_context_coverage"]) for datum in data)
+
+
+def test_token_count():
+    data = [example_datum.CAPITAL_OF_FRANCE, example_datum.ROMEO_AND_JULIET]
+    metric = TokenCount("o200k_base")
+    expected = [17, 16]
+    assert (result := [metric(**datum)["num_tokens"] for datum in data]) == expected, result
+    expected = [17, 18]
+    metric = TokenCount("approx")
+    assert (result := [metric(**datum)["num_tokens"] for datum in data]) == expected, result
