
Commit 344f7e9

Transition to eval runner (#61)
- Removed EvaluationManager
- Introduced EvaluationRunner and PipelineLogger
- Updated documentation, including changes in README.md
- Refactored codebase for new class integration
- Fixed various bugs

1 parent 5c152c2 commit 344f7e9

31 files changed: +2456, -2113 lines

README.md

Lines changed: 19 additions & 17 deletions
@@ -18,7 +18,7 @@
 </div>

 <h2 align="center">
-<p>Open-Source Evaluation for GenAI Application Pipelines</p>
+<p>Open-Source Evaluation for GenAI Applications</p>
 </h2>


@@ -143,12 +143,13 @@ print(metric(**datum))
 To define your own metrics, you only need to extend the [Metric](continuous_eval/metrics/base.py#L23C7-L23C13) class implementing the `__call__` method.
 Optional methods are `batch` (if it is possible to implement optimizations for batch processing) and `aggregate` (to aggregate metric results over multiple samples).

-## Run evaluation on pipeline modules
+## Run evaluation on a pipeline

 Define modules in your pipeline and select corresponding metrics.

 ```python
-from continuous_eval.eval import Module, ModuleOutput, Pipeline, Dataset
+from continuous_eval.eval import Module, ModuleOutput, Pipeline, Dataset, EvaluationRunner
+from continuous_eval.eval.logger import PipelineLogger
 from continuous_eval.metrics.retrieval import PrecisionRecallF1, RankedRetrievalMetrics
 from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness
 from typing import List, Dict

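For orientation, here is a minimal custom-metric sketch consistent with the paragraph above. It assumes only what the README states (subclass `Metric`, implement `__call__`, optionally override `aggregate`); the field names and returned keys are illustrative, not the library's required schema.

```python
# Hypothetical example of a custom metric (field names are assumptions).
from continuous_eval.metrics.base import Metric


class ExactMatch(Metric):
    def __call__(self, answer: str, ground_truth: str, **kwargs):
        # Return metric values for a single sample (illustrative schema)
        return {"exact_match": float(answer.strip() == ground_truth.strip())}

    def aggregate(self, results):
        # Optional: average per-sample scores over the dataset (assumed signature)
        scores = [r["exact_match"] for r in results]
        return {"exact_match": sum(scores) / len(scores) if scores else 0.0}
```
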
@@ -199,25 +200,24 @@ print(pipeline.graph_repr()) # optional: visualize the pipeline
 Now you can run the evaluation on your pipeline

 ```python
-eval_manager.start_run()
-while eval_manager.is_running():
-    if eval_manager.curr_sample is None:
-        break
-    q = eval_manager.curr_sample["question"] # get the question or any other field
-    # run your pipeline ...
-    eval_manager.next_sample()
-```
+pipelog = PipelineLogger(pipeline=pipeline)

-To **log** the results you just need to call the `eval_manager.log` method with the module name and the output, for example:
+# now run your LLM application pipeline, and for each module, log the results:
+pipelog.log(uid=sample_uid, module="module_name", value=data)

-```python
-eval_manager.log("answer_generator", response)
+# Once you finish logging the data, you can use the EvaluationRunner to evaluate the logs
+evalrunner = EvaluationRunner(pipeline)
+metrics = evalrunner.evaluate(pipelog)
+metrics.results() # returns a dictionary with the results
 ```

-The evaluator manager also offers
+To run evaluation over an existing dataset (BYODataset), you can run the following:

-- `eval_manager.run_metrics()` to run all the metrics defined in the pipeline
-- `eval_manager.run_tests()` to run the tests defined in the pipeline (see the documentation [docs](docs.relari.ai) for more details)
+```python
+dataset = Dataset(...)
+evalrunner = EvaluationRunner(pipeline)
+metrics = evalrunner.evaluate(dataset)
+```

 ## Synthetic Data Generation

@@ -244,6 +244,8 @@ integrations that build on the core are both accepted and highly encouraged! See
 - How important is a Golden Dataset for LLM evaluation?
   [(link)](https://medium.com/relari/how-important-is-a-golden-dataset-for-llm-pipeline-evaluation-4ef6deb14dc5)
 - How to evaluate complex GenAI Apps: a granular approach [(link)](https://medium.com/relari/how-to-evaluate-complex-genai-apps-a-granular-approach-0ab929d5b3e2)
+- How to Make the Most Out of LLM Production Data: Simulated User Feedback [(link)](https://medium.com/towards-data-science/how-to-make-the-most-out-of-llm-production-data-simulated-user-feedback-843c444febc7)
+- Generate Synthetic Data to Test LLM Applications [(link)](https://medium.com/relari/generate-synthetic-data-to-test-llm-applications-4bffeb51b80e)
 - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
 - **Reach out to founders:** [Email](mailto:[email protected]) or [Schedule a chat](https://cal.com/pasquale/continuous-eval)
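
To make the new logging-and-evaluation flow from the README changes above concrete, here is a hedged end-to-end sketch. `PipelineLogger`, `EvaluationRunner`, `log`, `evaluate`, and `results` come from the README snippet itself; the module name `answer_generator`, the `uid`/`question` fields, the `dataset.data` iteration, and `my_rag_app` are illustrative assumptions about a typical pipeline.

```python
# Sketch only: assumes `pipeline` and `dataset` were defined as in the README,
# and that each dataset record carries a unique id and a "question" field.
from continuous_eval.eval import EvaluationRunner
from continuous_eval.eval.logger import PipelineLogger

pipelog = PipelineLogger(pipeline=pipeline)

for datum in dataset.data:                    # hypothetical iteration over samples
    uid = datum["uid"]                        # hypothetical unique-id field
    response = my_rag_app(datum["question"])  # your application code (placeholder)
    pipelog.log(uid=uid, module="answer_generator", value=response)

pipelog.save("eval_log.jsonl")                # optional: persist logs to JSONL

evalrunner = EvaluationRunner(pipeline)
metrics = evalrunner.evaluate(pipelog)
print(metrics.results())                      # dictionary of per-metric results
```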

continuous_eval/data_downloader.py

Lines changed: 2 additions & 2 deletions
@@ -69,8 +69,8 @@ def example_data_downloader(
         out_dir = destination_dir / resource
         return _download_and_extract_zip(EXAMPLES_DATA_URL + res["filename"], out_dir, force_download=force_download)
     elif res["type"] == "chromadb":
-        from langchain.embeddings.openai import OpenAIEmbeddings
-        from langchain.vectorstores import Chroma
+        from langchain_chroma import Chroma
+        from langchain_openai import OpenAIEmbeddings

         out_dir = destination_dir / resource
         _download_and_extract_zip(EXAMPLES_DATA_URL + res["filename"], out_dir, force_download=force_download)
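
Since this hunk only swaps import paths (the deprecated `langchain.*` modules for the split `langchain-chroma` and `langchain-openai` packages), a short sketch of how the new imports are typically wired together may help; the persist directory, query, and `k` value are placeholders, not part of this function.

```python
# Hypothetical usage of the updated imports
# (requires `pip install langchain-chroma langchain-openai` and an OPENAI_API_KEY).
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Open a downloaded Chroma collection with OpenAI embeddings (path is a placeholder)
vectorstore = Chroma(
    persist_directory="path/to/chromadb",
    embedding_function=OpenAIEmbeddings(),
)
retrieved = vectorstore.similarity_search("example query", k=3)
```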

continuous_eval/eval/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@
     CalledTools,
 )
 from continuous_eval.eval.dataset import Dataset
+from continuous_eval.eval.runner import EvaluationRunner

continuous_eval/eval/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -5,11 +5,11 @@

 import yaml

-from continuous_eval.eval.types import UUID, ToolCall
+from continuous_eval.eval.types import UID, ToolCall
 from continuous_eval.eval.utils import type_hint_to_str

 _SAFE_DICT = {k: v for k, v in typing.__dict__.items() if not k.startswith("__")}
-_SAFE_DICT["UUID"] = UUID
+_SAFE_DICT["UID"] = UID
 _SAFE_DICT["ToolCall"] = ToolCall


continuous_eval/eval/logger.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import json
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from continuous_eval.eval.modules import AgentModule
+from continuous_eval.eval.pipeline import Pipeline
+from continuous_eval.eval.result_types import TOOL_PREFIX
+from continuous_eval.eval.utils import instantiate_type
+from continuous_eval.utils.telemetry import telemetry_event
+
+logger = logging.getLogger("eval-manager")
+Serializable = Any
+
+
+class LogMode(Enum):
+    APPEND = 0
+    REPLACE = 1
+
+
+class PipelineLogger:
+    @telemetry_event("logger")
+    def __init__(self, pipeline: Optional[Pipeline] = None):
+        self._pipeline: Optional[Pipeline] = pipeline
+        self.data = dict()
+
+    @property
+    def pipeline(self) -> Pipeline:
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        return self._pipeline
+
+    def _empty_sample(self):
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        empty_samples = dict()
+        for module in self._pipeline.modules:
+            empty_samples[module.name] = instantiate_type(module.output)
+            if isinstance(module, AgentModule):
+                empty_samples[f"{TOOL_PREFIX}{module.name}"] = list()
+        return empty_samples
+
+    def log(
+        self,
+        uid: Serializable,
+        module: str,
+        value: Any,
+        mode: LogMode = LogMode.REPLACE,
+        **kwargs,
+    ):
+        # Make sure everything looks good
+        assert uid is not None, "UID cannot be None"
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        if uid not in self.data:
+            self.data[uid] = self._empty_sample()
+        if kwargs and "tool_args" in kwargs:
+            key = f"{TOOL_PREFIX}{module}"
+            self.data[uid][key].append({"name": value, "kwargs": kwargs["tool_args"]})
+        else:
+            if mode == LogMode.REPLACE:
+                self.data[uid][module] = value
+            elif mode == LogMode.APPEND:
+                if isinstance(self.data[uid][module], list):
+                    if isinstance(value, list):
+                        self.data[uid][module].extend(value)
+                    else:
+                        self.data[uid][module].append(value)
+                else:
+                    self.data[uid][module].add(value)
+
+    def save(self, filepath: Union[str, Path]):
+        if isinstance(filepath, str):
+            filepath = Path(filepath)
+        assert filepath.suffix == ".jsonl", "File must be a JSONL file"
+        assert self.data, "No samples to save"
+        with open(filepath, "w") as f:
+            for uid, res in self.data.items():
+                line = {**{"__uid": uid}, **res}
+                json_record = json.dumps(line, ensure_ascii=False)
+                f.write(json_record + "\n")
+
+    def load(self, filepath: Union[str, Path]):
+        if isinstance(filepath, str):
+            filepath = Path(filepath)
+        assert filepath.suffix == ".jsonl", "File must be a JSONL file"
+        with open(filepath, "r") as f:
+            for line in f:
+                record = json.loads(line)
+                uid = record.pop("__uid")
+                self.data[uid] = record
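
As a companion to the class above, a brief usage sketch of the logging flow it implements; `pipeline` is assumed to be a configured `Pipeline`, and the module names, uid, and file path are hypothetical. The tool-call line follows the `tool_args` branch of `log` and assumes the "agent" module is an `AgentModule`.

```python
# Hypothetical usage of PipelineLogger (module names and uid are placeholders).
from continuous_eval.eval.logger import LogMode, PipelineLogger

pipelog = PipelineLogger(pipeline=pipeline)

# Default REPLACE semantics overwrite the module output; APPEND extends list outputs
pipelog.log(uid="sample-1", module="retriever", value=["chunk A", "chunk B"])
pipelog.log(uid="sample-1", module="retriever", value=["chunk C"], mode=LogMode.APPEND)
pipelog.log(uid="sample-1", module="answer_generator", value="Paris")

# Agent modules: passing tool_args stores the call under the tool-prefixed key
pipelog.log(uid="sample-1", module="agent", value="search_web",
            tool_args={"query": "capital of France"})

# Persist and reload as JSONL (the .jsonl suffix is enforced by save/load)
pipelog.save("pipeline_log.jsonl")
pipelog.load("pipeline_log.jsonl")
```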
