
Commit 344f7e9

Transition to eval runner (#61)
- Removed EvaluationManager
- Introduced EvaluationRunner and PipelineLogger
- Updated documentation, including changes in README.md
- Refactored codebase for new class integration
- Fixed various bugs

1 parent 5c152c2 commit 344f7e9

31 files changed: +2456, -2113 lines

README.md

Lines changed: 19 additions & 17 deletions
@@ -18,7 +18,7 @@
 </div>

 <h2 align="center">
-<p>Open-Source Evaluation for GenAI Application Pipelines</p>
+<p>Open-Source Evaluation for GenAI Applications</p>
 </h2>


@@ -143,12 +143,13 @@ print(metric(**datum))
 To define your own metrics, you only need to extend the [Metric](continuous_eval/metrics/base.py#L23C7-L23C13) class implementing the `__call__` method.
 Optional methods are `batch` (if it is possible to implement optimizations for batch processing) and `aggregate` (to aggregate metric results over multiple samples).

-## Run evaluation on pipeline modules
+## Run evaluation on a pipeline

 Define modules in your pipeline and select corresponding metrics.

 ```python
-from continuous_eval.eval import Module, ModuleOutput, Pipeline, Dataset
+from continuous_eval.eval import Module, ModuleOutput, Pipeline, Dataset, EvaluationRunner
+from continuous_eval.eval.logger import PipelineLogger
 from continuous_eval.metrics.retrieval import PrecisionRecallF1, RankedRetrievalMetrics
 from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness
 from typing import List, Dict

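For orientation, here is a minimal custom-metric sketch consistent with the paragraph above. It assumes only what the README states (subclass `Metric`, implement `__call__`, optionally override `aggregate`); the field names and returned keys are illustrative, not the library's required schema.

```python
# Hypothetical example of a custom metric (field names are assumptions).
from continuous_eval.metrics.base import Metric


class ExactMatch(Metric):
    def __call__(self, answer: str, ground_truth: str, **kwargs):
        # Return metric values for a single sample (illustrative schema)
        return {"exact_match": float(answer.strip() == ground_truth.strip())}

    def aggregate(self, results):
        # Optional: average per-sample scores over the dataset (assumed signature)
        scores = [r["exact_match"] for r in results]
        return {"exact_match": sum(scores) / len(scores) if scores else 0.0}
```
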
@@ -199,25 +200,24 @@ print(pipeline.graph_repr()) # optional: visualize the pipeline
 Now you can run the evaluation on your pipeline

 ```python
-eval_manager.start_run()
-while eval_manager.is_running():
-    if eval_manager.curr_sample is None:
-        break
-    q = eval_manager.curr_sample["question"] # get the question or any other field
-    # run your pipeline ...
-    eval_manager.next_sample()
-```
+pipelog = PipelineLogger(pipeline=pipeline)

-To **log** the results you just need to call the `eval_manager.log` method with the module name and the output, for example:
+# now run your LLM application pipeline, and for each module, log the results:
+pipelog.log(uid=sample_uid, module="module_name", value=data)

-```python
-eval_manager.log("answer_generator", response)
+# Once you finish logging the data, you can use the EvaluationRunner to evaluate the logs
+evalrunner = EvaluationRunner(pipeline)
+metrics = evalrunner.evaluate(pipelog)
+metrics.results() # returns a dictionary with the results
 ```

-The evaluator manager also offers
+To run evaluation over an existing dataset (BYODataset), you can run the following:

-- `eval_manager.run_metrics()` to run all the metrics defined in the pipeline
-- `eval_manager.run_tests()` to run the tests defined in the pipeline (see the documentation [docs](docs.relari.ai) for more details)
+```python
+dataset = Dataset(...)
+evalrunner = EvaluationRunner(pipeline)
+metrics = evalrunner.evaluate(dataset)
+```

 ## Synthetic Data Generation

@@ -244,6 +244,8 @@ integrations that build on the core are both accepted and highly encouraged! See
 - How important is a Golden Dataset for LLM evaluation?
   [(link)](https://medium.com/relari/how-important-is-a-golden-dataset-for-llm-pipeline-evaluation-4ef6deb14dc5)
 - How to evaluate complex GenAI Apps: a granular approach [(link)](https://medium.com/relari/how-to-evaluate-complex-genai-apps-a-granular-approach-0ab929d5b3e2)
+- How to Make the Most Out of LLM Production Data: Simulated User Feedback [(link)](https://medium.com/towards-data-science/how-to-make-the-most-out-of-llm-production-data-simulated-user-feedback-843c444febc7)
+- Generate Synthetic Data to Test LLM Applications [(link)](https://medium.com/relari/generate-synthetic-data-to-test-llm-applications-4bffeb51b80e)
 - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
 - **Reach out to founders:** [Email](mailto:[email protected]) or [Schedule a chat](https://cal.com/pasquale/continuous-eval)
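
To make the new logging-and-evaluation flow from the README changes above concrete, here is a hedged end-to-end sketch. `PipelineLogger`, `EvaluationRunner`, `log`, `evaluate`, and `results` come from the README snippet itself; the module name `answer_generator`, the `uid`/`question` fields, the `dataset.data` iteration, and `my_rag_app` are illustrative assumptions about a typical pipeline.

```python
# Sketch only: assumes `pipeline` and `dataset` were defined as in the README,
# and that each dataset record carries a unique id and a "question" field.
from continuous_eval.eval import EvaluationRunner
from continuous_eval.eval.logger import PipelineLogger

pipelog = PipelineLogger(pipeline=pipeline)

for datum in dataset.data:                    # hypothetical iteration over samples
    uid = datum["uid"]                        # hypothetical unique-id field
    response = my_rag_app(datum["question"])  # your application code (placeholder)
    pipelog.log(uid=uid, module="answer_generator", value=response)

pipelog.save("eval_log.jsonl")                # optional: persist logs to JSONL

evalrunner = EvaluationRunner(pipeline)
metrics = evalrunner.evaluate(pipelog)
print(metrics.results())                      # dictionary of per-metric results
```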

continuous_eval/data_downloader.py

Lines changed: 2 additions & 2 deletions
@@ -69,8 +69,8 @@ def example_data_downloader(
         out_dir = destination_dir / resource
         return _download_and_extract_zip(EXAMPLES_DATA_URL + res["filename"], out_dir, force_download=force_download)
     elif res["type"] == "chromadb":
-        from langchain.embeddings.openai import OpenAIEmbeddings
-        from langchain.vectorstores import Chroma
+        from langchain_chroma import Chroma
+        from langchain_openai import OpenAIEmbeddings

         out_dir = destination_dir / resource
         _download_and_extract_zip(EXAMPLES_DATA_URL + res["filename"], out_dir, force_download=force_download)
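
Since this hunk only swaps import paths (the deprecated `langchain.*` modules for the split `langchain-chroma` and `langchain-openai` packages), a short sketch of how the new imports are typically wired together may help; the persist directory, query, and `k` value are placeholders, not part of this function.

```python
# Hypothetical usage of the updated imports
# (requires `pip install langchain-chroma langchain-openai` and an OPENAI_API_KEY).
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Open a downloaded Chroma collection with OpenAI embeddings (path is a placeholder)
vectorstore = Chroma(
    persist_directory="path/to/chromadb",
    embedding_function=OpenAIEmbeddings(),
)
retrieved = vectorstore.similarity_search("example query", k=3)
```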

continuous_eval/eval/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@
     CalledTools,
 )
 from continuous_eval.eval.dataset import Dataset
+from continuous_eval.eval.runner import EvaluationRunner

continuous_eval/eval/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -5,11 +5,11 @@

 import yaml

-from continuous_eval.eval.types import UUID, ToolCall
+from continuous_eval.eval.types import UID, ToolCall
 from continuous_eval.eval.utils import type_hint_to_str

 _SAFE_DICT = {k: v for k, v in typing.__dict__.items() if not k.startswith("__")}
-_SAFE_DICT["UUID"] = UUID
+_SAFE_DICT["UID"] = UID
 _SAFE_DICT["ToolCall"] = ToolCall


continuous_eval/eval/logger.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import json
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Any, Optional, Union
+
+from continuous_eval.eval.modules import AgentModule
+from continuous_eval.eval.pipeline import Pipeline
+from continuous_eval.eval.result_types import TOOL_PREFIX
+from continuous_eval.eval.utils import instantiate_type
+from continuous_eval.utils.telemetry import telemetry_event
+
+logger = logging.getLogger("eval-manager")
+Serializable = Any
+
+
+class LogMode(Enum):
+    APPEND = 0
+    REPLACE = 1
+
+
+class PipelineLogger:
+    @telemetry_event("logger")
+    def __init__(self, pipeline: Optional[Pipeline] = None):
+        self._pipeline: Optional[Pipeline] = pipeline
+        self.data = dict()
+
+    @property
+    def pipeline(self) -> Pipeline:
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        return self._pipeline
+
+    def _empty_sample(self):
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        empty_samples = dict()
+        for module in self._pipeline.modules:
+            empty_samples[module.name] = instantiate_type(module.output)
+            if isinstance(module, AgentModule):
+                empty_samples[f"{TOOL_PREFIX}{module.name}"] = list()
+        return empty_samples
+
+    def log(
+        self,
+        uid: Serializable,
+        module: str,
+        value: Any,
+        mode: LogMode = LogMode.REPLACE,
+        **kwargs,
+    ):
+        # Make sure everything looks good
+        assert uid is not None, "UID cannot be None"
+        if self._pipeline is None:
+            raise ValueError("Pipeline not set")
+        if uid not in self.data:
+            self.data[uid] = self._empty_sample()
+        if kwargs and "tool_args" in kwargs:
+            key = f"{TOOL_PREFIX}{module}"
+            self.data[uid][key].append({"name": value, "kwargs": kwargs["tool_args"]})
+        else:
+            if mode == LogMode.REPLACE:
+                self.data[uid][module] = value
+            elif mode == LogMode.APPEND:
+                if isinstance(self.data[uid][module], list):
+                    if isinstance(value, list):
+                        self.data[uid][module].extend(value)
+                    else:
+                        self.data[uid][module].append(value)
+                else:
+                    self.data[uid][module].add(value)
+
+    def save(self, filepath: Union[str, Path]):
+        if isinstance(filepath, str):
+            filepath = Path(filepath)
+        assert filepath.suffix == ".jsonl", "File must be a JSONL file"
+        assert self.data, "No samples to save"
+        with open(filepath, "w") as f:
+            for uid, res in self.data.items():
+                line = {**{"__uid": uid}, **res}
+                json_record = json.dumps(line, ensure_ascii=False)
+                f.write(json_record + "\n")
+
+    def load(self, filepath: Union[str, Path]):
+        if isinstance(filepath, str):
+            filepath = Path(filepath)
+        assert filepath.suffix == ".jsonl", "File must be a JSONL file"
+        with open(filepath, "r") as f:
+            for line in f:
+                record = json.loads(line)
+                uid = record.pop("__uid")
+                self.data[uid] = record
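
As a companion to the class above, a brief usage sketch of the logging flow it implements; `pipeline` is assumed to be a configured `Pipeline`, and the module names, uid, and file path are hypothetical. The tool-call line follows the `tool_args` branch of `log` and assumes the "agent" module is an `AgentModule`.

```python
# Hypothetical usage of PipelineLogger (module names and uid are placeholders).
from continuous_eval.eval.logger import LogMode, PipelineLogger

pipelog = PipelineLogger(pipeline=pipeline)

# Default REPLACE semantics overwrite the module output; APPEND extends list outputs
pipelog.log(uid="sample-1", module="retriever", value=["chunk A", "chunk B"])
pipelog.log(uid="sample-1", module="retriever", value=["chunk C"], mode=LogMode.APPEND)
pipelog.log(uid="sample-1", module="answer_generator", value="Paris")

# Agent modules: passing tool_args stores the call under the tool-prefixed key
pipelog.log(uid="sample-1", module="agent", value="search_web",
            tool_args={"query": "capital of France"})

# Persist and reload as JSONL (the .jsonl suffix is enforced by save/load)
pipelog.save("pipeline_log.jsonl")
pipelog.load("pipeline_log.jsonl")
```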
