Skip to content

Commit b3af83a

Browse files
authored
Evaluations from Langfuse annotations (#168)
* Add placeholder for Langfuse evaluation pipeline * finish porting langfuse evals to the new branch * Add Langfuse tracing and evaluation documentation to README * fix typos * Update deployment section reference in README
1 parent a597a43 commit b3af83a

File tree

12 files changed

+469
-7
lines changed

12 files changed

+469
-7
lines changed

.typos.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ extend-exclude = [
33
"*.json",
44
"*.js",
55
"*.ipynb",
6+
"llm-finetuning/*",
7+
"end-to-end-computer-vision/*",
68
]
79

810
[default.extend-identifiers]
@@ -31,6 +33,12 @@ arange = "arange"
3133
cachable = "cachable"
3234
OT = "OT"
3335
cll = "cll"
36+
Louvre = "Louvre"
37+
quantised = "quantised"
38+
colours = "colours"
39+
initialised = "initialised"
40+
visualisation = "visualisation"
41+
customise = "customise"
3442

3543
[default]
3644
locale = "en-us"

flux-dreambooth/train_dreambooth_lora_flux.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,7 +1977,10 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
19771977
# Predict the noise residual
19781978
model_pred = transformer(
19791979
hidden_states=packed_noisy_model_input,
1980-
# YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
1980+
# YiYi notes: divide it by 1000 for now because we scale it
1981+
# by 1000 in the transformer model (we should not keep it
1982+
# but I want to keep the inputs same for the model for
1983+
# testing)
19811984
timestep=timesteps / 1000,
19821985
guidance=guidance,
19831986
pooled_projections=pooled_prompt_embeds,

huggingface-sagemaker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ This will train a model from Huggingface and register a new ZenML model on the M
214214
Please note the above screens are a cloud-only feature in [ZenML Pro](https://zenml.io/pro), and
215215
the CLI `zenml models list` should be used instead for OSS users.
216216

217-
At the end of the pipeline, the model will also be pushed the Huggingface, and a link estabilished between the ZenML Control Plane and the Huggingface model repository.
217+
At the end of the pipeline, the model will also be pushed to Huggingface, and a link established between the ZenML Control Plane and the Huggingface model repository.
218218

219219
<img src="assets/hf_repo_commit.png" alt="Huggingface Repo" width="800">
220220

llm-complete-guide/README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ Once the pipeline has run successfully, you can query the assets in your vector
130130
using the `--query` flag as well as passing in the model you'd like to
131131
use for the LLM.
132132

133+
Note that you'll need to set the `LANGFUSE_API_KEY` environment variable for the
134+
tracing, which is built into the implementation of the inference. This will
135+
trace all LLM calls and store them in the [Langfuse](https://langfuse.com/)
136+
platform.
137+
133138
When you're ready to make the query, run the following command:
134139

135140
```shell
@@ -197,6 +202,21 @@ python run.py evaluation
197202
You'll need to have first run the RAG pipeline to have the necessary assets in
198203
the database to evaluate.
199204

205+
## RAG evaluation with Langfuse
206+
207+
You can run the Langfuse evaluation pipeline if you have marked some of your
208+
responses as good or bad in the deployed Hugging Face space.
209+
210+
To run the evaluation pipeline, you can use the following command:
211+
212+
```shell
213+
python run.py langfuse_evaluation
214+
```
215+
216+
Note that this pipeline will only work if you have set the `LANGFUSE_API_KEY`
217+
environment variable. It will use this key to fetch the traces from Langfuse and
218+
evaluate the responses.
219+
200220
## Embeddings finetuning
201221

202222
For embeddings finetuning we first generate synthetic data and then finetune the
@@ -292,7 +312,7 @@ The project loosely follows [the recommended ZenML project structure](https://do
292312
├── most_basic_eval.py # Basic evaluation script
293313
├── most_basic_rag_pipeline.py # Basic RAG pipeline script
294314
├── notebooks
295-
│ └── visualise_embeddings.ipynb # Notebook to visualize embeddings
315+
│ └── visualize_embeddings.ipynb # Notebook to visualize embeddings
296316
├── pipelines
297317
│ ├── __init__.py
298318
│ ├── generate_chunk_questions.py # Pipeline to generate chunk questions

llm-complete-guide/configs/dev/rag.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,6 @@ steps:
2828
parameters:
2929
docs_url: https://docs.zenml.io/
3030
use_dev_set: true
31+
index_generator:
32+
parameters:
33+
index_type: postgres

llm-complete-guide/pipelines/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@
2020
from pipelines.llm_basic_rag import llm_basic_rag
2121
from pipelines.llm_eval import llm_eval
2222
from pipelines.llm_index_and_evaluate import llm_index_and_evaluate
23+
from pipelines.llm_langfuse_evals import llm_langfuse_evaluation
2324
from pipelines.rag_deployment import rag_deployment
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from typing import Optional
2+
3+
from steps.eval_langfuse import fast_eval, visualize_fast_eval_results
4+
from zenml import pipeline
5+
6+
7+
@pipeline(enable_cache=False)
8+
def llm_langfuse_evaluation(after: Optional[str] = None) -> None:
9+
results = fast_eval(after=after)
10+
visualize_fast_eval_results(results)
11+
12+
13+
if __name__ == "__main__":
14+
llm_langfuse_evaluation()

llm-complete-guide/run.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
llm_basic_rag,
4949
llm_eval,
5050
llm_index_and_evaluate,
51+
llm_langfuse_evaluation,
5152
rag_deployment,
5253
)
5354
from structures import Document
@@ -76,6 +77,7 @@
7677
"embeddings",
7778
"chunks",
7879
"basic_rag",
80+
"langfuse_evaluation",
7981
]
8082
),
8183
required=True,
@@ -268,6 +270,10 @@ def main(
268270
pipeline_args["enable_cache"] = False
269271
llm_eval.with_options(model=zenml_model, config_path=config_path)()
270272

273+
elif pipeline == "langfuse_evaluation":
274+
pipeline_args["enable_cache"] = False
275+
llm_langfuse_evaluation.with_options(model=zenml_model)()
276+
271277
elif pipeline == "synthetic":
272278
generate_synthetic_data.with_options(
273279
model=zenml_model, config_path=config_path, **pipeline_args

0 commit comments

Comments
 (0)