Skip to content

Commit b3af83a

Browse files
authored
Evaluations from Langfuse annotations (#168)
* Add placeholder for Langfuse evaluation pipeline * finish porting langfuse evals to the new branch * Add Langfuse tracing and evaluation documentation to README * fix typos * Update deployment section reference in README
1 parent a597a43 commit b3af83a

File tree

12 files changed

+469
-7
lines changed

12 files changed

+469
-7
lines changed

.typos.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ extend-exclude = [
33
"*.json",
44
"*.js",
55
"*.ipynb",
6+
"llm-finetuning/*",
7+
"end-to-end-computer-vision/*",
68
]
79

810
[default.extend-identifiers]
@@ -31,6 +33,12 @@ arange = "arange"
3133
cachable = "cachable"
3234
OT = "OT"
3335
cll = "cll"
36+
Louvre = "Louvre"
37+
quantised = "quantised"
38+
colours = "colours"
39+
initialised = "initialised"
40+
visualisation = "visualisation"
41+
customise = "customise"
3442

3543
[default]
3644
locale = "en-us"

flux-dreambooth/train_dreambooth_lora_flux.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,7 +1977,10 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
19771977
# Predict the noise residual
19781978
model_pred = transformer(
19791979
hidden_states=packed_noisy_model_input,
1980-
# YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
1980+
# YiYi notes: divide it by 1000 for now because we scale it
1981+
# by 1000 in the transformer model (we should not keep it
1982+
# but I want to keep the inputs same for the model for
1983+
# testing)
19811984
timestep=timesteps / 1000,
19821985
guidance=guidance,
19831986
pooled_projections=pooled_prompt_embeds,

huggingface-sagemaker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ This will train a model from Huggingface and register a new ZenML model on the M
214214
Please note the above screens are a cloud-only feature in [ZenML Pro](https://zenml.io/pro), and
215215
the CLI `zenml models list` should be used instead for OSS users.
216216

217-
At the end of the pipeline, the model will also be pushed the Huggingface, and a link estabilished between the ZenML Control Plane and the Huggingface model repository.
217+
At the end of the pipeline, the model will also be pushed to Huggingface, and a link established between the ZenML Control Plane and the Huggingface model repository.
218218

219219
<img src="assets/hf_repo_commit.png" alt="Huggingface Repo" width="800">
220220

llm-complete-guide/README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ Once the pipeline has run successfully, you can query the assets in your vector
130130
using the `--query` flag as well as passing in the model you'd like to
131131
use for the LLM.
132132

133+
Note that you'll need to set the `LANGFUSE_API_KEY` environment variable for the
134+
tracing, which is built into the implementation of the inference. This will
135+
trace all LLM calls and store them in the [Langfuse](https://langfuse.com/)
136+
platform.
137+
133138
When you're ready to make the query, run the following command:
134139

135140
```shell
@@ -197,6 +202,21 @@ python run.py evaluation
197202
You'll need to have first run the RAG pipeline to have the necessary assets in
198203
the database to evaluate.
199204

205+
## RAG evaluation with Langfuse
206+
207+
You can run the Langfuse evaluation pipeline if you have marked some of your
208+
responses as good or bad in the deployed Hugging Face space.
209+
210+
To run the evaluation pipeline, you can use the following command:
211+
212+
```shell
213+
python run.py langfuse_evaluation
214+
```
215+
216+
Note that this pipeline will only work if you have set the `LANGFUSE_API_KEY`
217+
environment variable. It will use this key to fetch the traces from Langfuse and
218+
evaluate the responses.
219+
200220
## Embeddings finetuning
201221

202222
For embeddings finetuning we first generate synthetic data and then finetune the
@@ -292,7 +312,7 @@ The project loosely follows [the recommended ZenML project structure](https://do
292312
├── most_basic_eval.py # Basic evaluation script
293313
├── most_basic_rag_pipeline.py # Basic RAG pipeline script
294314
├── notebooks
295-
│ └── visualise_embeddings.ipynb # Notebook to visualize embeddings
315+
│ └── visualize_embeddings.ipynb # Notebook to visualize embeddings
296316
├── pipelines
297317
│ ├── __init__.py
298318
│ ├── generate_chunk_questions.py # Pipeline to generate chunk questions

llm-complete-guide/configs/dev/rag.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,6 @@ steps:
2828
parameters:
2929
docs_url: https://docs.zenml.io/
3030
use_dev_set: true
31+
index_generator:
32+
parameters:
33+
index_type: postgres

llm-complete-guide/pipelines/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@
2020
from pipelines.llm_basic_rag import llm_basic_rag
2121
from pipelines.llm_eval import llm_eval
2222
from pipelines.llm_index_and_evaluate import llm_index_and_evaluate
23+
from pipelines.llm_langfuse_evals import llm_langfuse_evaluation
2324
from pipelines.rag_deployment import rag_deployment
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from typing import Optional
2+
3+
from steps.eval_langfuse import fast_eval, visualize_fast_eval_results
4+
from zenml import pipeline
5+
6+
7+
@pipeline(enable_cache=False)
8+
def llm_langfuse_evaluation(after: Optional[str] = None) -> None:
9+
results = fast_eval(after=after)
10+
visualize_fast_eval_results(results)
11+
12+
13+
if __name__ == "__main__":
14+
llm_langfuse_evaluation()

llm-complete-guide/run.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
llm_basic_rag,
4949
llm_eval,
5050
llm_index_and_evaluate,
51+
llm_langfuse_evaluation,
5152
rag_deployment,
5253
)
5354
from structures import Document
@@ -76,6 +77,7 @@
7677
"embeddings",
7778
"chunks",
7879
"basic_rag",
80+
"langfuse_evaluation",
7981
]
8082
),
8183
required=True,
@@ -268,6 +270,10 @@ def main(
268270
pipeline_args["enable_cache"] = False
269271
llm_eval.with_options(model=zenml_model, config_path=config_path)()
270272

273+
elif pipeline == "langfuse_evaluation":
274+
pipeline_args["enable_cache"] = False
275+
llm_langfuse_evaluation.with_options(model=zenml_model)()
276+
271277
elif pipeline == "synthetic":
272278
generate_synthetic_data.with_options(
273279
model=zenml_model, config_path=config_path, **pipeline_args

0 commit comments

Comments
 (0)