Commit 969faa8

Add metadata logging for comprehensive evaluation metrics
Enhance the evaluation visualization step by logging detailed metrics to ZenML, including:
- Retrieval performance metrics
- Generation failure rates
- Quality scores (toxicity, faithfulness, helpfulness, relevance)
- Composite scores for overall quality and retrieval effectiveness
1 parent 4e2aaa3 commit 969faa8
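
For a concrete sense of the two composite scores listed above (the actual change appears in the diff below), here is a minimal sketch of the arithmetic with made-up inputs; the variable names mirror the step's parameters, and the values are purely illustrative, not taken from a real run:

# Illustrative values only -- not from an actual evaluation run, and on
# whatever scale the pipeline uses for its quality scores.
average_faithfulness_score = 4.2
average_helpfulness_score = 3.8
average_relevance_score = 4.0
small_retrieval_eval_failure_rate = 0.10  # 10% of small-set retrieval checks failed
full_retrieval_eval_failure_rate = 0.20   # 20% of full-set retrieval checks failed

# Mean of the three judged quality scores.
overall_quality = (
    average_faithfulness_score
    + average_helpfulness_score
    + average_relevance_score
) / 3  # -> 4.0

# Mean success rate across the small and full retrieval evaluations.
retrieval_effectiveness = (
    (1 - small_retrieval_eval_failure_rate)
    + (1 - full_retrieval_eval_failure_rate)
) / 2  # -> 0.85

print(overall_quality, retrieval_effectiveness)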

File tree

1 file changed: +34, -2 lines

llm-complete-guide/steps/eval_visualisation.py

Lines changed: 34 additions & 2 deletions
@@ -18,7 +18,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from PIL import Image
-from zenml import ArtifactConfig, get_step_context, step
+from zenml import ArtifactConfig, get_step_context, step, log_metadata


 def create_image(
@@ -124,7 +124,7 @@ def visualize_evaluation_results(
     Annotated[Image.Image, ArtifactConfig(name="generation_eval_full")],
 ]:
     """
-    Visualize the evaluation results by creating three separate images.
+    Visualize the evaluation results by creating three separate images and logging metrics.

     Args:
         small_retrieval_eval_failure_rate (float): Small retrieval evaluation failure rate.
@@ -145,6 +145,38 @@ def visualize_evaluation_results(
     step_context = get_step_context()
     pipeline_run_name = step_context.pipeline_run.name

+    # Log all metrics as metadata for dashboard visualization
+    log_metadata(
+        metadata={
+            # Retrieval metrics
+            "retrieval.small_failure_rate": small_retrieval_eval_failure_rate,
+            "retrieval.small_failure_rate_reranking": small_retrieval_eval_failure_rate_reranking,
+            "retrieval.full_failure_rate": full_retrieval_eval_failure_rate,
+            "retrieval.full_failure_rate_reranking": full_retrieval_eval_failure_rate_reranking,
+            # Generation failure metrics
+            "generation.failure_rate_bad_answers": failure_rate_bad_answers,
+            "generation.failure_rate_bad_immediate": failure_rate_bad_immediate_responses,
+            "generation.failure_rate_good": failure_rate_good_responses,
+            # Quality metrics
+            "quality.toxicity": average_toxicity_score,
+            "quality.faithfulness": average_faithfulness_score,
+            "quality.helpfulness": average_helpfulness_score,
+            "quality.relevance": average_relevance_score,
+            # Composite scores
+            "composite.overall_quality": (
+                average_faithfulness_score
+                + average_helpfulness_score
+                + average_relevance_score
+            )
+            / 3,
+            "composite.retrieval_effectiveness": (
+                (1 - small_retrieval_eval_failure_rate)
+                + (1 - full_retrieval_eval_failure_rate)
+            )
+            / 2,
+        }
+    )
+
     normalized_scores = [
         score / 20
         for score in [
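
Not part of this commit, but for context: once the step has run, the logged metadata can be read back through the ZenML client along roughly these lines. This is a sketch under assumptions; the step invocation id and the run_metadata attribute follow recent ZenML releases and may differ in your version, and the run name is a placeholder:

# Sketch only: reading the metadata logged by this step back out of ZenML.
# The run name, step invocation id, and the `run_metadata` attribute are
# assumptions based on recent ZenML releases, not something this commit defines.
from zenml.client import Client

run = Client().get_pipeline_run("<your_pipeline_run_name>")  # placeholder run name
step_run = run.steps["visualize_evaluation_results"]         # assumed invocation id
for key, value in step_run.run_metadata.items():
    print(f"{key}: {value}")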
