 import matplotlib.pyplot as plt
 import numpy as np
 from PIL import Image
-from zenml import ArtifactConfig, get_step_context, step
+from zenml import ArtifactConfig, get_step_context, step, log_metadata


 def create_image(
@@ -124,7 +124,7 @@ def visualize_evaluation_results(
     Annotated[Image.Image, ArtifactConfig(name="generation_eval_full")],
 ]:
     """
-    Visualize the evaluation results by creating three separate images.
+    Visualize the evaluation results by creating three separate images and logging metrics.

     Args:
         small_retrieval_eval_failure_rate (float): Small retrieval evaluation failure rate.
@@ -145,6 +145,38 @@ def visualize_evaluation_results(
     step_context = get_step_context()
     pipeline_run_name = step_context.pipeline_run.name

+    # Log all metrics as metadata for dashboard visualization
+    log_metadata(
+        metadata={
+            # Retrieval metrics
+            "retrieval.small_failure_rate": small_retrieval_eval_failure_rate,
+            "retrieval.small_failure_rate_reranking": small_retrieval_eval_failure_rate_reranking,
+            "retrieval.full_failure_rate": full_retrieval_eval_failure_rate,
+            "retrieval.full_failure_rate_reranking": full_retrieval_eval_failure_rate_reranking,
+            # Generation failure metrics
+            "generation.failure_rate_bad_answers": failure_rate_bad_answers,
+            "generation.failure_rate_bad_immediate": failure_rate_bad_immediate_responses,
+            "generation.failure_rate_good": failure_rate_good_responses,
+            # Quality metrics
+            "quality.toxicity": average_toxicity_score,
+            "quality.faithfulness": average_faithfulness_score,
+            "quality.helpfulness": average_helpfulness_score,
+            "quality.relevance": average_relevance_score,
+            # Composite scores
+            "composite.overall_quality": (
+                average_faithfulness_score
+                + average_helpfulness_score
+                + average_relevance_score
+            )
+            / 3,
+            "composite.retrieval_effectiveness": (
+                (1 - small_retrieval_eval_failure_rate)
+                + (1 - full_retrieval_eval_failure_rate)
+            )
+            / 2,
+        }
+    )
+
     normalized_scores = [
         score / 20
         for score in [
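
Note: calling log_metadata with only a metadata dict from inside a running step, as this commit does, associates the values with the current step run so they surface in the ZenML dashboard. The sketch below isolates that pattern in a minimal step; the step name, metric names, and values are hypothetical placeholders, and the dotted key prefixes are just a naming convention for grouping, not a ZenML requirement.

# Minimal sketch of the metadata-logging pattern introduced above.
# Assumes a recent ZenML version that exports log_metadata from the top-level package,
# as the import in this commit does.
from zenml import log_metadata, step


@step
def toy_eval_metrics_step() -> None:
    # Hypothetical evaluation results standing in for the real failure rates and scores.
    failure_rate = 0.12   # fraction of retrieval queries that missed the expected document
    helpfulness = 4.1     # LLM-judged helpfulness score

    # Called inside a step, log_metadata attaches these key/value pairs to the current step run.
    log_metadata(
        metadata={
            "retrieval.failure_rate": failure_rate,
            "quality.helpfulness": helpfulness,
            "composite.retrieval_effectiveness": 1 - failure_rate,
        }
    )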