diff --git a/databricks-production-qa-demo/steps/deployment/deployment_deploy.py b/databricks-production-qa-demo/steps/deployment/deployment_deploy.py
index b7407dcfb..c8220afe9 100644
--- a/databricks-production-qa-demo/steps/deployment/deployment_deploy.py
+++ b/databricks-production-qa-demo/steps/deployment/deployment_deploy.py
@@ -31,10 +31,14 @@
@step(enable_cache=False)
-def deployment_deploy() -> Annotated[
- Optional[DatabricksDeploymentService],
- ArtifactConfig(name="databricks_deployment", is_deployment_artifact=True),
-]:
+def deployment_deploy() -> (
+ Annotated[
+ Optional[DatabricksDeploymentService],
+ ArtifactConfig(
+ name="databricks_deployment", is_deployment_artifact=True
+ ),
+ ]
+):
"""Predictions step.
This is an example of a predictions step that takes the data in and returns
diff --git a/end-to-end-computer-vision/steps/export_label_studio.py b/end-to-end-computer-vision/steps/export_label_studio.py
index 1b47431b6..cf0f96328 100644
--- a/end-to-end-computer-vision/steps/export_label_studio.py
+++ b/end-to-end-computer-vision/steps/export_label_studio.py
@@ -28,7 +28,6 @@
logger = get_logger(__name__)
-
@step(
output_materializers={
LABELED_DATASET_NAME: LabelStudioAnnotationMaterializer
diff --git a/eurorate-predictor/pipelines/training.py b/eurorate-predictor/pipelines/training.py
index f9a5b57f0..a94db33de 100644
--- a/eurorate-predictor/pipelines/training.py
+++ b/eurorate-predictor/pipelines/training.py
@@ -24,7 +24,9 @@
@pipeline
-def ecb_predictor_model_training_pipeline(augmented_dataset_id, mode: str = "develop"):
+def ecb_predictor_model_training_pipeline(
+ augmented_dataset_id, mode: str = "develop"
+):
"""A pipeline to train an XGBoost model and promote it.
Args:
diff --git a/eurorate-predictor/run.py b/eurorate-predictor/run.py
index bee2c1bef..5f4097aa5 100644
--- a/eurorate-predictor/run.py
+++ b/eurorate-predictor/run.py
@@ -101,7 +101,9 @@ def main(
pipeline_args["config_path"] = os.path.join(
config_folder, f"etl_{mode}.yaml"
)
- ecb_predictor_etl_pipeline.with_options(**pipeline_args)(**run_args_etl)
+ ecb_predictor_etl_pipeline.with_options(**pipeline_args)(
+ **run_args_etl
+ )
logger.info("ETL pipeline finished successfully!\n")
# Execute Feature Engineering Pipeline
@@ -126,9 +128,9 @@ def main(
pipeline_args["config_path"] = os.path.join(
config_folder, f"feature_engineering_{mode}.yaml"
)
- ecb_predictor_feature_engineering_pipeline.with_options(**pipeline_args)(
- **run_args_feature
- )
+ ecb_predictor_feature_engineering_pipeline.with_options(
+ **pipeline_args
+ )(**run_args_feature)
logger.info("Feature Engineering pipeline finished successfully!\n")
# Execute Model Training Pipeline
@@ -153,7 +155,9 @@ def main(
pipeline_args["config_path"] = os.path.join(
config_folder, f"training_{mode}.yaml"
)
- ecb_predictor_model_training_pipeline.with_options(**pipeline_args)(**run_args_train)
+ ecb_predictor_model_training_pipeline.with_options(**pipeline_args)(
+ **run_args_train
+ )
logger.info("Model Training pipeline finished successfully!\n")
diff --git a/gamesense/steps/finetune.py b/gamesense/steps/finetune.py
index 3757801eb..5421757d7 100644
--- a/gamesense/steps/finetune.py
+++ b/gamesense/steps/finetune.py
@@ -28,12 +28,11 @@
from utils.loaders import load_base_model
from utils.tokenizer import load_tokenizer
from zenml import ArtifactConfig, step
+from zenml.client import Client
from zenml.enums import ArtifactType
from zenml.logger import get_logger
from zenml.materializers import BuiltInMaterializer
from zenml.utils.cuda_utils import cleanup_gpu_memory
-from zenml.client import Client
-
logger = get_logger(__name__)
diff --git a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
index 33adcf81d..89d7305fe 100644
--- a/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
+++ b/huggingface-sagemaker/steps/deploying/huggingface_deployment.py
@@ -47,9 +47,7 @@ def deploy_to_huggingface(
save_model_to_deploy.entrypoint()
logger.info("Model saved locally. Pushing to HuggingFace...")
- assert secret, (
- "No secret found with name 'huggingface_creds'. Please create one with your `token`."
- )
+ assert secret, "No secret found with name 'huggingface_creds'. Please create one with your `token`."
token = secret.secret_values["token"]
api = HfApi(token=token)
diff --git a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
index 06473701c..93cebad1b 100644
--- a/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
+++ b/huggingface-sagemaker/steps/promotion/promote_get_metrics.py
@@ -27,10 +27,12 @@
@step
-def promote_get_metrics() -> Tuple[
- Annotated[Dict[str, Any], "latest_metrics"],
- Annotated[Dict[str, Any], "current_metrics"],
-]:
+def promote_get_metrics() -> (
+ Tuple[
+ Annotated[Dict[str, Any], "latest_metrics"],
+ Annotated[Dict[str, Any], "current_metrics"],
+ ]
+):
"""Get metrics for comparison for promoting a model.
This is an example of a metric retrieval step. It is used to retrieve
diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py
index 9112a8ccf..f0b3f39ab 100644
--- a/llm-complete-guide/pipelines/llm_eval.py
+++ b/llm-complete-guide/pipelines/llm_eval.py
@@ -17,7 +17,7 @@
from typing import Optional
import click
-from steps.create_prompt import PROMPT, create_prompt
+from steps.create_prompt import PROMPT
from steps.eval_e2e import e2e_evaluation, e2e_evaluation_llm_judged
from steps.eval_retrieval import (
retrieval_evaluation_full,
diff --git a/llm-complete-guide/steps/create_prompt.py b/llm-complete-guide/steps/create_prompt.py
index 61465592a..4c840019c 100644
--- a/llm-complete-guide/steps/create_prompt.py
+++ b/llm-complete-guide/steps/create_prompt.py
@@ -24,6 +24,7 @@
answers. \
"""
+
@step
def create_prompt() -> str:
"""Create a prompt for the RAG pipeline."""
diff --git a/llm-complete-guide/steps/eval_retrieval.py b/llm-complete-guide/steps/eval_retrieval.py
index ebec42b50..bf3594f8a 100644
--- a/llm-complete-guide/steps/eval_retrieval.py
+++ b/llm-complete-guide/steps/eval_retrieval.py
@@ -275,9 +275,9 @@ def perform_small_retrieval_evaluation(use_reranking: bool) -> float:
@step
-def retrieval_evaluation_small() -> Annotated[
- float, "small_failure_rate_retrieval"
-]:
+def retrieval_evaluation_small() -> (
+ Annotated[float, "small_failure_rate_retrieval"]
+):
"""Executes the retrieval evaluation step without reranking.
Returns:
@@ -287,9 +287,9 @@ def retrieval_evaluation_small() -> Annotated[
@step
-def retrieval_evaluation_small_with_reranking() -> Annotated[
- float, "small_failure_rate_retrieval_reranking"
-]:
+def retrieval_evaluation_small_with_reranking() -> (
+ Annotated[float, "small_failure_rate_retrieval_reranking"]
+):
"""Executes the retrieval evaluation step with reranking.
Returns:
diff --git a/llm-complete-guide/steps/eval_visualisation.py b/llm-complete-guide/steps/eval_visualisation.py
index 65b26fd02..d04454e38 100644
--- a/llm-complete-guide/steps/eval_visualisation.py
+++ b/llm-complete-guide/steps/eval_visualisation.py
@@ -12,13 +12,13 @@
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
-from typing import Annotated, Dict, List, Tuple
+from typing import Annotated, Dict, List
import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-from zenml import ArtifactConfig, get_step_context, log_metadata, step
+from zenml import get_step_context, log_metadata, step
from zenml.types import HTMLString
+
def create_plotly_bar_chart(
labels: List[str],
scores: List[float],
@@ -43,54 +43,66 @@ def create_plotly_bar_chart(
"""
# Generate colors for bars
if alternate_colors:
- colors = ["rgba(66, 133, 244, 0.8)" if i % 2 == 0 else "rgba(219, 68, 55, 0.8)" for i in range(len(labels))]
+ colors = [
+ "rgba(66, 133, 244, 0.8)"
+ if i % 2 == 0
+ else "rgba(219, 68, 55, 0.8)"
+ for i in range(len(labels))
+ ]
else:
colors = ["rgba(66, 133, 244, 0.8)" for _ in range(len(labels))]
# Prepare hover text
if descriptions:
- hover_text = [f"{label}
Value: {score:.2f}
{descriptions.get(label, '')}"
- for label, score in zip(labels, scores)]
+ hover_text = [
+ f"{label}
Value: {score:.2f}
{descriptions.get(label, '')}"
+ for label, score in zip(labels, scores)
+ ]
else:
- hover_text = [f"{label}
Value: {score:.2f}" for label, score in zip(labels, scores)]
+ hover_text = [
+ f"{label}
Value: {score:.2f}"
+ for label, score in zip(labels, scores)
+ ]
# Create figure
fig = go.Figure()
-
+
fig.add_trace(
go.Bar(
y=labels,
x=scores,
- orientation='h',
+ orientation="h",
marker_color=colors,
text=[f"{score:.2f}" for score in scores],
- textposition='auto',
+ textposition="auto",
hovertext=hover_text,
- hoverinfo='text',
+ hoverinfo="text",
)
)
# Set layout
max_value = max(scores) if scores else 5
- xaxis_range = [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)]
+ xaxis_range = (
+ [0, 100] if percentage_scale else [0, max(5, max_value * 1.1)]
+ )
xaxis_title = "Percentage (%)" if percentage_scale else "Score"
-
+
fig.update_layout(
title=title,
xaxis=dict(
title=xaxis_title,
range=xaxis_range,
showgrid=True,
- gridcolor='rgba(230, 230, 230, 0.8)',
+ gridcolor="rgba(230, 230, 230, 0.8)",
),
yaxis=dict(
autorange="reversed", # Make labels read top-to-bottom
),
margin=dict(l=20, r=20, t=60, b=20),
height=max(300, 70 * len(labels)),
- plot_bgcolor='rgba(255, 255, 255, 1)',
+ plot_bgcolor="rgba(255, 255, 255, 1)",
)
-
+
return fig
@@ -122,58 +134,49 @@ def generate_evaluation_html(
"""
# Metric descriptions for hovering
metric_descriptions = {
- "Small Retrieval Eval Failure Rate":
- "Percentage of small test cases where retrieval failed to find relevant documents.",
- "Small Retrieval Eval Failure Rate Reranking":
- "Percentage of small test cases where retrieval with reranking failed to find relevant documents.",
- "Full Retrieval Eval Failure Rate":
- "Percentage of all test cases where retrieval failed to find relevant documents.",
- "Full Retrieval Eval Failure Rate Reranking":
- "Percentage of all test cases where retrieval with reranking failed to find relevant documents.",
- "Failure Rate Bad Answers":
- "Percentage of responses that were factually incorrect or misleading.",
- "Failure Rate Bad Immediate Responses":
- "Percentage of immediate responses that did not adequately address the query.",
- "Failure Rate Good Responses":
- "Percentage of responses rated as good by evaluators.",
- "Average Toxicity Score":
- "Average score measuring harmful, offensive, or inappropriate content (lower is better).",
- "Average Faithfulness Score":
- "Average score measuring how accurately the response represents the source material (higher is better).",
- "Average Helpfulness Score":
- "Average score measuring the practical utility of responses to users (higher is better).",
- "Average Relevance Score":
- "Average score measuring how well responses address the specific query intent (higher is better).",
+ "Small Retrieval Eval Failure Rate": "Percentage of small test cases where retrieval failed to find relevant documents.",
+ "Small Retrieval Eval Failure Rate Reranking": "Percentage of small test cases where retrieval with reranking failed to find relevant documents.",
+ "Full Retrieval Eval Failure Rate": "Percentage of all test cases where retrieval failed to find relevant documents.",
+ "Full Retrieval Eval Failure Rate Reranking": "Percentage of all test cases where retrieval with reranking failed to find relevant documents.",
+ "Failure Rate Bad Answers": "Percentage of responses that were factually incorrect or misleading.",
+ "Failure Rate Bad Immediate Responses": "Percentage of immediate responses that did not adequately address the query.",
+ "Failure Rate Good Responses": "Percentage of responses rated as good by evaluators.",
+ "Average Toxicity Score": "Average score measuring harmful, offensive, or inappropriate content (lower is better).",
+ "Average Faithfulness Score": "Average score measuring how accurately the response represents the source material (higher is better).",
+ "Average Helpfulness Score": "Average score measuring the practical utility of responses to users (higher is better).",
+ "Average Relevance Score": "Average score measuring how well responses address the specific query intent (higher is better).",
}
# Create individual charts
retrieval_fig = create_plotly_bar_chart(
- retrieval_labels,
- retrieval_scores,
- f"Retrieval Evaluation Metrics",
+ retrieval_labels,
+ retrieval_scores,
+ f"Retrieval Evaluation Metrics",
alternate_colors=True,
- descriptions=metric_descriptions
+ descriptions=metric_descriptions,
)
-
+
generation_basic_fig = create_plotly_bar_chart(
- generation_basic_labels,
- generation_basic_scores,
- f"Basic Generation Metrics",
+ generation_basic_labels,
+ generation_basic_scores,
+ f"Basic Generation Metrics",
percentage_scale=True,
- descriptions=metric_descriptions
+ descriptions=metric_descriptions,
)
-
+
generation_quality_fig = create_plotly_bar_chart(
- generation_quality_labels,
- generation_quality_scores,
+ generation_quality_labels,
+ generation_quality_scores,
f"Generation Quality Metrics",
- descriptions=metric_descriptions
+ descriptions=metric_descriptions,
)
# Create summary metrics cards
composite_quality = metrics_metadata.get("composite.overall_quality", 0)
- retrieval_effectiveness = metrics_metadata.get("composite.retrieval_effectiveness", 0)
-
+ retrieval_effectiveness = metrics_metadata.get(
+ "composite.retrieval_effectiveness", 0
+ )
+
# Combine into complete HTML report
html = f"""
@@ -388,7 +391,7 @@ def generate_evaluation_html(