Commit 1beb378

Adding tags support (#42125)
* Initial draft
* update tests
* address PR comments
* run tox black
* Update the samples
* fix formatting using tox black
* address PR comments
* address PR comment
* address PR comments
* Trigger build pipeline
* address PR comments
* Update changelog.md for evaluation
* address PR comment
* update the _version.py
1 parent 8a42e73 commit 1beb378

File tree

11 files changed: +768 -56 lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -1,5 +1,12 @@
 # Release History
 
+## 1.11.0 (Unreleased)
+
+### Features Added
+
+- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
+
+
 ## 1.10.0 (2025-07-31)
 
 ### Breaking Changes
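To illustrate the new parameter described in this entry, a minimal call could look like the sketch below; the endpoint, deployment, data path, and tag values are placeholders rather than part of this commit:

from azure.ai.evaluation import evaluate, CoherenceEvaluator

# Placeholder model configuration; replace with real values.
model_config = {
    "azure_endpoint": "<your-azure-openai-endpoint>",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}

result = evaluate(
    data="evaluation_data.jsonl",
    evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
    azure_ai_project="<your-project-endpoint>",
    # Tags must be string key-value pairs; they are attached to the logged evaluation run.
    tags={"experiment": "baseline", "model": "gpt-4o-mini"},
)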

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py

Lines changed: 14 additions & 1 deletion
@@ -81,6 +81,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
     :type promptflow_run: Optional[promptflow._sdk.entities.Run]
+    :param tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+    :type tags: Optional[Dict[str, str]]
     """
 
     _MAX_RETRIES = 5
@@ -98,6 +100,7 @@ def __init__(
         workspace_name: str,
         management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
+        tags: Optional[Dict[str, str]] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
@@ -107,6 +110,7 @@ def __init__(
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
+        self._tags = tags or {}
         self._status = RunStatus.NOT_STARTED
         self._url_base: Optional[str] = None
         self._info: Optional[RunInfo] = None
@@ -173,11 +177,20 @@ def _start_run(self) -> None:
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
+
+            # Prepare tags: start with user tags, ensure mlflow.user is set
+            run_tags = self._tags.copy()
+            if "mlflow.user" not in run_tags:
+                run_tags["mlflow.user"] = "azure-ai-evaluation"
+
+            # Convert tags to MLflow format
+            tags_list = [{"key": key, "value": value} for key, value in run_tags.items()]
+
             body = {
                 "experiment_id": "0",
                 "user_id": "azure-ai-evaluation",
                 "start_time": int(time.time() * 1000),
-                "tags": [{"key": "mlflow.user", "value": "azure-ai-evaluation"}],
+                "tags": tags_list,
             }
             if self._run_name:
                 body["run_name"] = self._run_name

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 13 additions & 4 deletions
@@ -464,7 +464,7 @@ def _validate_columns_for_evaluators(
         )
 
 
-def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
     if data is None:
         msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
@@ -725,6 +725,7 @@ def evaluate(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -757,6 +758,10 @@ def evaluate(
         Defaults to false, which means that evaluations will continue regardless of failures.
         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+        Keys and values must be strings. For more information about tag limits, see:
+        https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+    :paramtype tags: Optional[Dict[str, str]]
     :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
     :paramtype user_agent: Optional[str]
     :return: Evaluation results.
@@ -793,6 +798,7 @@ def evaluate(
             azure_ai_project=azure_ai_project,
             output_path=output_path,
             fail_on_evaluator_errors=fail_on_evaluator_errors,
+            tags=tags,
             **kwargs,
         )
     except Exception as e:
@@ -861,6 +867,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
@@ -877,6 +884,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
         fail_on_evaluator_errors=fail_on_evaluator_errors,
+        tags=tags,
         **kwargs,
     )
 
@@ -956,15 +964,15 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     name_map = _map_names_to_builtins(evaluators, graders)
     if is_onedp_project(azure_ai_project):
         studio_url = _log_metrics_and_instance_results_onedp(
-            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
         )
     else:
         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
         trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
         studio_url = None
         if trace_destination:
             studio_url = _log_metrics_and_instance_results(
-                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
             )
 
     result_df_dict = results_df.to_dict("records")
@@ -985,14 +993,15 @@ def _preprocess_data(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
 
     input_data_df = _validate_and_load_data(
-        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
     )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
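Because the docstring requires string keys and values and Azure ML caps the number of tags per run, callers may want to assemble the dictionary programmatically before calling evaluate(). A minimal sketch under those assumptions; the helper name, tag names, and git lookup are illustrative and not part of this commit:

import datetime
import subprocess
from typing import Dict


def build_run_tags(experiment: str, variant: str) -> Dict[str, str]:
    """Illustrative helper: gather run metadata as string-valued tags for evaluate(..., tags=...)."""
    try:
        git_sha = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, check=False,
        ).stdout.strip() or "unknown"
    except OSError:
        git_sha = "unknown"
    return {
        "experiment": experiment,  # groups related runs for later filtering
        "variant": variant,        # e.g. "A" or "B" when A/B testing prompts or models
        "git_sha": git_sha,
        "started_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    }


# Example: evaluate(data="data.jsonl", evaluators={...}, tags=build_run_tags("coherence_baseline", "A"))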

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 4 additions & 0 deletions
@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )
 
@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.10.0"
+VERSION = "1.11.0"

sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py

Lines changed: 61 additions & 7 deletions
@@ -9,22 +9,31 @@
 This sample shows how to:
 1. Configure an Azure OpenAI model for grading
 2. Create a score model grader with custom prompts
-3. Run evaluation using the evaluate() method
+3. Run evaluation using the evaluate() method with both foundry and hub-based projects
 4. Interpret continuous scoring results
 
 Prerequisites:
 - Azure OpenAI resource with API key and endpoint
 - Model deployment (e.g., gpt-4, gpt-4o-mini)
 - Sample conversation data in JSONL format
 - Environment variables configured in .env file
+- Azure AI project configuration (either foundry-based or hub-based)
+
+Azure AI Project Configuration Options:
+1. Foundry-based project (recommended):
+   - AZURE_AI_PROJECT_ENDPOINT
+2. Hub-based project (legacy):
+   - AZURE_SUBSCRIPTION_ID
+   - AZURE_RESOURCE_GROUP_NAME
+   - AZURE_PROJECT_NAME
 """
 
 import json
 import os
 from dotenv import load_dotenv
 import pandas as pd
 from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader
-from azure.ai.evaluation import AzureOpenAIModelConfiguration
+from azure.ai.evaluation import AzureOpenAIModelConfiguration, AzureAIProject
 
 # Load environment variables
 load_dotenv()
@@ -141,6 +150,37 @@ def create_sample_data() -> str:
     return filename
 
 
+def get_azure_ai_project():
+    """
+    Get Azure AI project configuration based on available environment variables.
+
+    Returns either:
+    1. Foundry-based project (preferred): Uses AZURE_AI_PROJECT_ENDPOINT
+    2. Hub-based project (legacy): Uses subscription_id, resource_group_name, project_name
+    """
+    # Try foundry-based project first (newer approach)
+    foundry_endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT")
+    if foundry_endpoint:
+        print("✅ Using foundry-based Azure AI project")
+        return foundry_endpoint
+
+    # Fall back to hub-based project (legacy approach)
+    subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID")
+    resource_group = os.environ.get("AZURE_RESOURCE_GROUP_NAME")
+    project_name = os.environ.get("AZURE_PROJECT_NAME")
+
+    if subscription_id and resource_group and project_name:
+        print("✅ Using hub-based Azure AI project (legacy)")
+        return AzureAIProject(
+            subscription_id=subscription_id,
+            resource_group_name=resource_group,
+            project_name=project_name,
+        )
+
+    print("⚠️ No Azure AI project configuration found")
+    return None
+
+
 def demonstrate_score_model_grader():
     """Demonstrate the AzureOpenAIScoreModelGrader usage with real credentials."""
 
@@ -160,7 +200,15 @@ def demonstrate_score_model_grader():
 
     print("✅ Model configuration loaded successfully")
 
-    # 2. Create conversation quality grader
+    # 2. Get Azure AI project configuration (supports both foundry and hub-based projects)
+    azure_ai_project = get_azure_ai_project()
+    if not azure_ai_project:
+        print("❌ No Azure AI project configuration found. Please set either:")
+        print("   - AZURE_AI_PROJECT_ENDPOINT (for foundry-based projects), or")
+        print("   - AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP_NAME, AZURE_PROJECT_NAME (for hub-based projects)")
+        return
+
+    # 3. Create conversation quality grader
     conversation_quality_grader = AzureOpenAIScoreModelGrader(
         model_config=model_config,
         name="Conversation Quality Assessment",
@@ -192,16 +240,22 @@ def demonstrate_score_model_grader():
 
     print("✅ Conversation quality grader created successfully")
 
-    # 3. Run evaluation with the score model grader
+    # 4. Run evaluation with the score model grader
    print("\n🚀 Running evaluation with score model grader...")
-
     result = evaluate(
         data=data_file,
         evaluators={"conversation_quality": conversation_quality_grader},
-        azure_ai_project=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
+        azure_ai_project=azure_ai_project,
+        tags={
+            "grader_type": "score_model",
+            "model": "gpt-4o-mini",
+            "evaluation_focus": "conversation_quality",
+            "sample_size": "demo",
+            "automation_level": "full",
+        },
     )
 
-    # 4. Display results
+    # 5. Display results
     print("\n=== Evaluation Results ===")
     print(f"Total samples evaluated: {len(result['rows'])}")
 
207261

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_evaluate.py

Lines changed: 40 additions & 27 deletions
@@ -62,6 +62,13 @@ def evaluation_evaluate_classes_methods(self):
                     },
                 },
             },
+            # Example of using tags for tracking and organization
+            tags={
+                "experiment": "basic_evaluation",
+                "model": "gpt-4",
+                "dataset": "sample_qa_data",
+                "environment": "development",
+            },
         )
 
         # [END evaluate_method]
@@ -363,23 +370,6 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END similarity_evaluator]
 
-        # [START completeness_evaluator]
-        import os
-        from azure.ai.evaluation import CompletenessEvaluator
-
-        model_config = {
-            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-        }
-
-        completeness_eval = CompletenessEvaluator(model_config=model_config)
-        completeness_eval(
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital.",
-        )
-        # [END completeness_evaluator]
-
         # [START task_adherence_evaluator]
         import os
         from azure.ai.evaluation import TaskAdherenceEvaluator
@@ -509,19 +499,19 @@ def evaluation_evaluate_classes_methods(self):
         from azure.ai.evaluation import DocumentRetrievalEvaluator
 
         retrieval_ground_truth = [
-            {"document_id": "1", "query_relevance_judgement": 4},
-            {"document_id": "2", "query_relevance_judgement": 2},
-            {"document_id": "3", "query_relevance_judgement": 3},
-            {"document_id": "4", "query_relevance_judgement": 1},
-            {"document_id": "5", "query_relevance_judgement": 0},
+            {"document_id": "1", "query_relevance_label": 4},
+            {"document_id": "2", "query_relevance_label": 2},
+            {"document_id": "3", "query_relevance_label": 3},
+            {"document_id": "4", "query_relevance_label": 1},
+            {"document_id": "5", "query_relevance_label": 0},
         ]
 
         retrieved_documents = [
-            {"document_id": "2", "query_relevance_judgement": 45.1},
-            {"document_id": "6", "query_relevance_judgement": 35.8},
-            {"document_id": "3", "query_relevance_judgement": 29.2},
-            {"document_id": "5", "query_relevance_judgement": 25.4},
-            {"document_id": "7", "query_relevance_judgement": 18.8},
+            {"document_id": "2", "relevance_score": 45.1},
+            {"document_id": "6", "relevance_score": 35.8},
+            {"document_id": "3", "relevance_score": 29.2},
+            {"document_id": "5", "relevance_score": 25.4},
+            {"document_id": "7", "relevance_score": 18.8},
         ]
 
         document_retrieval_evaluator = DocumentRetrievalEvaluator()
@@ -530,6 +520,29 @@ def evaluation_evaluate_classes_methods(self):
         )
         # [END document_retrieval_evaluator]
 
+        # [START evaluate_with_tags_examples]
+        evaluate(
+            data=path,
+            evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
+            evaluator_config={
+                "coherence": {
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
+            azure_ai_project=azure_ai_project,
+            tags={
+                "experiment_name": "coherence_baseline",
+                "model_version": "gpt-4-0613",
+                "dataset_version": "v1.2",
+                "researcher": "data_science_team",
+                "cost_center": "ai_research",
+            },
+        )
+        # [END evaluate_with_tags_examples]
+
 
 if __name__ == "__main__":
     from dotenv import load_dotenv
