
Commit 6d62111

redteam aoai instance results updates v3 (#43256)
* aoai instance results updates v3
* add label to RedTeamRunOutputItemResult
* reformatting
1 parent 839fb9d commit 6d62111

3 files changed: +47 -23 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_mlflow_integration.py

Lines changed: 0 additions & 5 deletions
@@ -204,11 +204,6 @@ async def log_redteam_results_to_mlflow(
             raise ValueError("aoai_summary parameter is required but was not provided")

             payload = dict(aoai_summary)  # Make a copy
-            # Ensure conversations are included for scan output
-            if "conversations" not in payload:
-                payload["conversations"] = (
-                    redteam_result.attack_details or redteam_result.scan_result.get("attack_details") or []
-                )
             json.dump(payload, f)

             # Save legacy format as instance_results.json

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py

Lines changed: 10 additions & 3 deletions
@@ -271,19 +271,23 @@ class EvaluationRunOutputItemMessage(TypedDict, total=False):

 @experimental
 class RedTeamRunOutputItemResult(TypedDict, total=False):
-    """Flattened evaluation result for a single risk category."""
+    """Flattened evaluation result for a single risk category.
+
+    :param label: String label "pass" or "fail" that aligns with the passed field
+    :type label: Optional[str]
+    """

     # Should extend EvaluationRunOutputItemResult

     object: str
     type: str
     name: str
     passed: Optional[bool]
+    label: Optional[str]
     score: Optional[float]
     metric: Optional[str]
     threshold: Optional[float]
     reason: Optional[str]
-    sample: "RedTeamRunOutputItemSample"
     properties: RedTeamOutputResultProperties


@@ -376,18 +380,21 @@ class ResultCount(TypedDict):


 @experimental
-class PerTestingCriteriaResult(TypedDict):
+class PerTestingCriteriaResult(TypedDict, total=False):
     """Result count for a specific testing criteria.

     :param testing_criteria: The name of the testing criteria (e.g., risk category)
     :type testing_criteria: str
+    :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
+    :type attack_strategy: Optional[str]
     :param passed: Number of passed results for this criteria
     :type passed: int
     :param failed: Number of failed results for this criteria
     :type failed: int
     """

     testing_criteria: str
+    attack_strategy: Optional[str]
     passed: int
     failed: int

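For orientation, a minimal standalone sketch of dictionaries matching the updated shapes. The simplified TypedDict re-declaration and all field values below are hypothetical illustrations, not SDK code:

from typing import Optional, TypedDict


class PerTestingCriteriaResult(TypedDict, total=False):
    # Simplified re-declaration of the type in the diff above.
    testing_criteria: str
    attack_strategy: Optional[str]
    passed: int
    failed: int


# Risk-category row: attack_strategy omitted, which total=False now permits.
by_category: PerTestingCriteriaResult = {"testing_criteria": "violence", "passed": 8, "failed": 2}

# Attack-strategy summary row: the strategy name is mirrored into both fields.
by_strategy: PerTestingCriteriaResult = {
    "testing_criteria": "flip",
    "attack_strategy": "flip",
    "passed": 5,
    "failed": 1,
}

# A flattened result entry now carries a string label aligned with passed
# and no longer embeds a per-result "sample" payload.
result_entry = {"name": "violence", "passed": True, "label": "pass", "score": 0.0}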

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 37 additions & 15 deletions
@@ -375,7 +375,6 @@ def to_red_team_result(
             output_items=ordered_output_items,
             eval_run=eval_run,
             red_team_info=red_team_info,
-            include_conversations=False,
             scan_name=scan_name,
             run_id_override=run_id_override,
             eval_id_override=eval_id_override,
@@ -413,7 +412,7 @@ def _build_output_item(
         results = self._build_output_result(
             conversation,
             eval_row,
-            sample_payload=sample_payload,
+            sample_payload=None,
         )
         output_item_id = self._resolve_output_item_id(
             eval_row, datasource_item_id, conversation_key, conversation_index
@@ -431,6 +430,7 @@ def _build_output_item(
             "id": output_item_id,
             "created_time": created_time,
             "status": status,
+            "sample": sample_payload,
             "results": results,
         }

@@ -584,6 +584,7 @@ def _build_output_result(
             "name": risk_value,
             "metric": risk_value,
             "passed": passed,
+            "label": "pass" if passed is True else ("fail" if passed is False else None),
             "score": score,
             "threshold": threshold,
             "reason": reason,
@@ -592,9 +593,6 @@ def _build_output_result(
         if properties:
             result_entry["properties"] = properties

-        if sample_payload:
-            result_entry["sample"] = sample_payload
-
         results.append(result_entry)

         if not results:
@@ -624,6 +622,7 @@ def _build_output_result(
             "name": risk_value,
             "metric": risk_value,
             "passed": None,
+            "label": None,
             "score": None,
             "threshold": attack_threshold,
             "reason": fallback_reason,
@@ -632,9 +631,6 @@ def _build_output_result(
         if properties:
             fallback_result["properties"] = properties

-        if sample_payload:
-            fallback_result["sample"] = sample_payload
-
         results.append(fallback_result)

         return results
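Net effect of these hunks: the per-conversation sample payload moves from each result entry up to the output item, and every result entry gains a label string mirroring passed. A rough illustration with invented values (the sample placeholder stands in for the real RedTeamRunOutputItemSample shape):

from typing import Any, Dict, Optional


def label_for(passed: Optional[bool]) -> Optional[str]:
    # Same mapping the diff adds: a string label aligned with the passed flag.
    return "pass" if passed is True else ("fail" if passed is False else None)


output_item: Dict[str, Any] = {
    "id": "item_0",
    "status": "completed",
    # "sample" now lives on the output item rather than on each result entry.
    "sample": {"input": "...", "output": "..."},  # placeholder contents
    "results": [
        {
            "name": "violence",
            "metric": "violence",
            "passed": True,
            "label": label_for(True),  # -> "pass"
            "score": 0.0,
            "threshold": 3.0,
            "reason": "No harmful content detected.",
        }
    ],
}

assert output_item["results"][0]["label"] == "pass"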
@@ -1096,9 +1092,12 @@ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:

     @staticmethod
     def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category)."""
+        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy)."""

+        # Track by risk category (testing_criteria)
         criteria: Dict[str, Dict[str, int]] = {}
+        # Track by attack strategy
+        strategy_criteria: Dict[str, Dict[str, int]] = {}

         for item in output_items:
             for result in item.get("results", []):
@@ -1111,13 +1110,28 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 if passed_value is None:
                     continue

+                # Track by risk category
                 bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
                 if passed_value:
                     bucket["passed"] += 1
                 else:
                     bucket["failed"] += 1

-        return [
+                # Track by attack strategy from properties
+                properties = result.get("properties", {})
+                if isinstance(properties, dict):
+                    attack_technique = properties.get("attack_technique")
+                    if attack_technique:
+                        strategy_bucket = strategy_criteria.setdefault(
+                            str(attack_technique), {"passed": 0, "failed": 0}
+                        )
+                        if passed_value:
+                            strategy_bucket["passed"] += 1
+                        else:
+                            strategy_bucket["failed"] += 1
+
+        # Build results list with risk categories
+        results = [
             {
                 "testing_criteria": criteria_name,
                 "passed": counts["passed"],
@@ -1126,6 +1140,19 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             for criteria_name, counts in sorted(criteria.items())
         ]

+        # Add attack strategy summaries
+        for strategy_name, counts in sorted(strategy_criteria.items()):
+            results.append(
+                {
+                    "testing_criteria": strategy_name,
+                    "attack_strategy": strategy_name,
+                    "passed": counts["passed"],
+                    "failed": counts["failed"],
+                }
+            )
+
+        return results
+
     @staticmethod
     def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
         """Build the data_source portion of the run payload for red-team scans."""
@@ -1179,7 +1206,6 @@ def _build_results_payload(
         output_items: List[Dict[str, Any]],
         eval_run: Optional[Any] = None,
         red_team_info: Optional[Dict] = None,
-        include_conversations: bool = False,
         scan_name: Optional[str] = None,
         run_id_override: Optional[str] = None,
         eval_id_override: Optional[str] = None,
@@ -1191,7 +1217,6 @@ def _build_results_payload(
         :param output_items: List of output items containing results for each conversation
         :param eval_run: The MLFlow run object (optional)
         :param red_team_info: Red team tracking information (optional)
-        :param include_conversations: Whether to include conversation details (optional)
         :param scan_name: Name of the scan (optional)
         :param run_id_override: Override for run ID (optional)
         :param eval_id_override: Override for eval ID (optional)
@@ -1290,7 +1315,4 @@ def _build_results_payload(
             "output_items": list_wrapper,
         }

-        if include_conversations:
-            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
-
         return run_payload
