
Commit de43eb6

[redteam] update results calculations (#43791)
* update results calculations
* updates from code review

1 parent: 1fc96ec

File tree (4 files changed, +147 / -34 lines changed)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py

Lines changed: 8 additions & 2 deletions
@@ -328,10 +328,16 @@ async def evaluate_with_rai_service_with_retry():
                     )
                     return row
                 except Exception as e:
+                    error_msg = str(e)
                     self.logger.error(
-                        f"Error evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {str(e)}"
+                        f"Error evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {error_msg}"
                     )
-                    return {}
+                    # Return a row with error information AND conversation data so it can be matched
+                    # The error field will be picked up by result processing to populate sample.error
+                    return {
+                        "inputs.conversation": {"messages": messages},
+                        "error": error_msg,
+                    }

         return {}
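
The practical effect: on an evaluation failure the retry wrapper now returns a row that still carries the conversation plus an error string, instead of an empty dict, so the result processor can match the failure back to its conversation and surface it as sample.error. A minimal sketch of that shape; the keys mirror the diff above, the error message and messages list are invented placeholders:

# Illustration only: keys come from the diff above, values are made up.
messages = [
    {"role": "user", "content": "adversarial prompt ..."},
    {"role": "assistant", "content": "target response ..."},
]

error_row = {
    "inputs.conversation": {"messages": messages},
    "error": "Evaluation request timed out after retries",
}

# Downstream processing can key off the "error" field rather than receiving {}:
if error_row.get("error"):
    print(f"evaluation errored: {error_row['error']}")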

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py

Lines changed: 25 additions & 6 deletions
@@ -282,6 +282,11 @@ class EvaluationRunOutputItemMessage(TypedDict, total=False):
 class RedTeamRunOutputItemResult(TypedDict, total=False):
     """Flattened evaluation result for a single risk category.

+    :param passed: Indicates whether the attack was unsuccessful (True) or successful (False).
+        In the context of attack success rate (ASR), passed=True means the system
+        successfully defended against the attack, while passed=False means the attack
+        succeeded in compromising the system.
+    :type passed: Optional[bool]
     :param label: String label "pass" or "fail" that aligns with the passed field
     :type label: Optional[str]
     """

@@ -310,7 +315,13 @@ class RedTeamDatasourceItem(TypedDict, total=False):

 @experimental
 class RedTeamRunOutputItemSample(TypedDict, total=False):
-    """Sample payload containing the red team conversation."""
+    """Sample payload containing the red team conversation.
+
+    :param error: Error information from either the evaluation step or while calling the target system.
+        Contains details about any failures that occurred during the attack simulation or
+        evaluation process.
+    :type error: Optional[Dict[str, Any]]
+    """

     # Should extend EvaluationRunOutputItemSample

@@ -379,11 +390,15 @@ class ResultCount(TypedDict):

     :param total: Total number of evaluation results
     :type total: int
-    :param passed: Number of passed evaluation results
+    :param passed: Number of evaluation results where the attack was unsuccessful (system defended successfully).
+        In the context of attack success rate (ASR), this represents attacks that failed.
     :type passed: int
-    :param failed: Number of failed evaluation results
+    :param failed: Number of evaluation results where the attack was successful (system was compromised).
+        In the context of attack success rate (ASR), this represents successful attacks.
     :type failed: int
-    :param errored: Number of errored evaluation results
+    :param errored: Number of evaluation results that failed to process due to errors.
+        Calculated from rows where the sample's error property is set, indicating failures
+        during the attack simulation or evaluation step.
     :type errored: int
     """

@@ -401,9 +416,13 @@ class PerTestingCriteriaResult(TypedDict, total=False):
     :type testing_criteria: str
     :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
     :type attack_strategy: Optional[str]
-    :param passed: Number of passed results for this criteria
+    :param passed: Number of results where the attack was unsuccessful for this criteria.
+        In the context of attack success rate (ASR), this represents attacks that failed
+        to compromise the system for this specific testing criteria.
     :type passed: int
-    :param failed: Number of failed results for this criteria
+    :param failed: Number of results where the attack was successful for this criteria.
+        In the context of attack success rate (ASR), this represents successful attacks
+        for this specific testing criteria.
     :type failed: int
     """
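
Under these definitions, passed/failed in ResultCount carry attack-success-rate semantics (attack defended vs. attack succeeded) rather than completion status. A hedged sketch of how an ASR figure could be derived from such a count; the helper function and the choice to exclude errored rows are illustrative assumptions, not part of the SDK:

def attack_success_rate(result_count: dict) -> float:
    # passed = attacks the system defended against, failed = attacks that succeeded
    passed = result_count.get("passed", 0)
    failed = result_count.get("failed", 0)
    evaluated = passed + failed  # errored rows carry no verdict, so they are left out here
    return failed / evaluated if evaluated else 0.0

# Example: 8 defended, 2 successful attacks, 1 errored row -> ASR of 0.2
print(attack_success_rate({"total": 11, "passed": 8, "failed": 2, "errored": 1}))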

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 112 additions & 26 deletions
@@ -322,6 +322,10 @@ def to_red_team_result(
             if "risk_sub_type" in conv_data:
                 conversation["risk_sub_type"] = conv_data["risk_sub_type"]

+            # Add evaluation error if present in eval_row
+            if eval_row and "error" in eval_row:
+                conversation["error"] = eval_row["error"]
+
             conversation_index = len(conversations)
             conversations.append(conversation)

@@ -451,17 +455,33 @@ def _build_output_item(
             eval_row, datasource_item_id, conversation_key, conversation_index
         )

-        # Status reflects whether attack/evaluation completed successfully (no errors)
-        # "pass" = completed without errors
-        # "fail" = had errors or incomplete
-        # This is independent of attack_success (whether agent was compromised)
-        status = "pass"  # Default to pass (completed) unless we detect errors
-
-        # Check if there were any errors in the conversation or evaluation
-        if conversation.get("error") or conversation.get("exception"):
-            status = "fail"
+        # Status reflects whether the row processed successfully (no errors)
+        # "completed" = row processed without errors
+        # "failed" = row had errors during processing
+        # This is independent of attack_success (whether the attack succeeded)
+        status = "completed"  # Default to completed (processed) unless we detect errors
+
+        # Check if sample_payload is a valid dict for error checking
+        is_valid_sample = sample_payload and isinstance(sample_payload, dict)
+
+        # Check if there were any errors in the sample
+        if is_valid_sample and sample_payload.get("error"):
+            status = "failed"
+        # Also check conversation-level errors
+        elif conversation.get("error") or conversation.get("exception"):
+            status = "failed"
         elif not results:
-            status = "fail"  # No results means something went wrong
+            status = "failed"  # No results means something went wrong
+            # Add error message to sample if not already present
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {"message": "No evaluation results available"}
+        # Check if all results have null passed values (indicating missing evaluation data)
+        elif results and all(r.get("passed") is None for r in results if isinstance(r, dict)):
+            # Don't fail the status, but add a note to help understand the errored count
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {
+                    "message": "Evaluation data not available - attack simulation completed but results were not evaluated"
+                }

         output_item: Dict[str, Any] = {
             "object": "eval.run.output_item",

@@ -557,6 +577,30 @@ def _build_sample_payload(
         if metadata:
             sample_payload["metadata"] = metadata

+        # Add error information if present in conversation or raw_conversation
+        error_info = conversation.get("error") or raw_conversation.get("error")
+        exception_info = conversation.get("exception")
+
+        if error_info or exception_info:
+            if error_info:
+                if isinstance(error_info, dict):
+                    sample_payload["error"] = error_info
+                else:
+                    sample_payload["error"] = {"message": str(error_info)}
+
+            # Add exception information if present
+            if exception_info:
+                if "error" not in sample_payload:
+                    sample_payload["error"] = {}
+
+                # Add exception as a string in the error object
+                if isinstance(exception_info, Exception):
+                    sample_payload["error"]["exception"] = f"{type(exception_info).__name__}: {str(exception_info)}"
+                elif isinstance(exception_info, dict):
+                    sample_payload["error"]["exception"] = exception_info
+                else:
+                    sample_payload["error"]["exception"] = str(exception_info)
+
         return sample_payload

     @staticmethod

@@ -642,11 +686,15 @@ def _build_output_result(
         severity_label: Optional[str] = None

         if isinstance(eval_row, dict):
-            # For red teaming: if we have eval results, the evaluation completed successfully
-            # So passed should be True (evaluation ran), regardless of the result value
+            # For red teaming: passed reflects ASR semantics
+            # passed=True means attack unsuccessful (system defended)
+            # passed=False means attack successful (system compromised)
             result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
             if result_value is not None:
-                passed = True  # Evaluation completed
+                # result_value is typically "pass" or "fail" from evaluator
+                # "fail" from evaluator means content violated policy (attack succeeded)
+                # "pass" from evaluator means content is safe (attack failed)
+                passed = result_value == "pass"  # True if attack unsuccessful
                 score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
                 threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
                 reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")

@@ -706,9 +754,13 @@ def _build_output_result(
         if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
             threshold = attack_threshold

-        # passed reflects completion status (whether evaluation ran successfully)
-        # attack_success (in properties) reflects whether agent was compromised
-        # These are independent concepts
+        # Fallback: if evaluation didn't run (passed is None) but we have attack_success from simulation,
+        # use it to set passed. This handles cases where attack simulation completed but evaluation failed.
+        # If passed was already set from result_value (line 695), this won't override it.
+        # passed=True means attack unsuccessful (system defended)
+        # passed=False means attack successful (system compromised)
+        if passed is None and attack_success is not None and risk_value == conversation.get("risk_category"):
+            passed = not attack_success  # Invert: attack_success=True means passed=False

         result_entry: Dict[str, Any] = {
             "object": "eval.run.output_item.result",

@@ -1195,23 +1247,50 @@ def _format_thresholds_for_output(self) -> Dict[str, Any]:
     def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
         """Aggregate run-level pass/fail counts from individual output items.

-        Counts reflect completion status:
-        - passed: attack/evaluation completed successfully
-        - failed: attack/evaluation had errors
-        - errored: unknown/no results
+        Counts reflect attack success rate (ASR) semantics:
+        - passed: attacks that were unsuccessful (system defended successfully)
+        - failed: attacks that were successful (system was compromised)
+        - errored: rows that failed to process due to errors
         """

         total = len(output_items)
         passed = failed = errored = 0

         for item in output_items:
-            # Use item-level status which reflects completion
-            item_status_str = item.get("status")
+            # Check if this item errored (has error in sample)
+            # Note: _build_output_item adds error to sample when there are no results,
+            # so this check catches both explicit errors and missing results cases
+            sample = item.get("sample", {})
+            if isinstance(sample, dict) and sample.get("error"):
+                errored += 1
+                continue

-            if item_status_str == "pass":
-                passed += 1
-            elif item_status_str == "fail":
+            # Look at results to determine if attack succeeded or failed
+            # This condition should rarely be true since _build_output_item adds error to sample
+            # when results are missing, but we check defensively
+            results = item.get("results", [])
+            if not results:
+                errored += 1
+                continue
+
+            # Count based on passed field from results (ASR semantics)
+            # passed=True means attack unsuccessful, passed=False means attack successful
+            has_passed = False
+            has_failed = False
+            for result in results:
+                if isinstance(result, dict):
+                    result_passed = result.get("passed")
+                    if result_passed is True:
+                        has_passed = True
+                    elif result_passed is False:
+                        has_failed = True
+
+            # If any result shows attack succeeded (passed=False), count as failed
+            # Otherwise if any result shows attack failed (passed=True), count as passed
+            if has_failed:
                 failed += 1
+            elif has_passed:
+                passed += 1
             else:
                 errored += 1

@@ -1305,7 +1384,12 @@ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[st

     @staticmethod
     def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy)."""
+        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
+
+        Uses ASR semantics:
+        - passed: attack was unsuccessful (system defended)
+        - failed: attack was successful (system compromised)
+        """

         # Track by risk category (testing_criteria)
         criteria: Dict[str, Dict[str, int]] = {}

@@ -1324,6 +1408,8 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 continue

             # Track by risk category
+            # passed_value=True means attack unsuccessful (count as passed)
+            # passed_value=False means attack successful (count as failed)
             bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
             if passed_value:
                 bucket["passed"] += 1
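
To make the reworked counting concrete, here is a small hedged example of how _compute_result_count would classify a handful of output items; only the fields the function reads are shown, and the items themselves are invented:

output_items = [
    {"sample": {}, "results": [{"passed": True}]},                     # defended -> passed
    {"sample": {}, "results": [{"passed": True}, {"passed": False}]},  # any successful attack -> failed
    {"sample": {"error": {"message": "RAI service timeout"}}},         # sample.error set -> errored
    {"sample": {}, "results": [{"passed": None}]},                     # no verdict either way -> errored
]
# Expected tally under ASR semantics: {"total": 4, "passed": 1, "failed": 1, "errored": 2}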

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import logging
 import os
 from typing import Dict, List, Optional, Union

@@ -37,6 +38,7 @@ def __init__(
     ):
         self.azure_ai_project = azure_ai_project
         self.token_manager = token_manager
+        self.logger = logging.getLogger(__name__)

         user_agent_policy = UserAgentPolicy(base_user_agent=UserAgentSingleton().value)
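
Since the client now logs through logging.getLogger(__name__), its messages can be surfaced with standard-library configuration. A minimal sketch, assuming the logger name follows the module path shown above:

import logging

logging.basicConfig(level=logging.WARNING)  # baseline for everything else
logging.getLogger(
    "azure.ai.evaluation.simulator._model_tools._generated_rai_client"
).setLevel(logging.DEBUG)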
