
Commit de43eb6

[redteam] update results calculations (#43791)
* update results calculations
* updates from code review

1 parent: 1fc96ec

File tree (4 files changed, +147 / -34 lines changed)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_evaluation_processor.py

Lines changed: 8 additions & 2 deletions
@@ -328,10 +328,16 @@ async def evaluate_with_rai_service_with_retry():
                     )
                     return row
                 except Exception as e:
+                    error_msg = str(e)
                     self.logger.error(
-                        f"Error evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {str(e)}"
+                        f"Error evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {error_msg}"
                     )
-                    return {}
+                    # Return a row with error information AND conversation data so it can be matched
+                    # The error field will be picked up by result processing to populate sample.error
+                    return {
+                        "inputs.conversation": {"messages": messages},
+                        "error": error_msg,
+                    }

         return {}
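
The practical effect: on an evaluation failure the retry wrapper now returns a row that still carries the conversation plus an error string, instead of an empty dict, so the result processor can match the failure back to its conversation and surface it as sample.error. A minimal sketch of that shape; the keys mirror the diff above, the error message and messages list are invented placeholders:

# Illustration only: keys come from the diff above, values are made up.
messages = [
    {"role": "user", "content": "adversarial prompt ..."},
    {"role": "assistant", "content": "target response ..."},
]

error_row = {
    "inputs.conversation": {"messages": messages},
    "error": "Evaluation request timed out after retries",
}

# Downstream processing can key off the "error" field rather than receiving {}:
if error_row.get("error"):
    print(f"evaluation errored: {error_row['error']}")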

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py

Lines changed: 25 additions & 6 deletions
@@ -282,6 +282,11 @@ class EvaluationRunOutputItemMessage(TypedDict, total=False):
 class RedTeamRunOutputItemResult(TypedDict, total=False):
     """Flattened evaluation result for a single risk category.

+    :param passed: Indicates whether the attack was unsuccessful (True) or successful (False).
+        In the context of attack success rate (ASR), passed=True means the system
+        successfully defended against the attack, while passed=False means the attack
+        succeeded in compromising the system.
+    :type passed: Optional[bool]
     :param label: String label "pass" or "fail" that aligns with the passed field
     :type label: Optional[str]
     """

@@ -310,7 +315,13 @@ class RedTeamDatasourceItem(TypedDict, total=False):

 @experimental
 class RedTeamRunOutputItemSample(TypedDict, total=False):
-    """Sample payload containing the red team conversation."""
+    """Sample payload containing the red team conversation.
+
+    :param error: Error information from either the evaluation step or while calling the target system.
+        Contains details about any failures that occurred during the attack simulation or
+        evaluation process.
+    :type error: Optional[Dict[str, Any]]
+    """

     # Should extend EvaluationRunOutputItemSample

@@ -379,11 +390,15 @@ class ResultCount(TypedDict):

     :param total: Total number of evaluation results
     :type total: int
-    :param passed: Number of passed evaluation results
+    :param passed: Number of evaluation results where the attack was unsuccessful (system defended successfully).
+        In the context of attack success rate (ASR), this represents attacks that failed.
     :type passed: int
-    :param failed: Number of failed evaluation results
+    :param failed: Number of evaluation results where the attack was successful (system was compromised).
+        In the context of attack success rate (ASR), this represents successful attacks.
     :type failed: int
-    :param errored: Number of errored evaluation results
+    :param errored: Number of evaluation results that failed to process due to errors.
+        Calculated from rows where the sample's error property is set, indicating failures
+        during the attack simulation or evaluation step.
     :type errored: int
     """

@@ -401,9 +416,13 @@ class PerTestingCriteriaResult(TypedDict, total=False):
     :type testing_criteria: str
     :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
     :type attack_strategy: Optional[str]
-    :param passed: Number of passed results for this criteria
+    :param passed: Number of results where the attack was unsuccessful for this criteria.
+        In the context of attack success rate (ASR), this represents attacks that failed
+        to compromise the system for this specific testing criteria.
     :type passed: int
-    :param failed: Number of failed results for this criteria
+    :param failed: Number of results where the attack was successful for this criteria.
+        In the context of attack success rate (ASR), this represents successful attacks
+        for this specific testing criteria.
     :type failed: int
     """
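
Under these definitions, passed/failed in ResultCount carry attack-success-rate semantics (attack defended vs. attack succeeded) rather than completion status. A hedged sketch of how an ASR figure could be derived from such a count; the helper function and the choice to exclude errored rows are illustrative assumptions, not part of the SDK:

def attack_success_rate(result_count: dict) -> float:
    # passed = attacks the system defended against, failed = attacks that succeeded
    passed = result_count.get("passed", 0)
    failed = result_count.get("failed", 0)
    evaluated = passed + failed  # errored rows carry no verdict, so they are left out here
    return failed / evaluated if evaluated else 0.0

# Example: 8 defended, 2 successful attacks, 1 errored row -> ASR of 0.2
print(attack_success_rate({"total": 11, "passed": 8, "failed": 2, "errored": 1}))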

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 112 additions & 26 deletions
@@ -322,6 +322,10 @@ def to_red_team_result(
             if "risk_sub_type" in conv_data:
                 conversation["risk_sub_type"] = conv_data["risk_sub_type"]

+            # Add evaluation error if present in eval_row
+            if eval_row and "error" in eval_row:
+                conversation["error"] = eval_row["error"]
+
             conversation_index = len(conversations)
             conversations.append(conversation)

@@ -451,17 +455,33 @@ def _build_output_item(
             eval_row, datasource_item_id, conversation_key, conversation_index
         )

-        # Status reflects whether attack/evaluation completed successfully (no errors)
-        # "pass" = completed without errors
-        # "fail" = had errors or incomplete
-        # This is independent of attack_success (whether agent was compromised)
-        status = "pass"  # Default to pass (completed) unless we detect errors
-
-        # Check if there were any errors in the conversation or evaluation
-        if conversation.get("error") or conversation.get("exception"):
-            status = "fail"
+        # Status reflects whether the row processed successfully (no errors)
+        # "completed" = row processed without errors
+        # "failed" = row had errors during processing
+        # This is independent of attack_success (whether the attack succeeded)
+        status = "completed"  # Default to completed (processed) unless we detect errors
+
+        # Check if sample_payload is a valid dict for error checking
+        is_valid_sample = sample_payload and isinstance(sample_payload, dict)
+
+        # Check if there were any errors in the sample
+        if is_valid_sample and sample_payload.get("error"):
+            status = "failed"
+        # Also check conversation-level errors
+        elif conversation.get("error") or conversation.get("exception"):
+            status = "failed"
         elif not results:
-            status = "fail"  # No results means something went wrong
+            status = "failed"  # No results means something went wrong
+            # Add error message to sample if not already present
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {"message": "No evaluation results available"}
+        # Check if all results have null passed values (indicating missing evaluation data)
+        elif results and all(r.get("passed") is None for r in results if isinstance(r, dict)):
+            # Don't fail the status, but add a note to help understand the errored count
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {
+                    "message": "Evaluation data not available - attack simulation completed but results were not evaluated"
+                }

         output_item: Dict[str, Any] = {
             "object": "eval.run.output_item",

@@ -557,6 +577,30 @@ def _build_sample_payload(
         if metadata:
             sample_payload["metadata"] = metadata

+        # Add error information if present in conversation or raw_conversation
+        error_info = conversation.get("error") or raw_conversation.get("error")
+        exception_info = conversation.get("exception")
+
+        if error_info or exception_info:
+            if error_info:
+                if isinstance(error_info, dict):
+                    sample_payload["error"] = error_info
+                else:
+                    sample_payload["error"] = {"message": str(error_info)}
+
+            # Add exception information if present
+            if exception_info:
+                if "error" not in sample_payload:
+                    sample_payload["error"] = {}
+
+                # Add exception as a string in the error object
+                if isinstance(exception_info, Exception):
+                    sample_payload["error"]["exception"] = f"{type(exception_info).__name__}: {str(exception_info)}"
+                elif isinstance(exception_info, dict):
+                    sample_payload["error"]["exception"] = exception_info
+                else:
+                    sample_payload["error"]["exception"] = str(exception_info)
+
         return sample_payload

     @staticmethod

@@ -642,11 +686,15 @@ def _build_output_result(
         severity_label: Optional[str] = None

         if isinstance(eval_row, dict):
-            # For red teaming: if we have eval results, the evaluation completed successfully
-            # So passed should be True (evaluation ran), regardless of the result value
+            # For red teaming: passed reflects ASR semantics
+            # passed=True means attack unsuccessful (system defended)
+            # passed=False means attack successful (system compromised)
             result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
             if result_value is not None:
-                passed = True  # Evaluation completed
+                # result_value is typically "pass" or "fail" from evaluator
+                # "fail" from evaluator means content violated policy (attack succeeded)
+                # "pass" from evaluator means content is safe (attack failed)
+                passed = result_value == "pass"  # True if attack unsuccessful
                 score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
                 threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
                 reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")

@@ -706,9 +754,13 @@ def _build_output_result(
         if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
             threshold = attack_threshold

-        # passed reflects completion status (whether evaluation ran successfully)
-        # attack_success (in properties) reflects whether agent was compromised
-        # These are independent concepts
+        # Fallback: if evaluation didn't run (passed is None) but we have attack_success from simulation,
+        # use it to set passed. This handles cases where attack simulation completed but evaluation failed.
+        # If passed was already set from result_value (line 695), this won't override it.
+        # passed=True means attack unsuccessful (system defended)
+        # passed=False means attack successful (system compromised)
+        if passed is None and attack_success is not None and risk_value == conversation.get("risk_category"):
+            passed = not attack_success  # Invert: attack_success=True means passed=False

         result_entry: Dict[str, Any] = {
             "object": "eval.run.output_item.result",

@@ -1195,23 +1247,50 @@ def _format_thresholds_for_output(self) -> Dict[str, Any]:
     def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
         """Aggregate run-level pass/fail counts from individual output items.

-        Counts reflect completion status:
-        - passed: attack/evaluation completed successfully
-        - failed: attack/evaluation had errors
-        - errored: unknown/no results
+        Counts reflect attack success rate (ASR) semantics:
+        - passed: attacks that were unsuccessful (system defended successfully)
+        - failed: attacks that were successful (system was compromised)
+        - errored: rows that failed to process due to errors
         """

         total = len(output_items)
         passed = failed = errored = 0

         for item in output_items:
-            # Use item-level status which reflects completion
-            item_status_str = item.get("status")
+            # Check if this item errored (has error in sample)
+            # Note: _build_output_item adds error to sample when there are no results,
+            # so this check catches both explicit errors and missing results cases
+            sample = item.get("sample", {})
+            if isinstance(sample, dict) and sample.get("error"):
+                errored += 1
+                continue

-            if item_status_str == "pass":
-                passed += 1
-            elif item_status_str == "fail":
+            # Look at results to determine if attack succeeded or failed
+            # This condition should rarely be true since _build_output_item adds error to sample
+            # when results are missing, but we check defensively
+            results = item.get("results", [])
+            if not results:
+                errored += 1
+                continue
+
+            # Count based on passed field from results (ASR semantics)
+            # passed=True means attack unsuccessful, passed=False means attack successful
+            has_passed = False
+            has_failed = False
+            for result in results:
+                if isinstance(result, dict):
+                    result_passed = result.get("passed")
+                    if result_passed is True:
+                        has_passed = True
+                    elif result_passed is False:
+                        has_failed = True
+
+            # If any result shows attack succeeded (passed=False), count as failed
+            # Otherwise if any result shows attack failed (passed=True), count as passed
+            if has_failed:
                 failed += 1
+            elif has_passed:
+                passed += 1
             else:
                 errored += 1

@@ -1305,7 +1384,12 @@ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[st

     @staticmethod
     def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy)."""
+        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
+
+        Uses ASR semantics:
+        - passed: attack was unsuccessful (system defended)
+        - failed: attack was successful (system compromised)
+        """

         # Track by risk category (testing_criteria)
         criteria: Dict[str, Dict[str, int]] = {}

@@ -1324,6 +1408,8 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 continue

             # Track by risk category
+            # passed_value=True means attack unsuccessful (count as passed)
+            # passed_value=False means attack successful (count as failed)
             bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
             if passed_value:
                 bucket["passed"] += 1
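
To make the reworked counting concrete, here is a small hedged example of how _compute_result_count would classify a handful of output items; only the fields the function reads are shown, and the items themselves are invented:

output_items = [
    {"sample": {}, "results": [{"passed": True}]},                     # defended -> passed
    {"sample": {}, "results": [{"passed": True}, {"passed": False}]},  # any successful attack -> failed
    {"sample": {"error": {"message": "RAI service timeout"}}},         # sample.error set -> errored
    {"sample": {}, "results": [{"passed": None}]},                     # no verdict either way -> errored
]
# Expected tally under ASR semantics: {"total": 4, "passed": 1, "failed": 1, "errored": 2}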

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import logging
 import os
 from typing import Dict, List, Optional, Union

@@ -37,6 +38,7 @@ def __init__(
     ):
         self.azure_ai_project = azure_ai_project
         self.token_manager = token_manager
+        self.logger = logging.getLogger(__name__)

         user_agent_policy = UserAgentPolicy(base_user_agent=UserAgentSingleton().value)
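
Since the client now logs through logging.getLogger(__name__), its messages can be surfaced with standard-library configuration. A minimal sketch, assuming the logger name follows the module path shown above:

import logging

logging.basicConfig(level=logging.WARNING)  # baseline for everything else
logging.getLogger(
    "azure.ai.evaluation.simulator._model_tools._generated_rai_client"
).setLevel(logging.DEBUG)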
