@@ -230,6 +230,19 @@ async def _log_redteam_results_to_mlflow(
                     f.write(json.dumps({"conversations": redteam_output.redteaming_data or []}))
                 elif redteam_output.red_team_result:
                     json.dump(redteam_output.red_team_result, f)
+
+            eval_info_name = "redteam_info.json"
+            eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
+            self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
+            with open(eval_info_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                # Remove evaluation_result from red_team_info before logging
+                red_team_info_logged = {}
+                for strategy, harms_dict in self.red_team_info.items():
+                    red_team_info_logged[strategy] = {}
+                    for harm, info_dict in harms_dict.items():
+                        info_dict.pop("evaluation_result", None)
+                        red_team_info_logged[strategy][harm] = info_dict
+                f.write(json.dumps(red_team_info_logged))
 
             # Also save a human-readable scorecard if available
             if not data_only and redteam_output.red_team_result:
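The added block persists a sanitized copy of self.red_team_info next to the other scan artifacts. A minimal sketch of the same stripping logic in isolation, assuming the nested {strategy: {harm: info_dict}} shape implied by the loop; the sample keys and values are illustrative only:

import json

# Illustrative red_team_info shape: {strategy: {harm: info_dict}}
red_team_info = {
    "base64": {
        "violence": {
            "data_file": "base64_violence_data.jsonl",
            "status": "incomplete",
            "evaluation_result": {"rows": []},  # bulky object, not worth persisting
        }
    }
}

# Drop evaluation_result from each info dict before serializing
red_team_info_logged = {}
for strategy, harms_dict in red_team_info.items():
    red_team_info_logged[strategy] = {}
    for harm, info_dict in harms_dict.items():
        info_dict.pop("evaluation_result", None)
        red_team_info_logged[strategy][harm] = info_dict

with open("redteam_info.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(red_team_info_logged))

One design note: pop mutates the shared info_dict in place, so the in-memory red_team_info also loses evaluation_result after logging; copying each dict first (e.g. dict(info_dict)) would avoid that side effect if the result is still needed afterwards.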
@@ -270,6 +283,7 @@ async def _log_redteam_results_to_mlflow(
             # Log the entire directory to MLFlow
             try:
                 eval_run.log_artifact(tmpdir, artifact_name)
+                eval_run.log_artifact(tmpdir, eval_info_name)
                 self.logger.debug(f"Successfully logged artifacts directory to MLFlow")
             except Exception as e:
                 self.logger.warning(f"Failed to log artifacts to MLFlow: {str(e)}")
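eval_run.log_artifact here is the evaluation SDK's run wrapper, so the (tmpdir, artifact_name) signature is specific to that wrapper rather than the raw MLFlow client. Assuming the calls ultimately map onto MLFlow artifact logging, a standalone sketch with the plain mlflow API (paths illustrative):

import mlflow

with mlflow.start_run():
    # Upload the whole scan output directory, then the info file on its own,
    # mirroring the two log_artifact calls in the hunk above.
    mlflow.log_artifacts("scan_output_dir")
    mlflow.log_artifact("scan_output_dir/redteam_info.json")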
@@ -675,11 +689,13 @@ async def _prompt_sending_orchestrator(
                         # Set task status to TIMEOUT
                         batch_task_key = f"{strategy_name}_{risk_category}_batch_{batch_idx+1}"
                         self.task_statuses[batch_task_key] = TASK_STATUS["TIMEOUT"]
+                        self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
                         # Continue with partial results rather than failing completely
                         continue
                     except Exception as e:
                         log_error(self.logger, f"Error processing batch {batch_idx+1}", e, f"{strategy_name}/{risk_category}")
                         self.logger.debug(f"ERROR: Strategy {strategy_name}, Risk {risk_category}, Batch {batch_idx+1}: {str(e)}")
+                        self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
                         # Continue with other batches even if one fails
                         continue
             else:
@@ -699,9 +715,11 @@ async def _prompt_sending_orchestrator(
                     # Set task status to TIMEOUT
                     single_batch_task_key = f"{strategy_name}_{risk_category}_single_batch"
                     self.task_statuses[single_batch_task_key] = TASK_STATUS["TIMEOUT"]
+                    self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
                 except Exception as e:
                     log_error(self.logger, "Error processing prompts", e, f"{strategy_name}/{risk_category}")
                     self.logger.debug(f"ERROR: Strategy {strategy_name}, Risk {risk_category}: {str(e)}")
+                    self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
 
         self.task_statuses[task_key] = TASK_STATUS["COMPLETED"]
         return orchestrator
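These two hunks apply the same bookkeeping to the batched and single-batch code paths: when a send times out or raises, the (strategy, risk_category) entry in red_team_info is marked INCOMPLETE, and the loop continues so partial results survive. A sketch of the pattern, assuming TASK_STATUS is the plain string map implied by the keys used in the diff:

import asyncio

# Assumed values; only keys appearing in the diff are shown.
TASK_STATUS = {"COMPLETED": "completed", "TIMEOUT": "timeout",
               "INCOMPLETE": "incomplete", "FAILED": "failed"}

async def run_batch(send_batch, red_team_info, strategy, risk, timeout):
    try:
        await asyncio.wait_for(send_batch(), timeout=timeout)
    except asyncio.TimeoutError:
        # Batch timed out: keep going, but record the gap.
        red_team_info[strategy][risk]["status"] = TASK_STATUS["INCOMPLETE"]
    except Exception:
        # Batch failed outright: same bookkeeping, caller continues.
        red_team_info[strategy][risk]["status"] = TASK_STATUS["INCOMPLETE"]

The entry is only flipped to COMPLETED later, once evaluation for the pair finishes (the @@ -1299,6 +1316,7 @@ hunk below).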
@@ -1266,7 +1284,6 @@ def filter(self, record):
                         output_path=result_path,
                     )
                     eval_logger.debug(f"Completed evaluation for {risk_category.value}/{strategy_name}")
-
                 finally:
                     # Restore original stdout and stderr
                     sys.stdout = original_stdout
@@ -1299,6 +1316,7 @@ def filter(self, record):
                 self.logger.warning(f"Failed to clean up logger: {str(e)}")
         self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["evaluation_result_file"] = str(result_path)
         self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["evaluation_result"] = evaluate_outputs
+        self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["status"] = TASK_STATUS["COMPLETED"]
         self.logger.debug(f"Evaluation complete for {strategy_name}/{risk_category.value}, results stored in red_team_info")
 
     async def _process_attack(
@@ -1341,6 +1359,8 @@ async def _process_attack(
             converter = self._get_converter_for_strategy(strategy)
             try:
                 self.logger.debug(f"Calling orchestrator for {strategy_name} strategy")
+                if isinstance(self.chat_target, OpenAIChatTarget) and not self.chat_target._headers.get("Api-Key", None):
+                    self.chat_target._headers["Authorization"] = f"Bearer {self.chat_target._token_provider()}"
                 orchestrator = await call_orchestrator(self.chat_target, all_prompts, converter, strategy_name, risk_category.value, timeout)
             except PyritException as e:
                 log_error(self.logger, f"Error calling orchestrator for {strategy_name} strategy", e)
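The new guard handles token-based (Entra ID) authentication for PyRIT's OpenAIChatTarget: when no Api-Key header is present, the Authorization header is re-minted from the target's token provider before each orchestrator call, so a token that expired during a long scan does not start failing requests. It reaches into private attributes (_headers, _token_provider), assumed here to match PyRIT's internals; a hedged sketch of the idea:

def refresh_bearer_token(chat_target) -> None:
    # Assumes PyRIT-style private attributes: _headers is a dict of request
    # headers and _token_provider is a callable returning a fresh access token.
    if not chat_target._headers.get("Api-Key"):
        chat_target._headers["Authorization"] = f"Bearer {chat_target._token_provider()}"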
@@ -1370,6 +1390,7 @@ async def _process_attack(
             except Exception as e:
                 log_error(self.logger, f"Error during evaluation for {strategy_name}/{risk_category.value}", e)
                 print(f"⚠️ Evaluation error for {strategy_name}/{risk_category.value}: {str(e)}")
+                self.red_team_info[strategy_name][risk_category.value]["status"] = TASK_STATUS["FAILED"]
                 # Continue processing even if evaluation fails
 
         async with progress_bar_lock:
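Taken together, the status writes across these hunks give each (strategy, risk_category) entry in red_team_info a small lifecycle: INCOMPLETE when the orchestrator times out or a batch errors, FAILED when evaluation itself raises, COMPLETED once evaluation succeeds. A compact summary of the transitions, with event names invented for illustration:

# Event observed in the hunks above -> status recorded in red_team_info
STATUS_ON_EVENT = {
    "orchestrator_timeout": "incomplete",
    "batch_error": "incomplete",
    "evaluation_error": "failed",
    "evaluation_succeeded": "completed",
}

def record_status(red_team_info, strategy, risk, event):
    red_team_info[strategy][risk]["status"] = STATUS_ON_EVENT[event]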