Commit 126516f

Auth fix and logging improvements (Azure#40362)
* try adding auth headers on orchestrator call for entra auth
* improve logged artifacts
* test fixes and setup update
* pin termcolor version
* minor fix
Parent: 452cd7d

5 files changed: +49 lines, -9 lines


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

Lines changed: 22 additions & 1 deletion
@@ -230,6 +230,19 @@ async def _log_redteam_results_to_mlflow(
             f.write(json.dumps({"conversations": redteam_output.redteaming_data or []}))
         elif redteam_output.red_team_result:
             json.dump(redteam_output.red_team_result, f)
+
+    eval_info_name = "redteam_info.json"
+    eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
+    self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
+    with open(eval_info_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+        # Remove evaluation_result from red_team_info before logging
+        red_team_info_logged = {}
+        for strategy, harms_dict in self.red_team_info.items():
+            red_team_info_logged[strategy] = {}
+            for harm, info_dict in harms_dict.items():
+                info_dict.pop("evaluation_result", None)
+                red_team_info_logged[strategy][harm] = info_dict
+        f.write(json.dumps(red_team_info_logged))

     # Also save a human-readable scorecard if available
     if not data_only and redteam_output.red_team_result:
@@ -270,6 +283,7 @@ async def _log_redteam_results_to_mlflow(
     # Log the entire directory to MLFlow
     try:
         eval_run.log_artifact(tmpdir, artifact_name)
+        eval_run.log_artifact(tmpdir, eval_info_name)
         self.logger.debug(f"Successfully logged artifacts directory to MLFlow")
     except Exception as e:
         self.logger.warning(f"Failed to log artifacts to MLFlow: {str(e)}")
@@ -675,11 +689,13 @@ async def _prompt_sending_orchestrator(
                 # Set task status to TIMEOUT
                 batch_task_key = f"{strategy_name}_{risk_category}_batch_{batch_idx+1}"
                 self.task_statuses[batch_task_key] = TASK_STATUS["TIMEOUT"]
+                self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
                 # Continue with partial results rather than failing completely
                 continue
             except Exception as e:
                 log_error(self.logger, f"Error processing batch {batch_idx+1}", e, f"{strategy_name}/{risk_category}")
                 self.logger.debug(f"ERROR: Strategy {strategy_name}, Risk {risk_category}, Batch {batch_idx+1}: {str(e)}")
+                self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
                 # Continue with other batches even if one fails
                 continue
         else:
@@ -699,9 +715,11 @@ async def _prompt_sending_orchestrator(
                 # Set task status to TIMEOUT
                 single_batch_task_key = f"{strategy_name}_{risk_category}_single_batch"
                 self.task_statuses[single_batch_task_key] = TASK_STATUS["TIMEOUT"]
+                self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]
             except Exception as e:
                 log_error(self.logger, "Error processing prompts", e, f"{strategy_name}/{risk_category}")
                 self.logger.debug(f"ERROR: Strategy {strategy_name}, Risk {risk_category}: {str(e)}")
+                self.red_team_info[strategy_name][risk_category]["status"] = TASK_STATUS["INCOMPLETE"]

         self.task_statuses[task_key] = TASK_STATUS["COMPLETED"]
         return orchestrator
@@ -1266,7 +1284,6 @@ def filter(self, record):
                     output_path=result_path,
                 )
                 eval_logger.debug(f"Completed evaluation for {risk_category.value}/{strategy_name}")
-
             finally:
                 # Restore original stdout and stderr
                 sys.stdout = original_stdout
@@ -1299,6 +1316,7 @@ def filter(self, record):
                 self.logger.warning(f"Failed to clean up logger: {str(e)}")
         self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["evaluation_result_file"] = str(result_path)
         self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["evaluation_result"] = evaluate_outputs
+        self.red_team_info[self._get_strategy_name(strategy)][risk_category.value]["status"] = TASK_STATUS["COMPLETED"]
         self.logger.debug(f"Evaluation complete for {strategy_name}/{risk_category.value}, results stored in red_team_info")

     async def _process_attack(
@@ -1341,6 +1359,8 @@ async def _process_attack(
         converter = self._get_converter_for_strategy(strategy)
         try:
             self.logger.debug(f"Calling orchestrator for {strategy_name} strategy")
+            if isinstance(self.chat_target, OpenAIChatTarget) and not self.chat_target._headers.get("Api-Key", None):
+                self.chat_target._headers["Authorization"] = f"Bearer {self.chat_target._token_provider()}"
             orchestrator = await call_orchestrator(self.chat_target, all_prompts, converter, strategy_name, risk_category.value, timeout)
         except PyritException as e:
             log_error(self.logger, f"Error calling orchestrator for {strategy_name} strategy", e)
@@ -1370,6 +1390,7 @@ async def _process_attack(
         except Exception as e:
             log_error(self.logger, f"Error during evaluation for {strategy_name}/{risk_category.value}", e)
             print(f"⚠️ Evaluation error for {strategy_name}/{risk_category.value}: {str(e)}")
+            self.red_team_info[strategy_name][risk_category.value]["status"] = TASK_STATUS["FAILED"]
             # Continue processing even if evaluation fails

         async with progress_bar_lock:
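Note on the auth fix: the two added lines reach into PyRIT's OpenAIChatTarget internals (_headers, _token_provider). As a standalone illustration of the idea only, here is a minimal sketch; ensure_auth_header and its arguments are hypothetical names, not part of the SDK or PyRIT:

from typing import Callable, Dict

def ensure_auth_header(headers: Dict[str, str], token_provider: Callable[[], str]) -> Dict[str, str]:
    # If no API key is configured, fall back to Entra ID: fetch a fresh access
    # token from the provider and send it as a bearer Authorization header.
    if not headers.get("Api-Key"):
        headers["Authorization"] = f"Bearer {token_provider()}"
    return headers

# Usage with a stand-in token provider:
print(ensure_auth_header({}, lambda: "<entra-access-token>"))
# {'Authorization': 'Bearer <entra-access-token>'}

Fetching the token right before the orchestrator call (rather than once at setup) presumably avoids expired tokens during long scans.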

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/constants.py

Lines changed: 2 additions & 1 deletion
@@ -60,5 +60,6 @@
     "RUNNING": "running",
     "COMPLETED": "completed",
     "FAILED": "failed",
-    "TIMEOUT": "timeout"
+    "TIMEOUT": "timeout",
+    "INCOMPLETE": "incomplete",
 }
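The new "INCOMPLETE" entry lets _prompt_sending_orchestrator distinguish a strategy/risk cell that only partially ran (timeout or batch error) from one that failed outright. A rough sketch of how the status values flow, using stand-in dictionaries rather than the real RedTeam state and only the TASK_STATUS keys visible in this diff:

TASK_STATUS = {"RUNNING": "running", "COMPLETED": "completed",
               "FAILED": "failed", "TIMEOUT": "timeout", "INCOMPLETE": "incomplete"}

# Stand-ins for self.task_statuses / self.red_team_info in _red_team.py
task_statuses = {}
red_team_info = {"some_strategy": {"some_risk": {}}}

# A batch times out: the per-task key records the timeout, while the
# strategy/risk entry is downgraded to "incomplete" instead of "failed".
task_statuses["some_strategy_some_risk_batch_1"] = TASK_STATUS["TIMEOUT"]
red_team_info["some_strategy"]["some_risk"]["status"] = TASK_STATUS["INCOMPLETE"]

print(red_team_info["some_strategy"]["some_risk"]["status"])  # incomplete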

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py

Lines changed: 16 additions & 5 deletions
@@ -110,9 +110,14 @@ def format_scorecard(redteam_result: _RedTeamResult) -> str:
     :rtype: str
     """
     scorecard = redteam_result["redteaming_scorecard"]
-    overall_asr = scorecard["risk_category_summary"][0]["overall_asr"] if scorecard["risk_category_summary"] else 0
+    risk_summary = scorecard["risk_category_summary"][0] if scorecard["risk_category_summary"] else {}
+    overall_asr = risk_summary.get("overall_asr", 0)

     output = [f"Overall ASR: {overall_asr}%"]
+    overall_successes = risk_summary.get("overall_attack_successes", 0)
+    overall_total = risk_summary.get("overall_total", 0)
+    output.append(f"Attack Success: {overall_successes}/{overall_total} attacks were successful")
+

     separator = "-" * 108
     output.append(separator)
@@ -121,10 +126,16 @@ def format_scorecard(redteam_result: _RedTeamResult) -> str:

     for item in scorecard["joint_risk_attack_summary"]:
         risk_category = item["risk_category"].replace("_", "-").capitalize()
-        baseline = f"{item['baseline_asr']}%" if 'baseline_asr' in item else "N/A"
-        easy = f"{item['easy_complexity_asr']}%" if 'easy_complexity_asr' in item else "N/A"
-        moderate = f"{item['moderate_complexity_asr']}%" if 'moderate_complexity_asr' in item else "N/A"
-        difficult = f"{item['difficult_complexity_asr']}%" if 'difficult_complexity_asr' in item else "N/A"
+
+        baseline_val = item.get('baseline_asr')
+        easy_val = item.get('easy_complexity_asr')
+        moderate_val = item.get('moderate_complexity_asr')
+        difficult_val = item.get('difficult_complexity_asr')
+
+        baseline = "N/A" if is_none_or_nan(baseline_val) else f"{baseline_val}%"
+        easy = "N/A" if is_none_or_nan(easy_val) else f"{easy_val}%"
+        moderate = "N/A" if is_none_or_nan(moderate_val) else f"{moderate_val}%"
+        difficult = "N/A" if is_none_or_nan(difficult_val) else f"{difficult_val}%"

         output.append(f"{risk_category:<15}| {baseline:<14} | {easy:<28} | {moderate:<31} | {difficult:<30}")


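The reworked formatting relies on an is_none_or_nan helper (imported elsewhere in the module) so that missing or NaN ASR values render as "N/A" instead of printing "None%" or "nan%". A minimal sketch of what such a helper amounts to, under the assumption that it only needs to handle None and float NaN:

import math
from typing import Any

def is_none_or_nan(value: Any) -> bool:
    # True for missing values (None) and for float NaN, the usual way an ASR
    # that could not be computed shows up in aggregated results.
    return value is None or (isinstance(value, float) and math.isnan(value))

assert is_none_or_nan(None)
assert is_none_or_nan(float("nan"))
assert not is_none_or_nan(0.0)      # a genuine 0% ASR should still be printed
print("N/A" if is_none_or_nan(None) else f"{None}%")  # N/A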
sdk/evaluation/azure-ai-evaluation/setup.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@
     extras_require={
         "redteam": [
             "termcolor==2.5.0",
-            "pyrit==0.8.0"
+            "pyrit>=0.8.0"
         ]
     },
     project_urls={
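The looser constraint lets the redteam extra pick up newer PyRIT releases while keeping termcolor pinned at 2.5.0; consumers opt in to both via the extra, typically with something like pip install "azure-ai-evaluation[redteam]".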

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_red_team.py

Lines changed: 8 additions & 1 deletion
@@ -448,7 +448,6 @@ async def test_get_attack_objectives_with_risk_category(
             risk_category=RiskCategory.Violence,
             application_scenario="Test scenario"
         )
-        import pdb; pdb.set_trace()
         mock_generated_rai_client_instance.get_attack_objectives.assert_called_with(
             risk_category="violence",
             application_scenario="Test scenario",
@@ -767,6 +766,13 @@ async def test_prompt_sending_orchestrator_timeout(self, red_team):
             mock_orchestrator = MagicMock()
             mock_orchestrator.send_prompts_async = AsyncMock()
             mock_orch_class.return_value = mock_orchestrator
+
+            red_team.red_team_info = {
+                'test_strategy':
+                {
+                    'test_risk': {}
+                }
+            }

             result = await red_team._prompt_sending_orchestrator(
                 chat_target=mock_chat_target,
@@ -785,6 +791,7 @@ async def test_prompt_sending_orchestrator_timeout(self, red_team):
             assert "test_strategy_test_risk_single_batch" in red_team.task_statuses
             assert red_team.task_statuses["test_strategy_test_risk_single_batch"] == "timeout"
             assert red_team.task_statuses["test_strategy_test_risk_orchestrator"] == "completed"
+            assert red_team.red_team_info["test_strategy"]["test_risk"]["status"] == "incomplete"


     @pytest.mark.unittest
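For readers unfamiliar with the test setup: the timeout path is exercised by giving the mocked orchestrator an async send method that does not finish within the timeout budget. A self-contained sketch of that pattern (not the SDK's actual test; names are illustrative):

import asyncio
from unittest.mock import AsyncMock, MagicMock

async def slow_send(*args, **kwargs):
    # Simulates an orchestrator call that outlives the timeout budget.
    await asyncio.sleep(10)

async def main():
    mock_orchestrator = MagicMock()
    mock_orchestrator.send_prompts_async = AsyncMock(side_effect=slow_send)

    try:
        await asyncio.wait_for(mock_orchestrator.send_prompts_async(), timeout=0.01)
        status = "completed"
    except asyncio.TimeoutError:
        # Mirrors the new behavior: the run is recorded as incomplete, not failed.
        status = "incomplete"
    print(status)  # incomplete

asyncio.run(main())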
