Skip to content

Commit 50d2424

Browse files
slister1001 and Copilot authored
Red team ensure proper XPIA prompt application for model targets (#43707)
* ensure xpia prompt application for model targets
* updates
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

  Co-authored-by: Copilot <[email protected]>
* updates

---------

Co-authored-by: Copilot <[email protected]>
1 parent 894d166 commit 50d2424

File tree

3 files changed

+28
-130
lines changed

3 files changed

+28
-130
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_callback_chat_target.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,9 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P
6363

6464
# Check if any context has agent-specific fields for logging
6565
has_agent_fields = any(
66-
isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
66+
isinstance(ctx, dict)
67+
and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
68+
for ctx in contexts
6769
)
6870

6971
if has_agent_fields:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_orchestrator_manager.py

Lines changed: 7 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -271,7 +271,9 @@ async def _prompt_sending_orchestrator(
271271

272272
# Check if any context has agent-specific fields (context_type, tool_name)
273273
has_agent_fields = any(
274-
isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
274+
isinstance(ctx, dict)
275+
and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
276+
for ctx in contexts
275277
)
276278

277279
# Build context_dict to pass via memory labels
@@ -284,46 +286,6 @@ async def _prompt_sending_orchestrator(
284286
else None
285287
)
286288

287-
# Initialize processed_prompt with the original prompt as default
288-
processed_prompt = prompt
289-
290-
# Determine how to handle the prompt based on target type and context fields
291-
if isinstance(chat_target, _CallbackChatTarget):
292-
# CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
293-
if contexts and not has_agent_fields:
294-
# For contexts without agent fields, the prompt already has context embedded
295-
# (done in _extract_objective_content), so just use it as-is
296-
processed_prompt = prompt
297-
self.logger.debug(
298-
f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
299-
)
300-
else:
301-
# Agent fields present - prompt is clean, contexts have structure
302-
processed_prompt = prompt
303-
tool_names = [
304-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
305-
]
306-
self.logger.debug(
307-
f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
308-
)
309-
else:
310-
# Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
311-
if has_agent_fields:
312-
# Agent target with structured context - don't embed in prompt
313-
processed_prompt = prompt
314-
tool_names = [
315-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
316-
]
317-
self.logger.debug(
318-
f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
319-
)
320-
elif contexts:
321-
# Model target without agent fields - embed context in prompt
322-
# Note: The prompt already has context embedded from _extract_objective_content
323-
# But for non-CallbackChatTarget, we may need additional wrapping
324-
processed_prompt = prompt
325-
self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
326-
327289
try:
328290
# Create retry-enabled function using the reusable decorator
329291
@network_retry_decorator(
@@ -339,7 +301,7 @@ async def send_prompt_with_retry():
339301
memory_labels["risk_sub_type"] = risk_sub_type
340302
return await asyncio.wait_for(
341303
orchestrator.send_prompts_async(
342-
prompt_list=[processed_prompt],
304+
prompt_list=[prompt],
343305
memory_labels=memory_labels,
344306
),
345307
timeout=calculated_timeout,
@@ -514,46 +476,6 @@ async def _multi_turn_orchestrator(
514476
ctx.get("content", "") if isinstance(ctx, dict) else str(ctx) for ctx in contexts
515477
)
516478

517-
# Initialize processed_prompt with the original prompt as default
518-
processed_prompt = prompt
519-
520-
# Determine how to handle the prompt based on target type and context fields
521-
if isinstance(chat_target, _CallbackChatTarget):
522-
# CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
523-
if contexts and not has_agent_fields:
524-
# For contexts without agent fields, the prompt already has context embedded
525-
# (done in _extract_objective_content), so just use it as-is
526-
processed_prompt = prompt
527-
self.logger.debug(
528-
f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
529-
)
530-
else:
531-
# Agent fields present - prompt is clean, contexts have structure
532-
processed_prompt = prompt
533-
tool_names = [
534-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
535-
]
536-
self.logger.debug(
537-
f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
538-
)
539-
else:
540-
# Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
541-
if has_agent_fields:
542-
# Agent target with structured context - don't embed in prompt
543-
processed_prompt = prompt
544-
tool_names = [
545-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
546-
]
547-
self.logger.debug(
548-
f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
549-
)
550-
elif contexts:
551-
# Model target without agent fields - embed context in prompt
552-
# Note: The prompt already has context embedded from _extract_objective_content
553-
# But for non-CallbackChatTarget, we may need additional wrapping
554-
processed_prompt = prompt
555-
self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
556-
557479
try:
558480
azure_rai_service_scorer = AzureRAIServiceTrueFalseScorer(
559481
client=self.generated_rai_client,
@@ -741,7 +663,9 @@ async def _crescendo_orchestrator(
741663

742664
# Check if any context has agent-specific fields (context_type, tool_name)
743665
has_agent_fields = any(
744-
isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
666+
isinstance(ctx, dict)
667+
and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
668+
for ctx in contexts
745669
)
746670

747671
# Build context_dict to pass via memory labels
@@ -762,46 +686,6 @@ async def _crescendo_orchestrator(
762686
ctx.get("content", "") if isinstance(ctx, dict) else str(ctx) for ctx in contexts
763687
)
764688

765-
# Initialize processed_prompt with the original prompt as default
766-
processed_prompt = prompt
767-
768-
# Determine how to handle the prompt based on target type and context fields
769-
if isinstance(chat_target, _CallbackChatTarget):
770-
# CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
771-
if contexts and not has_agent_fields:
772-
# For contexts without agent fields, the prompt already has context embedded
773-
# (done in _extract_objective_content), so just use it as-is
774-
processed_prompt = prompt
775-
self.logger.debug(
776-
f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
777-
)
778-
else:
779-
# Agent fields present - prompt is clean, contexts have structure
780-
processed_prompt = prompt
781-
tool_names = [
782-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
783-
]
784-
self.logger.debug(
785-
f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
786-
)
787-
else:
788-
# Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
789-
if has_agent_fields:
790-
# Agent target with structured context - don't embed in prompt
791-
processed_prompt = prompt
792-
tool_names = [
793-
ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
794-
]
795-
self.logger.debug(
796-
f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
797-
)
798-
elif contexts:
799-
# Model target without agent fields - embed context in prompt
800-
# Note: The prompt already has context embedded from _extract_objective_content
801-
# But for non-CallbackChatTarget, we may need additional wrapping
802-
processed_prompt = prompt
803-
self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
804-
805689
try:
806690
red_llm_scoring_target = RAIServiceEvalChatTarget(
807691
logger=self.logger,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -369,7 +369,9 @@ async def _get_attack_objectives(
369369

370370
if custom_objectives:
371371
# Use custom objectives for this risk category
372-
return await self._get_custom_attack_objectives(risk_cat_value, num_objectives, strategy, current_key)
372+
return await self._get_custom_attack_objectives(
373+
risk_cat_value, num_objectives, strategy, current_key, is_agent_target
374+
)
373375
else:
374376
# No custom objectives for this risk category, but risk_categories was specified
375377
# Fetch from service if this risk category is in the requested list
@@ -412,7 +414,12 @@ async def _get_attack_objectives(
412414
)
413415

414416
async def _get_custom_attack_objectives(
415-
self, risk_cat_value: str, num_objectives: int, strategy: str, current_key: tuple
417+
self,
418+
risk_cat_value: str,
419+
num_objectives: int,
420+
strategy: str,
421+
current_key: tuple,
422+
is_agent_target: Optional[bool] = None,
416423
) -> List[str]:
417424
"""Get attack objectives from custom seed prompts."""
418425
attack_objective_generator = self.attack_objective_generator
@@ -439,10 +446,12 @@ async def _get_custom_attack_objectives(
439446
else:
440447
selected_cat_objectives = custom_objectives
441448
self.logger.info(f"Using all {len(custom_objectives)} available objectives for {risk_cat_value}")
442-
449+
target_type_str = "agent" if is_agent_target else "model" if is_agent_target is not None else None
443450
# Handle jailbreak strategy - need to apply jailbreak prefixes to messages
444451
if strategy == "jailbreak":
445452
selected_cat_objectives = await self._apply_jailbreak_prefixes(selected_cat_objectives)
453+
elif strategy == "indirect_jailbreak":
454+
selected_cat_objectives = await self._apply_xpia_prompts(selected_cat_objectives, target_type_str)
446455

447456
# Extract content from selected objectives
448457
selected_prompts = []
@@ -517,6 +526,8 @@ async def _get_rai_attack_objectives(
517526
# Handle jailbreak strategy
518527
if strategy == "jailbreak":
519528
objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
529+
elif strategy == "indirect_jailbreak":
530+
objectives_response = await self._apply_xpia_prompts(objectives_response, target_type_str)
520531

521532
except Exception as e:
522533
self.logger.error(f"Error calling get_attack_objectives: {str(e)}")
@@ -566,8 +577,7 @@ async def _get_rai_attack_objectives(
566577
if strategy == "jailbreak":
567578
objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
568579
elif strategy == "indirect_jailbreak":
569-
# Try agent-type XPIA first, will fallback to model-type XPIA within the method
570-
objectives_response = await self._apply_xpia_prompts(objectives_response, "agent")
580+
objectives_response = await self._apply_xpia_prompts(objectives_response, target_type_str)
571581

572582
# Check if fallback response is also empty
573583
if not objectives_response or (
@@ -894,7 +904,9 @@ def _extract_objective_content(self, selected_objectives: List) -> List[str]:
894904

895905
# Check if any context has agent-specific fields
896906
has_agent_fields = any(
897-
isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
907+
isinstance(ctx, dict)
908+
and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
909+
for ctx in contexts
898910
)
899911

900912
# For contexts without agent fields, append them to the content

0 commit comments

Comments
 (0)