Red team: ensure proper XPIA prompt application for model targets (#43707)

slister1001 · Copilot · web-flow · commit 50d24248596d · 2025-10-31T14:20:25.000-04:00
* ensure xpia prompt application for model targets

* updates

* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* updates

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_callback_chat_target.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_callback_chat_target.py
@@ -63,7 +63,9 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P
 
                 # Check if any context has agent-specific fields for logging
                 has_agent_fields = any(
-                    isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                    isinstance(ctx, dict)
+                    and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
+                    for ctx in contexts
                 )
 
                 if has_agent_fields:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_orchestrator_manager.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_orchestrator_manager.py
@@ -271,7 +271,9 @@ async def _prompt_sending_orchestrator(
 
                 # Check if any context has agent-specific fields (context_type, tool_name)
                 has_agent_fields = any(
-                    isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                    isinstance(ctx, dict)
+                    and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
+                    for ctx in contexts
                 )
 
                 # Build context_dict to pass via memory labels
@@ -284,46 +286,6 @@ async def _prompt_sending_orchestrator(
                     else None
                 )
 
-                # Initialize processed_prompt with the original prompt as default
-                processed_prompt = prompt
-
-                # Determine how to handle the prompt based on target type and context fields
-                if isinstance(chat_target, _CallbackChatTarget):
-                    # CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
-                    if contexts and not has_agent_fields:
-                        # For contexts without agent fields, the prompt already has context embedded
-                        # (done in _extract_objective_content), so just use it as-is
-                        processed_prompt = prompt
-                        self.logger.debug(
-                            f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
-                        )
-                    else:
-                        # Agent fields present - prompt is clean, contexts have structure
-                        processed_prompt = prompt
-                        tool_names = [
-                            ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                        ]
-                        self.logger.debug(
-                            f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
-                        )
-                else:
-                    # Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
-                    if has_agent_fields:
-                        # Agent target with structured context - don't embed in prompt
-                        processed_prompt = prompt
-                        tool_names = [
-                            ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                        ]
-                        self.logger.debug(
-                            f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
-                        )
-                    elif contexts:
-                        # Model target without agent fields - embed context in prompt
-                        # Note: The prompt already has context embedded from _extract_objective_content
-                        # But for non-CallbackChatTarget, we may need additional wrapping
-                        processed_prompt = prompt
-                        self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
-
                 try:
                     # Create retry-enabled function using the reusable decorator
                     @network_retry_decorator(
@@ -339,7 +301,7 @@ async def send_prompt_with_retry():
                             memory_labels["risk_sub_type"] = risk_sub_type
                         return await asyncio.wait_for(
                             orchestrator.send_prompts_async(
-                                prompt_list=[processed_prompt],
+                                prompt_list=[prompt],
                                 memory_labels=memory_labels,
                             ),
                             timeout=calculated_timeout,
@@ -514,46 +476,6 @@ async def _multi_turn_orchestrator(
                     ctx.get("content", "") if isinstance(ctx, dict) else str(ctx) for ctx in contexts
                 )
 
-            # Initialize processed_prompt with the original prompt as default
-            processed_prompt = prompt
-
-            # Determine how to handle the prompt based on target type and context fields
-            if isinstance(chat_target, _CallbackChatTarget):
-                # CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
-                if contexts and not has_agent_fields:
-                    # For contexts without agent fields, the prompt already has context embedded
-                    # (done in _extract_objective_content), so just use it as-is
-                    processed_prompt = prompt
-                    self.logger.debug(
-                        f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
-                    )
-                else:
-                    # Agent fields present - prompt is clean, contexts have structure
-                    processed_prompt = prompt
-                    tool_names = [
-                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                    ]
-                    self.logger.debug(
-                        f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
-                    )
-            else:
-                # Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
-                if has_agent_fields:
-                    # Agent target with structured context - don't embed in prompt
-                    processed_prompt = prompt
-                    tool_names = [
-                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                    ]
-                    self.logger.debug(
-                        f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
-                    )
-                elif contexts:
-                    # Model target without agent fields - embed context in prompt
-                    # Note: The prompt already has context embedded from _extract_objective_content
-                    # But for non-CallbackChatTarget, we may need additional wrapping
-                    processed_prompt = prompt
-                    self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
-
             try:
                 azure_rai_service_scorer = AzureRAIServiceTrueFalseScorer(
                     client=self.generated_rai_client,
@@ -741,7 +663,9 @@ async def _crescendo_orchestrator(
 
             # Check if any context has agent-specific fields (context_type, tool_name)
             has_agent_fields = any(
-                isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                isinstance(ctx, dict)
+                and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
+                for ctx in contexts
             )
 
             # Build context_dict to pass via memory labels
@@ -762,46 +686,6 @@ async def _crescendo_orchestrator(
                     ctx.get("content", "") if isinstance(ctx, dict) else str(ctx) for ctx in contexts
                 )
 
-            # Initialize processed_prompt with the original prompt as default
-            processed_prompt = prompt
-
-            # Determine how to handle the prompt based on target type and context fields
-            if isinstance(chat_target, _CallbackChatTarget):
-                # CallbackChatTarget: Always pass contexts via context_dict, embed in prompt content
-                if contexts and not has_agent_fields:
-                    # For contexts without agent fields, the prompt already has context embedded
-                    # (done in _extract_objective_content), so just use it as-is
-                    processed_prompt = prompt
-                    self.logger.debug(
-                        f"CallbackChatTarget: Prompt has embedded context, passing {len(contexts)} context source(s) in context_dict"
-                    )
-                else:
-                    # Agent fields present - prompt is clean, contexts have structure
-                    processed_prompt = prompt
-                    tool_names = [
-                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                    ]
-                    self.logger.debug(
-                        f"CallbackChatTarget: Passing {len(contexts)} structured context(s) with agent fields, tool_names={tool_names}"
-                    )
-            else:
-                # Non-CallbackChatTarget: Embed contexts in the actual PyRIT message
-                if has_agent_fields:
-                    # Agent target with structured context - don't embed in prompt
-                    processed_prompt = prompt
-                    tool_names = [
-                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
-                    ]
-                    self.logger.debug(
-                        f"Non-CallbackChatTarget with agent fields: {len(contexts)} context source(s), tool_names={tool_names}"
-                    )
-                elif contexts:
-                    # Model target without agent fields - embed context in prompt
-                    # Note: The prompt already has context embedded from _extract_objective_content
-                    # But for non-CallbackChatTarget, we may need additional wrapping
-                    processed_prompt = prompt
-                    self.logger.debug(f"Non-CallbackChatTarget: Using prompt with embedded context")
-
             try:
                 red_llm_scoring_target = RAIServiceEvalChatTarget(
                     logger=self.logger,
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
@@ -369,7 +369,9 @@ async def _get_attack_objectives(
 
             if custom_objectives:
                 # Use custom objectives for this risk category
-                return await self._get_custom_attack_objectives(risk_cat_value, num_objectives, strategy, current_key)
+                return await self._get_custom_attack_objectives(
+                    risk_cat_value, num_objectives, strategy, current_key, is_agent_target
+                )
             else:
                 # No custom objectives for this risk category, but risk_categories was specified
                 # Fetch from service if this risk category is in the requested list
@@ -412,7 +414,12 @@ async def _get_attack_objectives(
             )
 
     async def _get_custom_attack_objectives(
-        self, risk_cat_value: str, num_objectives: int, strategy: str, current_key: tuple
+        self,
+        risk_cat_value: str,
+        num_objectives: int,
+        strategy: str,
+        current_key: tuple,
+        is_agent_target: Optional[bool] = None,
     ) -> List[str]:
         """Get attack objectives from custom seed prompts."""
         attack_objective_generator = self.attack_objective_generator
@@ -439,10 +446,12 @@ async def _get_custom_attack_objectives(
         else:
             selected_cat_objectives = custom_objectives
             self.logger.info(f"Using all {len(custom_objectives)} available objectives for {risk_cat_value}")
-
+        target_type_str = "agent" if is_agent_target else "model" if is_agent_target is not None else None
         # Handle jailbreak strategy - need to apply jailbreak prefixes to messages
         if strategy == "jailbreak":
             selected_cat_objectives = await self._apply_jailbreak_prefixes(selected_cat_objectives)
+        elif strategy == "indirect_jailbreak":
+            selected_cat_objectives = await self._apply_xpia_prompts(selected_cat_objectives, target_type_str)
 
         # Extract content from selected objectives
         selected_prompts = []
@@ -517,6 +526,8 @@ async def _get_rai_attack_objectives(
             # Handle jailbreak strategy
             if strategy == "jailbreak":
                 objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
+            elif strategy == "indirect_jailbreak":
+                objectives_response = await self._apply_xpia_prompts(objectives_response, target_type_str)
 
         except Exception as e:
             self.logger.error(f"Error calling get_attack_objectives: {str(e)}")
@@ -566,8 +577,7 @@ async def _get_rai_attack_objectives(
                     if strategy == "jailbreak":
                         objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
                     elif strategy == "indirect_jailbreak":
-                        # Try agent-type XPIA first, will fallback to model-type XPIA within the method
-                        objectives_response = await self._apply_xpia_prompts(objectives_response, "agent")
+                        objectives_response = await self._apply_xpia_prompts(objectives_response, target_type_str)
 
                     # Check if fallback response is also empty
                     if not objectives_response or (
@@ -894,7 +904,9 @@ def _extract_objective_content(self, selected_objectives: List) -> List[str]:
 
                     # Check if any context has agent-specific fields
                     has_agent_fields = any(
-                        isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                        isinstance(ctx, dict)
+                        and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)
+                        for ctx in contexts
                     )
 
                     # For contexts without agent fields, append them to the content

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,9 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P`
`63`	`63`
`64`	`64`	`# Check if any context has agent-specific fields for logging`
`65`	`65`	`has_agent_fields = any(`
`66`		`- isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts`
	`66`	`+ isinstance(ctx, dict)`
	`67`	`+ and ("context_type" in ctx and "tool_name" in ctx and ctx["tool_name"] is not None)`
	`68`	`+ for ctx in contexts`
`67`	`69`	`)`
`68`	`70`
`69`	`71`	`if has_agent_fields:`