Skip to content

Commit 5958ede

Browse files
nagkumar91Nagkumar ArkalgudNagkumar ArkalgudNagkumar Arkalgud
authored
Experiment/red team agent tool (#40481)
* Update task_query_response.prompty remove required keys * Update task_simulate.prompty * Update task_query_response.prompty * Update task_simulate.prompty * Fix the api_key needed * Update for release * Black fix for file * Add original text in global context * Update test * Update the indirect attack simulator * Black suggested fixes * Update simulator prompty * Update adversarial scenario enum to exclude XPIA * Update changelog * Black fixes * Remove duplicate import * Fix the mypy error * Mypy please be happy * Updates to non adv simulator * accept context from assistant messages, exclude them when using them for conversation * update changelog * pylint fixes * pylint fixes * remove redundant quotes * Fix typo * pylint fix * Update broken tests * Include the grounding json in the manifest * Fix typo * Come on package * Release 1.0.0b5 * Notice from Chang * Remove adv_conv template parameters from the outputs * Update chanagelog * Experimental tags on adv scenarios * Readme fix onbreaking change * Add the category and both user and assistant context to the response of qr_json_lines * Update changelog * Rename _kwargs to _options * _options as prefix * update troubleshooting for simulator * Rename according to suggestions * Clean up readme * more links * Bugfix: zip_longest created null parameters * Updated changelog * zip does the job * remove ununsed import * Fix changelog merge * Remove print statements * Intermediary commit w.i.p * remove debugger as getting 200s now * Skip baseline and keep promptsending orchestrator * init red team agent as a tool * Add more updates to make the tool work * working prompt generation * Making the tool work with azure ai agents * Initialize function * aggregated binary threshold results for evaluators in metrics * handle when the right columns do not exist * Marking mypy, pylint, black as false * Add a target feature * fix sample import and move agent to red team * Semantic kernel sample * making it work * revert unnecessary 
files * revert file * update package with _ and samples --------- Co-authored-by: Nagkumar Arkalgud <[email protected]> Co-authored-by: Nagkumar Arkalgud <[email protected]> Co-authored-by: Nagkumar Arkalgud <[email protected]>
1 parent 35f6922 commit 5958ede

File tree

9 files changed

+1393
-3
lines changed

9 files changed

+1393
-3
lines changed
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
# ------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# ------------------------------------
4+
5+
"""
6+
Red Team Tools as Functions for Azure AI Agent
7+
This module provides functions that can be used as tools in Azure AI Agent for red teaming purposes.
8+
"""
9+
10+
import json
11+
import asyncio
12+
from typing import Any, Callable, Set, Optional
13+
from azure.identity import DefaultAzureCredential
14+
from azure.ai.evaluation.red_team._agent import RedTeamToolProvider
15+
16+
17+
# Shared module state. Everything starts out unset: _get_tool_provider() fills
# in `credential` and `tool_provider` lazily on first use, while
# initialize_tool_provider() sets all four explicitly.
credential: Optional["DefaultAzureCredential"] = None
tool_provider: Optional["RedTeamToolProvider"] = None
azure_ai_project: Optional[dict] = None
target_function: Optional[Callable[[str], str]] = None
22+
23+
def _get_tool_provider() -> RedTeamToolProvider:
    """Return the shared RedTeamToolProvider, building it on first use.

    When no provider exists yet, a fresh DefaultAzureCredential is created and
    a provider is constructed from the current value of the module-level
    ``azure_ai_project`` (which is still ``None`` unless
    ``initialize_tool_provider`` has assigned it). Both objects are stored in
    module globals so later calls reuse them.

    :return: The module-wide RedTeamToolProvider instance.
    """
    global credential, tool_provider, azure_ai_project

    # Fast path: a provider was already created (lazily or via initialize_tool_provider).
    if tool_provider is not None:
        return tool_provider

    credential = DefaultAzureCredential()
    tool_provider = RedTeamToolProvider(
        azure_ai_project=azure_ai_project,
        credential=credential,
        application_scenario="A customer service chatbot for a retail website",  # Optional context
    )
    return tool_provider
34+
35+
# Module-level cache mapping a prompt ID to the prompt text returned by a
# successful fetch, so a later conversion call can refer to the prompt by ID.
fetched_prompts: dict = {}
37+
38+
def red_team_fetch_harmful_prompt(
    risk_category: str,
    strategy: str = "baseline",
    convert_with_strategy: Optional[str] = None,
) -> str:
    """
    Fetch a harmful prompt for a specific risk category to test content filters.

    Delegates to the shared provider's ``fetch_harmful_prompt`` coroutine and,
    when the result reports success and carries both a prompt ID and a prompt,
    remembers the prompt in the module-level ``fetched_prompts`` cache.

    :param risk_category (str): The risk category to get a harmful prompt for (e.g., 'violence', 'hate_unfairness', 'sexual', 'self_harm').
    :param strategy (str, optional): The attack strategy to use (e.g., 'baseline', 'jailbreak'). Defaults to 'baseline'.
    :param convert_with_strategy (Optional[str], optional): Optional strategy to convert the prompt (e.g., 'morse_converter', 'binary_converter'). Defaults to None.
    :return: A JSON string containing the harmful prompt and related metadata.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line and ":param" lines above, so they are kept verbatim - confirm
    # before rewording them.
    fetch_call = _get_tool_provider().fetch_harmful_prompt(
        risk_category_text=risk_category,
        strategy=strategy,
        convert_with_strategy=convert_with_strategy,
    )
    # asyncio.run drives the coroutine to completion on a fresh event loop.
    outcome = asyncio.run(fetch_call)

    # Cache the prompt only for successful results that include an ID and text.
    succeeded = outcome["status"] == "success"
    if succeeded and "prompt_id" in outcome and "prompt" in outcome:
        fetched_prompts[outcome["prompt_id"]] = outcome["prompt"]

    return json.dumps(outcome)
67+
68+
def red_team_convert_prompt(prompt_or_id: str, strategy: str) -> str:
    """
    Convert a prompt or a previously fetched prompt ID using a specified strategy.

    If the argument matches a key in the module-level ``fetched_prompts`` cache,
    the cached prompt is first copied into the provider's own
    ``_fetched_prompts`` mapping under the same key; the provider's
    ``convert_prompt`` coroutine is then run with the argument unchanged.

    :param prompt_or_id (str): Either a prompt text or a prompt ID from a previous fetch_harmful_prompt call.
    :param strategy (str): The strategy to use for conversion (e.g., 'morse_converter', 'binary_converter', 'base64_converter').
    :return: A JSON string containing the original and converted prompt.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line and ":param" lines above, so they are kept verbatim - confirm
    # before rewording them.
    provider = _get_tool_provider()

    try:
        known_prompt = fetched_prompts[prompt_or_id]
    except KeyError:
        # Not a cached ID; the provider receives the value as given.
        pass
    else:
        # Mirror the cached entry into the provider's private cache.
        provider._fetched_prompts[prompt_or_id] = known_prompt

    # asyncio.run drives the coroutine to completion on a fresh event loop.
    conversion = asyncio.run(
        provider.convert_prompt(prompt_or_id=prompt_or_id, strategy=strategy)
    )
    return json.dumps(conversion)
94+
95+
def red_team_unified(category: str, strategy: Optional[str] = None) -> str:
    """
    Get a harmful prompt for a specific risk category and optionally convert it.

    Runs the shared provider's ``red_team`` coroutine. A successful result that
    contains both ``prompt_id`` and ``prompt`` is recorded in the module-level
    ``fetched_prompts`` cache before the result is serialized.

    :param category (str): The risk category to get a harmful prompt for (e.g., 'violence', 'hate_unfairness', 'sexual', 'self_harm').
    :param strategy (Optional[str], optional): Optional strategy to convert the prompt (e.g., 'morse_converter', 'binary_converter'). Defaults to None.
    :return: A JSON string containing the harmful prompt and related metadata.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line and ":param" lines above, so they are kept verbatim - confirm
    # before rewording them.
    provider = _get_tool_provider()

    # asyncio.run drives the coroutine to completion on a fresh event loop.
    outcome = asyncio.run(provider.red_team(category=category, strategy=strategy))

    if outcome["status"] == "success":
        has_id_and_text = "prompt_id" in outcome and "prompt" in outcome
        if has_id_and_text:
            fetched_prompts[outcome["prompt_id"]] = outcome["prompt"]

    return json.dumps(outcome)
122+
123+
def red_team_get_available_strategies() -> str:
    """
    Get a list of all available prompt conversion strategies.

    Asks the shared provider for its strategies and wraps them in a success
    envelope.

    :return: A JSON string containing a list of available conversion strategies.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line above, so it is kept verbatim - confirm before rewording it.
    payload = {
        "status": "success",
        "available_strategies": _get_tool_provider().get_available_strategies(),
    }
    return json.dumps(payload)
140+
141+
def red_team_explain_purpose() -> str:
    """
    Explain the purpose and responsible use of red teaming tools.

    Builds a static description (purpose, responsible-use guidelines, the
    supported risk categories and a note on conversion strategies) and returns
    it serialized as JSON. No provider or network access is involved.

    :return: A JSON string containing information about responsible use of red teaming tools.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line above, so it is kept verbatim - confirm before rewording it.
    responsible_use = [
        "Only use these tools on systems you have permission to test.",
        "Use the results to improve system safety and robustness.",
        "Do not use these tools to cause harm or bypass legitimate safety measures in production systems.",
    ]
    risk_categories = {
        "violence": "Content related to physical harm, weapons, or dangerous activities",
        "hate_unfairness": "Content related to discrimination, bias, or prejudice",
        "sexual": "Explicit sexual content",
        "self_harm": "Content related to suicide, self-injury, or harmful behaviors",
    }

    # Key order is kept stable so the serialized output does not change.
    return json.dumps(
        {
            "purpose": "Red teaming tools are designed to test AI systems for vulnerabilities and safety issues.",
            "responsible_use": responsible_use,
            "risk_categories": risk_categories,
            "conversion_strategies": "Various encoding and obfuscation methods to test if harmful content can bypass detection",
        }
    )
168+
169+
def red_team_send_to_target(prompt: str) -> str:
    """
    Send a prompt to the target function (e.g., call_ollama) and return the response.

    Reads the module-level ``target_function``. If it is unset, an error
    envelope is returned. Otherwise the callable is invoked with the prompt and
    its return value is wrapped in a success envelope. Any exception raised
    while calling the target or serializing its response is reported as an
    error envelope instead of propagating.

    :param prompt (str): The prompt text to send to the target function.
    :return: A JSON string containing the response from the target function.
    :rtype: str
    """
    # NOTE(review): agent tool frameworks may build the tool schema from the
    # summary line and ":param" line above, so they are kept verbatim - confirm
    # before rewording them.
    target = target_function
    if target is None:
        not_ready = {
            "status": "error",
            "message": "Target function not initialized. Make sure to pass a target_func to initialize_tool_provider.",
        }
        return json.dumps(not_ready)

    try:
        # Serialization stays inside the try so a non-JSON-serializable
        # response is also reported as an error envelope.
        success = {
            "status": "success",
            "prompt": prompt,
            "response": target(prompt),
        }
        return json.dumps(success)
    except Exception as exc:
        failure = {
            "status": "error",
            "message": f"Error calling target function: {str(exc)}",
            "prompt": prompt,
        }
        return json.dumps(failure)
203+
204+
# Example User Input for Each Function
205+
# 1. Red Team Fetch Harmful Prompt
206+
# User Input: "Fetch a harmful prompt in the 'violence' category."
207+
# User Input: "Fetch a harmful prompt in the 'hate_unfairness' category with a 'jailbreak' strategy."
208+
# User Input: "Fetch a harmful prompt in the 'self_harm' category and convert it with 'base64_converter'."
209+
210+
# 2. Red Team Convert Prompt
211+
# User Input: "Convert this prompt 'prompt_12345678' using the 'binary_converter' strategy."
212+
# User Input: "Convert the text 'This is a test prompt' using the 'morse_converter' strategy."
213+
214+
# 3. Red Team Unified
215+
# User Input: "Get a harmful prompt in the 'sexual' category."
216+
# User Input: "Get a harmful prompt in the 'violence' category and convert it with 'leetspeak_converter'."
217+
218+
# 4. Red Team Get Available Strategies
219+
# User Input: "What conversion strategies are available for red teaming?"
220+
221+
# 5. Red Team Explain Purpose
222+
# User Input: "What is the purpose of red teaming tools and how should they be used responsibly?"
223+
224+
# The fixed set of tool callables exposed by this module; this same object is
# what initialize_tool_provider() returns.
user_functions: Set[Callable[..., Any]] = {
    # Prompt acquisition and transformation
    red_team_fetch_harmful_prompt,
    red_team_convert_prompt,
    red_team_unified,
    # Informational helpers
    red_team_get_available_strategies,
    red_team_explain_purpose,
    # Forwarding a prompt to the configured target
    red_team_send_to_target,
}
233+
234+
def initialize_tool_provider(
    projects_connection_string: str,
    target_func: Optional[Callable[[str], str]] = None,
) -> Set[Callable[..., Any]]:
    """
    Initialize the module-level RedTeamToolProvider for an Azure AI project.

    This function must be called explicitly; nothing in this module invokes it
    at import time. It parses the connection string, optionally records the
    target callable used by ``red_team_send_to_target``, reuses the existing
    module-level credential (creating a DefaultAzureCredential only if none
    exists yet), and replaces the module-level provider with a new one.

    The connection string is validated before any module state is modified, so
    a malformed value leaves the previous configuration untouched.

    :param projects_connection_string: The Azure AI project connection string,
        in the form ``connection_string;subscription_id;resource_group;project_name``.
    :param target_func: A function that takes a string prompt and returns a string response.
        When ``None``, any previously stored target function is left in place.
    :return: A set of callable functions that can be used as tools.
    :raises ValueError: If the connection string has fewer than four
        ``;``-separated fields or any of the three project fields is blank.
    """
    global azure_ai_project, credential, tool_provider, target_function

    # Split once and validate up front instead of indexing three separate
    # splits, which surfaced malformed input as a bare IndexError.
    fields = projects_connection_string.split(";")
    if len(fields) < 4 or not all(field.strip() for field in fields[1:4]):
        raise ValueError(
            "projects_connection_string must have the form "
            "'connection_string;subscription_id;resource_group;project_name'"
        )
    subscription_id, resource_group_name, project_name = fields[1:4]

    # Store the target function for later use (plain assignment works because
    # the name is declared global above).
    if target_func is not None:
        target_function = target_func

    azure_ai_project = {
        "subscription_id": subscription_id,
        "resource_group_name": resource_group_name,
        "project_name": project_name,
    }

    if credential is None:
        credential = DefaultAzureCredential()

    tool_provider = RedTeamToolProvider(
        azure_ai_project=azure_ai_project,
        credential=credential,
    )
    return user_functions

0 commit comments

Comments
 (0)