
Commit 166c255

updating prompt injection sys prompt and evals

1 parent 206db3e

File tree

9 files changed: +281 −66 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -101,3 +101,6 @@ site/
 __pycache__/
 *.pyc
 .pytest_cache/
+
+# internal
+internal_examples/
(binary file changed: −24 KB; preview not rendered)

docs/ref/checks/prompt_injection_detection.md

Lines changed: 10 additions & 10 deletions
@@ -65,6 +65,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
     "observation": "The assistant is calling get_weather function with location parameter",
     "flagged": false,
     "confidence": 0.1,
+    "evidence": null,
     "threshold": 0.7,
     "user_goal": "What's the weather in Tokyo?",
     "action": [
@@ -81,6 +82,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`observation`**: What the AI action is doing
 - **`flagged`**: Whether the action is misaligned (boolean)
 - **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
+- **`evidence`**: Specific evidence from conversation history that supports the decision (null when aligned)
 - **`threshold`**: The confidence threshold that was configured
 - **`user_goal`**: The tracked user intent from conversation
 - **`action`**: The list of function calls or tool outputs analyzed for alignment
@@ -92,10 +94,8 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 
 This benchmark evaluates model performance on agent conversation traces:
 
-- **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
-- **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
-- **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
-- **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
+- **[AgentDojo dataset](https://github.com/ethz-spylab/agentdojo)**: 1,046 samples generated by running AgentDojo's benchmark script on the workspace, travel, banking, and Slack suites combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
+- **Internal synthetic dataset**: 537 positive cases simulating realistic, multi-turn agent conversation traces
 
 **Example of misaligned conversation:**
 
@@ -113,12 +113,12 @@ This benchmark evaluates model performance on agent conversation traces:
 
 | Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
 |---------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-5 | 0.9604 | 0.998 | 0.995 | 0.963 | 0.431 |
-| gpt-5-mini | 0.9796 | 0.999 | 0.999 | 0.966 | 0.000 |
-| gpt-5-nano | 0.8651 | 0.963 | 0.963 | 0.951 | 0.056 |
-| gpt-4.1 | 0.9846 | 0.998 | 0.998 | 0.998 | 0.000 |
-| gpt-4.1-mini (default) | 0.9728 | 0.995 | 0.995 | 0.995 | 0.000 |
-| gpt-4.1-nano | 0.8677 | 0.974 | 0.974 | 0.974 | 0.000 |
+| gpt-5 | 0.9931 | 0.9992 | 0.9992 | 0.9992 | 0.5845 |
+| gpt-5-mini | 0.9536 | 0.9951 | 0.9951 | 0.9951 | 0.0000 |
+| gpt-5-nano | 0.9283 | 0.9913 | 0.9913 | 0.9717 | 0.0350 |
+| gpt-4.1 | 0.9794 | 0.9973 | 0.9973 | 0.9973 | 0.0000 |
+| gpt-4.1-mini (default) | 0.9865 | 0.9986 | 0.9986 | 0.9986 | 0.0000 |
+| gpt-4.1-nano | 0.9142 | 0.9948 | 0.9948 | 0.9387 | 0.0000 |
 
 **Notes:**
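
The new `evidence` field flows through to callers along with the rest of the `info` dictionary. A minimal consumer sketch (TypeScript; the `GuardrailResult` shape comes from `src/types`, while the import path and function name here are illustrative):

```typescript
import type { GuardrailResult } from './types'; // import path assumed

// Sketch: surface the evidence string when the check trips.
// Per the field descriptions above, `evidence` is null for aligned actions.
function logInjectionEvidence(result: GuardrailResult): void {
  if (result.tripwireTriggered) {
    const { confidence, evidence } = result.info as {
      confidence: number;
      evidence: string | null;
    };
    console.warn(`Prompt injection flagged (confidence ${confidence}): ${evidence}`);
  }
}
```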

src/__tests__/unit/prompt_injection_detection.test.ts

Lines changed: 101 additions & 2 deletions
@@ -22,6 +22,7 @@ const mockOpenAI = {
             flagged: false,
             confidence: 0.2,
             observation: "The LLM action is aligned with the user's goal",
+            evidence: null,
           }),
         },
       },
@@ -65,8 +66,9 @@ describe('Prompt Injection Detection Check', () => {
     const result = await promptInjectionDetectionCheck(contextWithoutHistory, 'test data', config);
 
     expect(result.tripwireTriggered).toBe(false);
-    expect(result.info.observation).toBe('No conversation history available');
+    expect(result.info.observation).toBe('No actionable tool messages to evaluate');
     expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
+    expect(result.info.evidence).toBeNull();
   });
 
   it('should return skip result when only user messages', async () => {
@@ -120,6 +122,103 @@ describe('Prompt Injection Detection Check', () => {
     const result = await promptInjectionDetectionCheck(contextWithError, 'test data', config);
 
     expect(result.tripwireTriggered).toBe(false);
-    expect(result.info.observation).toBe('No conversation history available');
+    expect(result.info.observation).toBe('No actionable tool messages to evaluate');
+  });
+
+  it('should not flag benign weather check', async () => {
+    const result = await promptInjectionDetectionCheck(mockContext, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.confidence).toBeLessThan(config.confidence_threshold);
+    expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
+    expect(result.info.evidence).toBeNull();
+  });
+
+  it('should handle context with previous messages', async () => {
+    const contextWithHistory = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Can you help me?' },
+        { role: 'assistant', content: 'Of course!' },
+        { role: 'user', content: 'What is the weather in Tokyo?' },
+        { role: 'assistant', content: 'I will check the weather for you.' },
+        { type: 'function_call', name: 'get_weather', arguments: '{"location": "Tokyo"}' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(contextWithHistory, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.user_goal).toContain('What is the weather in Tokyo?');
+    expect(result.info.user_goal).toContain('Previous context');
+  });
+
+  it('should process tool outputs correctly', async () => {
+    const contextWithToolOutput = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Check the weather in Paris' },
+        { type: 'function_call', name: 'get_weather', arguments: '{"location": "Paris"}' },
+        { type: 'function_call_output', call_id: 'call_456', output: '{"temperature": 18}' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(contextWithToolOutput, 'test data', config);
+
+    expect(result.info.action).toBeDefined();
+    expect(result.info.action.length).toBeGreaterThan(0);
+  });
+
+  it('should propagate evidence when LLM flags injection', async () => {
+    const flaggedOpenAI = {
+      chat: {
+        completions: {
+          create: async () => ({
+            choices: [
+              {
+                message: {
+                  content: JSON.stringify({
+                    flagged: true,
+                    confidence: 0.9,
+                    observation: 'Detected malicious function call unrelated to user intent',
+                    evidence: 'function call: delete_files with arguments {}',
+                  }),
+                },
+              },
+            ],
+          }),
+        },
+      },
+    };
+
+    const flaggedContext = {
+      ...mockContext,
+      guardrailLlm: flaggedOpenAI as unknown as OpenAI,
+    };
+
+    const result = await promptInjectionDetectionCheck(flaggedContext, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(true);
+    expect(result.info.evidence).toBe('function call: delete_files with arguments {}');
+  });
+
+  it('should handle empty tool output', async () => {
+    const contextWithEmptyOutput = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Test query' },
+        { type: 'function_call', name: 'test_function', arguments: '{}' },
+        { type: 'function_call_output', call_id: 'call_789', output: '' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(
+      contextWithEmptyOutput,
+      'test data',
+      config
+    );
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.action).toBeDefined();
   });
 });

src/checks/hallucination-detection.ts

Lines changed: 29 additions & 24 deletions
@@ -20,6 +20,7 @@
 import { z } from 'zod';
 import { CheckFn, GuardrailResult, GuardrailLLMContext } from '../types';
 import { defaultSpecRegistry } from '../registry';
+import { createErrorResult, LLMErrorOutput } from './llm-base';
 
 /**
  * Configuration schema for hallucination detection.
@@ -196,22 +197,24 @@ export const hallucination_detection: CheckFn<
       parsedJson = JSON.parse(jsonText);
     } catch (error) {
       console.warn('Failed to parse LLM response as JSON:', jsonText);
-      // Return a safe default if JSON parsing fails
-      return {
-        tripwireTriggered: false,
-        info: {
-          guardrail_name: 'Hallucination Detection',
-          flagged: false,
-          confidence: 0.0,
+      // Return a safe default if JSON parsing fails, using the shared error helper
+      const errorOutput: LLMErrorOutput = {
+        flagged: false,
+        confidence: 0.0,
+        info: { error_message: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}` },
+      };
+      return createErrorResult(
+        'Hallucination Detection',
+        errorOutput,
+        candidate,
+        {
+          threshold: config.confidence_threshold,
           reasoning: 'LLM response could not be parsed as JSON',
           hallucination_type: null,
           hallucinated_statements: null,
           verified_statements: null,
-          threshold: config.confidence_threshold,
-          error: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}`,
-          checked_text: candidate,
-        },
-      };
+        }
+      );
     }
 
     const analysis = HallucinationDetectionOutput.parse(parsedJson);
@@ -234,23 +237,25 @@ export const hallucination_detection: CheckFn<
       },
     };
   } catch (error) {
-    // Log unexpected errors and return safe default
+    // Log unexpected errors and return a safe default using the shared error helper
     console.error('Unexpected error in hallucination_detection:', error);
-    return {
-      tripwireTriggered: false,
-      info: {
-        guardrail_name: 'Hallucination Detection',
-        flagged: false,
-        confidence: 0.0,
+    const errorOutput: LLMErrorOutput = {
+      flagged: false,
+      confidence: 0.0,
+      info: { error_message: error instanceof Error ? error.message : String(error) },
+    };
+    return createErrorResult(
+      'Hallucination Detection',
+      errorOutput,
+      candidate,
+      {
+        threshold: config.confidence_threshold,
        reasoning: `Analysis failed: ${error instanceof Error ? error.message : String(error)}`,
        hallucination_type: null,
        hallucinated_statements: null,
        verified_statements: null,
-        threshold: config.confidence_threshold,
-        error: error instanceof Error ? error.message : String(error),
-        checked_text: candidate, // Hallucination Detection doesn't modify text, pass through unchanged
-      },
-    };
+      }
+    );
   }
 };

src/checks/llm-base.ts

Lines changed: 31 additions & 0 deletions
@@ -62,6 +62,37 @@ export const LLMErrorOutput = LLMOutput.extend({
 
 export type LLMErrorOutput = z.infer<typeof LLMErrorOutput>;
 
+/**
+ * Create a standardized error result for LLM-based guardrails.
+ *
+ * This helper provides a consistent way to handle errors across all LLM-based checks,
+ * ensuring uniform error reporting and preventing tripwire triggers on execution failures.
+ *
+ * @param guardrailName - Name of the guardrail that encountered the error.
+ * @param analysis - LLMErrorOutput containing error information.
+ * @param checkedText - The original text that was being checked.
+ * @param additionalInfo - Optional additional information to include in the result.
+ * @returns GuardrailResult with tripwireTriggered=false and error information.
+ */
+export function createErrorResult(
+  guardrailName: string,
+  analysis: LLMErrorOutput,
+  checkedText: string,
+  additionalInfo: Record<string, unknown> = {}
+): GuardrailResult {
+  return {
+    tripwireTriggered: false,
+    info: {
+      guardrail_name: guardrailName,
+      flagged: analysis.flagged,
+      confidence: analysis.confidence,
+      checked_text: checkedText,
+      ...analysis.info,
+      ...additionalInfo,
+    },
+  };
+}
+
 /**
  * Assemble a complete LLM prompt with instructions and response schema.
  *
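
For orientation, here is a minimal sketch of how a check's catch block can delegate to this helper, mirroring the hallucination-detection refactor above (the guardrail name and the `threshold` field are illustrative):

```typescript
import { createErrorResult, LLMErrorOutput } from './llm-base';

// Hypothetical catch-block usage: wrap the failure in an LLMErrorOutput and
// let the shared helper shape the GuardrailResult. tripwireTriggered stays
// false, so execution failures never trip the guardrail.
function exampleCheckErrorResult(error: unknown, candidate: string, threshold: number) {
  const errorOutput: LLMErrorOutput = {
    flagged: false,
    confidence: 0.0,
    info: { error_message: error instanceof Error ? error.message : String(error) },
  };
  // Any extra fields passed here are merged into the result's `info` dictionary.
  return createErrorResult('Example Check', errorOutput, candidate, { threshold });
}
```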
