forked from FlorianBruniaux/claude-code-ultimate-guide
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprompt-injection-detector.sh
More file actions
executable file
·214 lines (190 loc) · 5.9 KB
/
prompt-injection-detector.sh
File metadata and controls
executable file
·214 lines (190 loc) · 5.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/bin/bash
# Hook: PreToolUse - Detect prompt injection attempts
# Exit 0 = allow, Exit 2 = block (stderr message shown to Claude)
#
# This hook detects common prompt injection patterns that attempt to
# manipulate Claude's behavior through malicious instructions.
#
# Place in: .claude/hooks/prompt-injection-detector.sh
# Register in: .claude/settings.json under PreToolUse event
set -e
# Read JSON from stdin
INPUT=$(cat)
TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
TOOL_INPUT=$(echo "$INPUT" | jq -r '.tool_input // empty')
# Only check tools that handle user-provided text content
case "$TOOL_NAME" in
Bash|Write|Edit|WebFetch)
;;
*)
exit 0
;;
esac
# Extract content to analyze based on tool type
CONTENT=""
case "$TOOL_NAME" in
Bash)
CONTENT=$(echo "$TOOL_INPUT" | jq -r '.command // empty')
;;
Write|Edit)
CONTENT=$(echo "$TOOL_INPUT" | jq -r '.content // .new_string // empty')
;;
WebFetch)
CONTENT=$(echo "$TOOL_INPUT" | jq -r '.url // empty')
;;
esac
# Skip if no content to analyze
[[ -z "$CONTENT" ]] && exit 0
# Convert to lowercase for case-insensitive matching
CONTENT_LOWER=$(echo "$CONTENT" | tr '[:upper:]' '[:lower:]')
# === ROLE OVERRIDE PATTERNS ===
# Attempts to override Claude's instructions or identity
ROLE_OVERRIDE_PATTERNS=(
"ignore previous instructions"
"ignore all previous"
"ignore your instructions"
"disregard previous"
"disregard your instructions"
"forget your instructions"
"forget everything"
"you are now"
"act as if"
"pretend you are"
"pretend to be"
"from now on you"
"new instructions:"
"override:"
"system prompt:"
)
for pattern in "${ROLE_OVERRIDE_PATTERNS[@]}"; do
if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
echo "BLOCKED: Prompt injection detected - role override attempt: '$pattern'" >&2
exit 2
fi
done
# === JAILBREAK PATTERNS ===
# Known jailbreak techniques and bypass attempts
JAILBREAK_PATTERNS=(
"dan mode"
"developer mode"
"jailbreak"
"do anything now"
"bypass"
"no restrictions"
"without restrictions"
"unrestricted mode"
"god mode"
"sudo mode"
"admin mode"
"debug mode"
"maintenance mode"
)
for pattern in "${JAILBREAK_PATTERNS[@]}"; do
if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
echo "BLOCKED: Prompt injection detected - jailbreak attempt: '$pattern'" >&2
exit 2
fi
done
# === DELIMITER INJECTION ===
# Attempts to inject fake system messages or close prompt sections
DELIMITER_PATTERNS=(
"</system>"
"<|endoftext|>"
"<|im_end|>"
"[/INST]"
"[INST]"
"<<SYS>>"
"<</SYS>>"
"### System:"
"### Human:"
"### Assistant:"
"```system"
"SYSTEM:"
)
for pattern in "${DELIMITER_PATTERNS[@]}"; do
if [[ "$CONTENT" == *"$pattern"* ]]; then
echo "BLOCKED: Prompt injection detected - delimiter injection: '$pattern'" >&2
exit 2
fi
done
# === AUTHORITY IMPERSONATION ===
# Claims of special authority or permissions
AUTHORITY_PATTERNS=(
"anthropic employee"
"anthropic staff"
"i am your creator"
"i am your developer"
"i have admin access"
"authorized to bypass"
"emergency override"
"security exception"
"this is a test"
"testing mode"
)
for pattern in "${AUTHORITY_PATTERNS[@]}"; do
if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
echo "BLOCKED: Prompt injection detected - authority impersonation: '$pattern'" >&2
exit 2
fi
done
# === BASE64 ENCODED INSTRUCTIONS ===
# Detect potential base64-encoded payloads (heuristic)
# Look for long base64-like strings that might contain instructions
if echo "$CONTENT" | grep -qE '[A-Za-z0-9+/]{50,}={0,2}'; then
# Try to decode and check for injection patterns
DECODED=$(echo "$CONTENT" | grep -oE '[A-Za-z0-9+/]{50,}={0,2}' | head -1 | base64 -d 2>/dev/null || true)
DECODED_LOWER=$(echo "$DECODED" | tr '[:upper:]' '[:lower:]')
for pattern in "ignore" "override" "system" "jailbreak" "dan mode"; do
if [[ "$DECODED_LOWER" == *"$pattern"* ]]; then
echo "BLOCKED: Prompt injection detected - encoded payload containing: '$pattern'" >&2
exit 2
fi
done
fi
# === ANSI ESCAPE SEQUENCES ===
# Terminal manipulation via escape codes (CVE-related)
# \x1b[ CSI, \x1b] OSC, \x1b( charset selection
if echo "$CONTENT" | grep -qE $'\x1b\[|\x1b\]|\x1b\('; then
echo "BLOCKED: ANSI escape sequence detected - potential terminal injection" >&2
exit 2
fi
# === NULL BYTE INJECTION ===
# Null bytes can truncate strings and bypass security checks
if echo "$CONTENT" | grep -qP '\x00'; then
echo "BLOCKED: Null byte detected - potential truncation attack" >&2
exit 2
fi
# === NESTED COMMAND EXECUTION ===
# Detect $() and backtick command substitution that could bypass denylists
# This catches patterns like: $(curl evil.com | bash) or `rm -rf /`
NESTED_CMD_PATTERNS=(
'\$\([^)]*\b(curl|wget|bash|sh|nc|python|ruby|perl|php)\b'
'`[^`]*\b(curl|wget|bash|sh|nc|python|ruby|perl|php)\b'
'\$\([^)]*\b(rm|dd|mkfs|chmod|chown)\b'
'`[^`]*\b(rm|dd|mkfs|chmod|chown)\b'
)
for pattern in "${NESTED_CMD_PATTERNS[@]}"; do
if echo "$CONTENT" | grep -qE "$pattern"; then
echo "BLOCKED: Nested command execution detected - potential bypass attempt" >&2
exit 2
fi
done
# === CONTEXT MANIPULATION ===
# Attempts to manipulate the conversation context
CONTEXT_PATTERNS=(
"in the previous message"
"as i mentioned earlier"
"you agreed to"
"you already said"
"you promised"
"remember when you"
"our agreement was"
)
for pattern in "${CONTEXT_PATTERNS[@]}"; do
if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
# Warning only - these could be legitimate
echo '{"systemMessage": "Warning: Detected potential context manipulation pattern. Verify legitimacy."}'
fi
done
# Allow by default
exit 0