-
Notifications
You must be signed in to change notification settings - Fork 283
Expand file tree
/
Copy pathunicode-injection-scanner.sh
More file actions
executable file
·141 lines (126 loc) · 4.65 KB
/
unicode-injection-scanner.sh
File metadata and controls
executable file
·141 lines (126 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
# =============================================================================
# Unicode Injection Scanner Hook
# =============================================================================
# Event: PreToolUse (runs before Edit/Write operations)
# Purpose: Detect invisible Unicode characters used for prompt injection
#
# This hook detects evasion techniques that embed invisible instructions:
# - Zero-width characters (U+200B-U+200D, U+FEFF)
# - RTL/LTR override (U+202A-U+202E, U+2066-U+2069)
# - ANSI escape sequences (terminal injection)
# - Null bytes (truncation attacks)
# - Tag characters (U+E0000-U+E007F)
#
# Installation:
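# Add to .claude/settings.json (each hook entry is an object with a
# "type" and a "command", not a bare string):
# {
#   "hooks": {
#     "PreToolUse": [{
#       "matcher": "Edit|Write",
#       "hooks": [{
#         "type": "command",
#         "command": "bash examples/hooks/bash/unicode-injection-scanner.sh"
#       }]
#     }]
#   }
# }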
#
# Exit codes:
# 0 = allow (no injection detected)
# 2 = block (injection detected, stderr message shown to Claude)
#
# References:
# - CVE-2025-53109/53110: Unicode-based sandbox escape
# - Arxiv 2509.22040: Prompt Injection on Coding Assistants
# =============================================================================
set -euo pipefail

# Slurp the hook payload (JSON) from stdin, then pull out the tool name and
# the tool's argument object with jq.
INPUT=$(cat)
TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
TOOL_INPUT=$(echo "$INPUT" | jq -r '.tool_input // empty')

# Choose the field that carries the text about to be written:
#   Write -> .content, Edit -> .new_string.
# Any other tool is not scanned and is allowed straight through.
CONTENT=""
case "$TOOL_NAME" in
  Write)
    content_filter='.content // empty'
    ;;
  Edit)
    content_filter='.new_string // empty'
    ;;
  *)
    exit 0
    ;;
esac
CONTENT=$(echo "$TOOL_INPUT" | jq -r "$content_filter")

# Nothing to scan: allow.
if [[ -z "$CONTENT" ]]; then
  exit 0
fi
# === ZERO-WIDTH CHARACTERS ===
# U+200B Zero Width Space       (UTF-8: E2 80 8B)
# U+200C Zero Width Non-Joiner  (UTF-8: E2 80 8C)
# U+200D Zero Width Joiner      (UTF-8: E2 80 8D)
# U+FEFF Byte Order Mark        (UTF-8: EF BB BF; flagged anywhere in the text)

#######################################
# Report whether a string contains a zero-width character.
#
# Matching is done on the raw UTF-8 bytes under LC_ALL=C with plain ERE, so
# the result does not depend on the caller's locale or on grep having PCRE
# (-P) support; a failing grep would otherwise be read as "no match".
# The text is fed through a here-string instead of `echo ... | grep -q`:
# with `set -o pipefail`, grep -q quitting at the first match can kill the
# writer with SIGPIPE on large input and make the whole pipeline report
# failure, hiding a real match.
#
# Arguments: $1 - text to scan
# Returns:   0 if a zero-width character is present, non-zero otherwise
#######################################
has_zero_width_chars() {
  LC_ALL=C grep -qE $'\xe2\x80[\x8b-\x8d]|\xef\xbb\xbf' <<<"$1"
}

if has_zero_width_chars "$CONTENT"; then
  echo "BLOCKED: Zero-width characters detected (U+200B-U+200D or BOM). These can hide malicious instructions." >&2
  exit 2
fi
# === BIDIRECTIONAL TEXT OVERRIDE ===
# U+202A Left-to-Right Embedding     (UTF-8: E2 80 AA)
# U+202B Right-to-Left Embedding     (UTF-8: E2 80 AB)
# U+202C Pop Directional Formatting  (UTF-8: E2 80 AC)
# U+202D Left-to-Right Override      (UTF-8: E2 80 AD)
# U+202E Right-to-Left Override      (UTF-8: E2 80 AE)
#        (most dangerous - reverses text display)
# U+2066-U+2069 Isolate controls     (UTF-8: E2 81 A6 .. E2 81 A9)

#######################################
# Report whether a string contains a bidirectional embedding, override or
# isolate control character.
#
# Matching is done on the raw UTF-8 bytes under LC_ALL=C with plain ERE, so
# it works regardless of the caller's locale and without grep -P support
# (a grep error would otherwise be indistinguishable from "no match").
# Input comes from a here-string rather than `echo ... | grep -q`, because
# under `set -o pipefail` an early grep -q exit can SIGPIPE the writer on
# large input and turn a real match into a failed pipeline.
#
# Arguments: $1 - text to scan
# Returns:   0 if a bidi control is present, non-zero otherwise
#######################################
has_bidi_controls() {
  LC_ALL=C grep -qE $'\xe2\x80[\xaa-\xae]|\xe2\x81[\xa6-\xa9]' <<<"$1"
}

if has_bidi_controls "$CONTENT"; then
  echo "BLOCKED: Bidirectional text override detected (U+202A-U+202E). These can disguise malicious commands." >&2
  exit 2
fi
# === ANSI ESCAPE SEQUENCES ===
# \x1b[ CSI (Control Sequence Introducer) - terminal control
# \x1b] OSC (Operating System Command)
# \x1b( Character set selection
# These can manipulate terminal display or execute commands

#######################################
# Report whether a string contains ESC followed by '[', ']' or '('.
#
# The bracket expression [][(] is the POSIX spelling of "one of ] [ (".
# Input comes from a here-string rather than `echo ... | grep -q`: under
# `set -o pipefail`, grep -q exiting at the first match can SIGPIPE the
# writer on large input, so the pipeline reports failure and the match is
# missed.
#
# Arguments: $1 - text to scan
# Returns:   0 if such an escape introducer is present, non-zero otherwise
#######################################
has_ansi_escape() {
  LC_ALL=C grep -qE $'\x1b[][(]' <<<"$1"
}

if has_ansi_escape "$CONTENT"; then
  echo "BLOCKED: ANSI escape sequence detected. These can manipulate terminal display." >&2
  exit 2
fi
# === NULL BYTES ===
# \x00 can truncate strings and bypass security checks
if echo "$CONTENT" | grep -qP '\x00'; then
echo "BLOCKED: Null byte detected. These can cause string truncation attacks." >&2
exit 2
fi
# === TAG CHARACTERS ===
# U+E0000-U+E007F are invisible "tag" characters
# Sometimes used to embed hidden data
# UTF-8: F3 A0 80 80 .. F3 A0 81 BF, i.e. every sequence that starts with
# F3 A0 80 or F3 A0 81.

#######################################
# Report whether a string contains a Unicode tag character.
#
# Matching is done on the raw UTF-8 bytes under LC_ALL=C with plain ERE, so
# it works regardless of the caller's locale and without grep -P support
# (a grep error would otherwise be read as "no match").
# Input comes from a here-string rather than `echo ... | grep -q`, because
# under `set -o pipefail` an early grep -q exit can SIGPIPE the writer on
# large input and turn a real match into a failed pipeline.
#
# Arguments: $1 - text to scan
# Returns:   0 if a tag character is present, non-zero otherwise
#######################################
has_tag_chars() {
  LC_ALL=C grep -qE $'\xf3\xa0[\x80\x81]' <<<"$1"
}

if has_tag_chars "$CONTENT"; then
  echo "BLOCKED: Unicode tag characters detected (U+E0000-E007F). These can embed invisible data." >&2
  exit 2
fi
# === OVERLONG UTF-8 SEQUENCES ===
# Detect potential overlong encodings (e.g., encoding '/' as C0 AF instead of 2F)
# These can bypass path filters
# Check for C0 or C1 bytes followed by 80-BF (overlong 2-byte sequences)

#######################################
# Report whether a string contains an overlong two-byte UTF-8 sequence.
#
# The match must be on bytes, so grep is forced into the C locale. In a
# UTF-8 locale a PCRE class such as [\xC0-\xC1] means the code points
# U+00C0/U+00C1, which flags ordinary text like "À" followed by a Latin-1
# symbol and never matches the raw bytes C0/C1.
# Input comes from a here-string rather than `echo ... | grep -q`, because
# under `set -o pipefail` an early grep -q exit can SIGPIPE the writer on
# large input and turn a real match into a failed pipeline.
#
# NOTE(review): the scanned text has already been decoded by jq, which may
# have normalised invalid byte sequences - confirm whether the raw hook
# input should be scanned as well.
#
# Arguments: $1 - text to scan
# Returns:   0 if a C0/C1 lead byte followed by a continuation byte is
#            present, non-zero otherwise
#######################################
has_overlong_utf8() {
  LC_ALL=C grep -qE $'[\xc0\xc1][\x80-\xbf]' <<<"$1"
}

if has_overlong_utf8 "$CONTENT"; then
  echo "BLOCKED: Overlong UTF-8 sequence detected. These can bypass security filters." >&2
  exit 2
fi
# === HOMOGLYPHS WARNING ===
# Detect Cyrillic characters that look like Latin (confusables)
# Common in typosquatting and filter bypass
# а (U+0430) vs a, е (U+0435) vs e, о (U+043E) vs o, etc.

#######################################
# Report whether a string contains likely homoglyphs:
#   - any of the Cyrillic look-alikes U+0430 U+0435 U+043E U+0440 U+0441
#     U+0445 (UTF-8: D0 B0, D0 B5, D0 BE, D1 80, D1 81, D1 85), or
#   - a Greek letter in U+0391-U+03C9 (UTF-8: CE 91..CE BF, CF 80..CF 89)
#     together with at least one ASCII Latin letter.
#
# Matching is done on the raw UTF-8 bytes under LC_ALL=C with plain ERE, so
# it works regardless of the caller's locale and without grep -P support.
# Input comes from here-strings rather than `echo ... | grep -q`, because
# under `set -o pipefail` an early grep -q exit can SIGPIPE the writer on
# large input and turn a real match into a failed pipeline.
#
# Arguments: $1 - text to scan
# Returns:   0 if homoglyph candidates are present, 1 otherwise
#######################################
has_confusable_letters() {
  local text=$1

  if LC_ALL=C grep -qE $'\xd0[\xb0\xb5\xbe]|\xd1[\x80\x81\x85]' <<<"$text"; then
    return 0
  fi

  # Greek mixed with Latin
  if LC_ALL=C grep -qE $'\xce[\x91-\xbf]|\xcf[\x80-\x89]' <<<"$text" \
    && LC_ALL=C grep -q '[a-zA-Z]' <<<"$text"; then
    return 0
  fi

  return 1
}

HOMOGLYPHS_FOUND=false
if has_confusable_letters "$CONTENT"; then
  HOMOGLYPHS_FOUND=true
fi

if [[ "$HOMOGLYPHS_FOUND" == "true" ]]; then
  # Warning only - could be legitimate multilingual content
  echo '{"systemMessage": "Warning: Potential homoglyph characters detected (Cyrillic/Greek mixed with Latin). Verify this is not an attempt to bypass filters."}'
fi
# All checks passed: exit status 0 means "allow", so the pending Edit/Write
# proceeds (homoglyph findings above only warn and still end up here).
exit 0