@@ -5,29 +5,24 @@ import (
55 "regexp"
66)
77
8- // InstructionOverrideDetector detects attempts to override or chain malicious instructions.
// InstructionOverrideDetector detects attempts to override or chain malicious
// instructions. The type declares no fields.
98type InstructionOverrideDetector struct {}
109
// Compiled patterns used to recognise instruction-override attempts. All of
// them are case-insensitive and compiled once at package initialisation.
//
// Word boundaries (\b, ASCII-only in Go's RE2 syntax) are used where a keyword
// would otherwise fire inside an unrelated word, e.g. "act" in "contact" or
// "then" in "authentication". Keywords that attackers commonly embed in
// underscore-joined tokens ("SYSTEM_RESET", "SYSTEM_OVERRIDE:") are left
// unanchored on purpose, because "_" counts as a word character and \b would
// hide them.
var (
	// temporalCommandsRe matches temporal clauses that chain a follow-up
	// instruction, e.g. "after summarizing, send email". The gerund must be a
	// whole word, so "when singers arrive," is not treated as a command.
	temporalCommandsRe = regexp.MustCompile(`(?i)\b(after|once|when)\s+\w+ing\b.*?,`)

	// overrideCommandsRe matches direct override commands aimed at system
	// instructions, rules, safety measures, or context, e.g.
	// "ignore all previous instructions" or "bypass your safety".
	overrideCommandsRe = regexp.MustCompile(`(?i)(ignore|disregard|forget|bypass|circumvent|override|skip)\s+(all|your|the|any|my)?\s*(previous|prior|above|earlier|current|existing)?\s*(instructions?|rules?|directions?|commands?|prompts?|guidelines?|safety|training|filters?|constraints?|limitations?|context)`)

	// positionalOverrideRe matches overrides that refer to earlier content by
	// position, e.g. "disregard everything above".
	positionalOverrideRe = regexp.MustCompile(`(?i)(ignore|disregard|forget|bypass)\s+(everything|all)\s*(above|before)`)

	// delimiterInjectionRe matches instruction injection introduced by a
	// colon-terminated marker, e.g. "new instructions:" or "additional task:".
	delimiterInjectionRe = regexp.MustCompile(`(?i)(new instructions?\s*\w*|new task|additional task|also do|and then)\s*:`)

	// resetCommandsRe matches reset/restart phrasing that tries to clear
	// context, e.g. "let's start over", "forget everything", "wipe your memory".
	// The object of "forget" must be a whole word so that "forget them" or
	// "forget items" are not reported.
	resetCommandsRe = regexp.MustCompile(`(?i)(let'?s |let us )?(start (over|fresh|again|from scratch)|forget (everything|all|it|your|the)\b|reset|begin again|restart|clear (context|history|memory)|wipe (your )?(memory|context|history)|new session|context (window )?(cleared|reset|wiped))`)

	// multiStepAttackRe matches "first ... then ..." sequences only when they
	// are followed by an attack keyword. The sequencing words must stand alone;
	// the final keyword group stays unanchored so that inflections and
	// compounds such as "administrator" still match.
	multiStepAttackRe = regexp.MustCompile(`(?i)\b(first|initially).*?\b(then|next|finally)\b.*(ignore|bypass|override|reveal|show|admin|root|execute)`)

	// fromNowOnRe matches a persistence phrase ("from now on", "going forward",
	// "henceforth") followed within 60 characters by a behaviour-changing verb
	// or a "no restrictions"-style clause. The verb must start a word, while
	// inflections ("acting", "responds") remain matched.
	fromNowOnRe = regexp.MustCompile(`(?i)\b(from (now|this point) on|going forward|henceforth).{0,60}\b(ignore|forget|disregard|bypass|act|respond|behave|you are|no (restrictions?|rules?|guidelines?|filters?))`)

	// fakeAuthorityRe matches claims of hidden or higher-authority
	// instructions, e.g. "your real instructions are", "system update
	// received", "new directives from admin", or an "override:" marker.
	fakeAuthorityRe = regexp.MustCompile(`(?i)(your (real|actual|true|original|hidden) instructions? (are|say|tell you)|system update received|new (instructions?|directives?) (from|by) (admin|system|developer)|override\s*:)`)
)
3227
3328func NewInstructionOverrideDetector () * InstructionOverrideDetector {
@@ -44,7 +39,6 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
4439 default :
4540 }
4641
47- // Check temporal commands (high risk: 0.8)
4842 if matches := temporalCommandsRe .FindAllString (input , - 1 ); len (matches ) > 0 {
4943 patterns = append (patterns , DetectedPattern {
5044 Type : "instruction_override_temporal" ,
@@ -56,7 +50,6 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
5650 }
5751 }
5852
59- // Check override commands (high risk: 0.9)
6053 if matches := overrideCommandsRe .FindAllString (input , - 1 ); len (matches ) > 0 {
6154 patterns = append (patterns , DetectedPattern {
6255 Type : "instruction_override_direct" ,
@@ -68,22 +61,20 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
6861 }
6962 }
7063
71- // Check delimiter injection (medium risk: 0.7)
72- if matches := delimiterInjectionRe .FindAllString (input , - 1 ); len (matches ) > 0 {
64+ if matches := positionalOverrideRe .FindAllString (input , - 1 ); len (matches ) > 0 {
7365 patterns = append (patterns , DetectedPattern {
74- Type : "instruction_override_delimiter " ,
75- Score : 0.7 ,
66+ Type : "instruction_override_direct " ,
67+ Score : 0.9 ,
7668 Matches : matches ,
7769 })
78- if 0.7 > maxScore {
79- maxScore = 0.7
70+ if 0.9 > maxScore {
71+ maxScore = 0.9
8072 }
8173 }
8274
83- // Check priority override (high risk: 0.7)
84- if matches := priorityOverrideRe .FindAllString (input , - 1 ); len (matches ) > 0 {
75+ if matches := delimiterInjectionRe .FindAllString (input , - 1 ); len (matches ) > 0 {
8576 patterns = append (patterns , DetectedPattern {
86- Type : "instruction_override_priority " ,
77+ Type : "instruction_override_delimiter " ,
8778 Score : 0.7 ,
8879 Matches : matches ,
8980 })
@@ -92,7 +83,6 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
9283 }
9384 }
9485
95- // Check reset/restart commands (high risk: 0.85)
9686 if matches := resetCommandsRe .FindAllString (input , - 1 ); len (matches ) > 0 {
9787 patterns = append (patterns , DetectedPattern {
9888 Type : "instruction_override_reset" ,
@@ -104,7 +94,6 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
10494 }
10595 }
10696
107- // Check multi-step attacks (high risk: 0.85)
10897 if matches := multiStepAttackRe .FindAllString (input , - 1 ); len (matches ) > 0 {
10998 patterns = append (patterns , DetectedPattern {
11099 Type : "instruction_override_multistep" ,
@@ -116,10 +105,31 @@ func (d *InstructionOverrideDetector) Detect(ctx context.Context, input string)
116105 }
117106 }
118107
108+ if matches := fromNowOnRe .FindAllString (input , - 1 ); len (matches ) > 0 {
109+ patterns = append (patterns , DetectedPattern {
110+ Type : "instruction_override_from_now_on" ,
111+ Score : 0.85 ,
112+ Matches : matches ,
113+ })
114+ if 0.85 > maxScore {
115+ maxScore = 0.85
116+ }
117+ }
118+
119+ if matches := fakeAuthorityRe .FindAllString (input , - 1 ); len (matches ) > 0 {
120+ patterns = append (patterns , DetectedPattern {
121+ Type : "instruction_override_fake_authority" ,
122+ Score : 0.9 ,
123+ Matches : matches ,
124+ })
125+ if 0.9 > maxScore {
126+ maxScore = 0.9
127+ }
128+ }
129+
119130 confidence := 0.0
120131 if maxScore > 0 {
121132 confidence = maxScore
122- // Boost confidence slightly if multiple patterns match
123133 if len (patterns ) > 1 {
124134 confidence = min (confidence + 0.05 , 1.0 )
125135 }
0 commit comments