Skip to content

Commit c9d0603

Browse files
authored
Merge branch 'vllm-project:main' into aibrix_profile
2 parents 4b84c21 + 4e2ee29 commit c9d0603

File tree

18 files changed

+827
-47
lines changed

18 files changed

+827
-47
lines changed

candle-binding/regex_provider.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
package candle_binding
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"regexp"
7+
"strings"
8+
"time"
9+
)
10+
11+
// RegexProviderConfig holds the configuration for the regex provider.
12+
type RegexProviderConfig struct {
13+
MaxPatterns int `yaml:"max_patterns"`
14+
MaxPatternLength int `yaml:"max_pattern_length"`
15+
MaxInputLength int `yaml:"max_input_length"`
16+
DefaultTimeoutMs int `yaml:"default_timeout_ms"`
17+
Patterns []RegexPattern `yaml:"patterns"`
18+
}
19+
20+
// RegexPattern defines a single regex pattern.
type RegexPattern struct {
	// ID identifies the pattern; it is echoed in error messages and in the
	// PatternID of every match the pattern produces.
	ID string `yaml:"id"`

	// Pattern is the regular expression source, in Go regexp (RE2) syntax.
	Pattern string `yaml:"pattern"`

	// Flags is a string of option letters. NewRegexProvider checks it for
	// "i", which makes the pattern case-insensitive.
	Flags string `yaml:"flags"`

	// Category is a free-form label copied into each match's Category.
	Category string `yaml:"category"`
}
27+
28+
// RegexProvider is a ReDoS-safe regex scanner.
29+
// It uses Go's built-in regexp package, which is based on RE2 and is not
30+
// vulnerable to regular expression denial of service attacks.
31+
type RegexProvider struct {
32+
compiled []*regexp.Regexp
33+
patterns []RegexPattern
34+
timeout time.Duration
35+
maxInputLength int
36+
testDelay time.Duration // For testing purposes
37+
}
38+
39+
// MatchResult represents a single regex match.
type MatchResult struct {
	// PatternID is the ID of the pattern that produced the match.
	PatternID string

	// Category is the category of the pattern that produced the match.
	Category string

	// Match is the matched text, i.e. input[StartIndex:EndIndex].
	Match string

	// StartIndex is the byte offset in the input where the match begins.
	StartIndex int

	// EndIndex is the byte offset in the input just past the end of the match.
	EndIndex int
}
47+
48+
// NewRegexProvider creates a new RegexProvider.
49+
func NewRegexProvider(cfg RegexProviderConfig, options ...func(*RegexProvider)) (*RegexProvider, error) {
50+
if len(cfg.Patterns) > cfg.MaxPatterns {
51+
return nil, fmt.Errorf("number of patterns (%d) exceeds max_patterns (%d)", len(cfg.Patterns), cfg.MaxPatterns)
52+
}
53+
54+
compiled := make([]*regexp.Regexp, 0, len(cfg.Patterns))
55+
for _, p := range cfg.Patterns {
56+
if len(p.Pattern) > cfg.MaxPatternLength {
57+
return nil, fmt.Errorf("pattern length for ID '%s' (%d) exceeds max_pattern_length (%d)", p.ID, len(p.Pattern), cfg.MaxPatternLength)
58+
}
59+
60+
pattern := p.Pattern
61+
if strings.Contains(p.Flags, "i") {
62+
pattern = "(?i)" + pattern
63+
}
64+
65+
re, err := regexp.Compile(pattern)
66+
if err != nil {
67+
return nil, fmt.Errorf("failed to compile pattern ID '%s': %w", p.ID, err)
68+
}
69+
compiled = append(compiled, re)
70+
}
71+
72+
rp := &RegexProvider{
73+
compiled: compiled,
74+
patterns: cfg.Patterns,
75+
timeout: time.Duration(cfg.DefaultTimeoutMs) * time.Millisecond,
76+
maxInputLength: cfg.MaxInputLength,
77+
}
78+
79+
for _, option := range options {
80+
option(rp)
81+
}
82+
83+
return rp, nil
84+
}
85+
86+
// WithTestDelay is a functional option to add a delay for testing timeouts.
87+
func WithTestDelay(d time.Duration) func(*RegexProvider) {
88+
return func(rp *RegexProvider) {
89+
rp.testDelay = d
90+
}
91+
}
92+
93+
// Scan scans the input string for matches.
94+
// The scan is performed in a separate goroutine and is subject to a timeout.
95+
// The timeout check is performed between each pattern, so a single very slow
96+
// pattern can still block for longer than the timeout. However, Go's regex
97+
// engine is very fast and not vulnerable to ReDoS, so this is not a major
98+
// concern in practice.
99+
func (rp *RegexProvider) Scan(input string) ([]MatchResult, error) {
100+
if len(input) > rp.maxInputLength {
101+
return nil, fmt.Errorf("input length (%d) exceeds max_input_length (%d)", len(input), rp.maxInputLength)
102+
}
103+
104+
ctx, cancel := context.WithTimeout(context.Background(), rp.timeout)
105+
defer cancel()
106+
107+
resultChan := make(chan struct {
108+
matches []MatchResult
109+
err error
110+
}, 1)
111+
112+
go func() {
113+
var matches []MatchResult
114+
for i, re := range rp.compiled {
115+
select {
116+
case <-ctx.Done():
117+
// The context was cancelled, so we don't need to continue.
118+
return
119+
default:
120+
// Introduce a delay for testing purposes
121+
if rp.testDelay > 0 {
122+
time.Sleep(rp.testDelay)
123+
}
124+
125+
locs := re.FindAllStringIndex(input, -1)
126+
for _, loc := range locs {
127+
matches = append(matches, MatchResult{
128+
PatternID: rp.patterns[i].ID,
129+
Category: rp.patterns[i].Category,
130+
Match: input[loc[0]:loc[1]],
131+
StartIndex: loc[0],
132+
EndIndex: loc[1],
133+
})
134+
}
135+
}
136+
}
137+
resultChan <- struct {
138+
matches []MatchResult
139+
err error
140+
}{matches, nil}
141+
}()
142+
143+
select {
144+
case res := <-resultChan:
145+
return res.matches, res.err
146+
case <-ctx.Done():
147+
return nil, fmt.Errorf("regex scan timed out after %v", rp.timeout)
148+
}
149+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package candle_binding
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
)
7+
8+
func BenchmarkRegexProvider_Scan(b *testing.B) {
9+
cfg := RegexProviderConfig{
10+
MaxPatterns: 100,
11+
MaxPatternLength: 1000,
12+
MaxInputLength: 10000,
13+
DefaultTimeoutMs: 1000,
14+
Patterns: []RegexPattern{
15+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`},
16+
{ID: "word", Pattern: "hello"},
17+
{ID: "case", Pattern: "World", Flags: "i"},
18+
},
19+
}
20+
rp, err := NewRegexProvider(cfg)
21+
if err != nil {
22+
b.Fatalf("failed to create regex provider: %v", err)
23+
}
24+
25+
input := "my email is [email protected], say hello to the beautiful World"
26+
27+
b.Run("SinglePattern", func(b *testing.B) {
28+
singlePatternCfg := RegexProviderConfig{
29+
MaxPatterns: 1,
30+
MaxPatternLength: 100,
31+
MaxInputLength: 1000,
32+
DefaultTimeoutMs: 100,
33+
Patterns: []RegexPattern{
34+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`},
35+
},
36+
}
37+
singleRp, _ := NewRegexProvider(singlePatternCfg)
38+
for i := 0; i < b.N; i++ {
39+
_, _ = singleRp.Scan(input)
40+
}
41+
})
42+
43+
b.Run("MultiPattern", func(b *testing.B) {
44+
for i := 0; i < b.N; i++ {
45+
_, _ = rp.Scan(input)
46+
}
47+
})
48+
49+
b.Run("LargeInput", func(b *testing.B) {
50+
largeInput := ""
51+
for i := 0; i < 100; i++ {
52+
largeInput += fmt.Sprintf("email%[email protected] ", i)
53+
}
54+
for i := 0; i < b.N; i++ {
55+
_, _ = rp.Scan(largeInput)
56+
}
57+
})
58+
}
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
package candle_binding
2+
3+
import (
4+
"strings"
5+
"testing"
6+
"time"
7+
)
8+
9+
func TestNewRegexProvider(t *testing.T) {
10+
t.Run("ValidConfig", func(t *testing.T) {
11+
cfg := RegexProviderConfig{
12+
MaxPatterns: 10,
13+
MaxPatternLength: 100,
14+
MaxInputLength: 1000,
15+
DefaultTimeoutMs: 50,
16+
Patterns: []RegexPattern{
17+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`},
18+
},
19+
}
20+
_, err := NewRegexProvider(cfg)
21+
if err != nil {
22+
t.Fatalf("expected no error, got %v", err)
23+
}
24+
})
25+
26+
t.Run("TooManyPatterns", func(t *testing.T) {
27+
cfg := RegexProviderConfig{
28+
MaxPatterns: 1,
29+
Patterns: []RegexPattern{
30+
{ID: "p1", Pattern: "a"},
31+
{ID: "p2", Pattern: "b"},
32+
},
33+
}
34+
_, err := NewRegexProvider(cfg)
35+
if err == nil {
36+
t.Fatal("expected an error for too many patterns, got nil")
37+
}
38+
})
39+
40+
t.Run("PatternTooLong", func(t *testing.T) {
41+
cfg := RegexProviderConfig{
42+
MaxPatterns: 10,
43+
MaxPatternLength: 5,
44+
Patterns: []RegexPattern{
45+
{ID: "long", Pattern: "abcdef"},
46+
},
47+
}
48+
_, err := NewRegexProvider(cfg)
49+
if err == nil {
50+
t.Fatal("expected an error for pattern too long, got nil")
51+
}
52+
})
53+
54+
t.Run("InvalidRegex", func(t *testing.T) {
55+
cfg := RegexProviderConfig{
56+
MaxPatterns: 10,
57+
MaxPatternLength: 100,
58+
Patterns: []RegexPattern{
59+
{ID: "invalid", Pattern: `[`},
60+
},
61+
}
62+
_, err := NewRegexProvider(cfg)
63+
if err == nil {
64+
t.Fatal("expected an error for invalid regex, got nil")
65+
}
66+
})
67+
}
68+
69+
func TestRegexProvider_Scan(t *testing.T) {
70+
cfg := RegexProviderConfig{
71+
MaxPatterns: 10,
72+
MaxPatternLength: 100,
73+
MaxInputLength: 1000,
74+
DefaultTimeoutMs: 100,
75+
Patterns: []RegexPattern{
76+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`, Category: "pii"},
77+
{ID: "word", Pattern: "hello", Category: "greeting"},
78+
{ID: "case", Pattern: "World", Flags: "i", Category: "case-test"},
79+
},
80+
}
81+
rp, err := NewRegexProvider(cfg)
82+
if err != nil {
83+
t.Fatalf("failed to create regex provider: %v", err)
84+
}
85+
86+
t.Run("SimpleMatch", func(t *testing.T) {
87+
input := "say hello to the world"
88+
matches, err := rp.Scan(input)
89+
if err != nil {
90+
t.Fatalf("scan failed: %v", err)
91+
}
92+
if len(matches) != 2 { // "hello" and "world" (case-insensitive)
93+
t.Fatalf("expected 2 matches, got %d", len(matches))
94+
}
95+
})
96+
97+
t.Run("CaseInsensitiveMatch", func(t *testing.T) {
98+
input := "hello WORLD"
99+
matches, err := rp.Scan(input)
100+
if err != nil {
101+
t.Fatalf("scan failed: %v", err)
102+
}
103+
if len(matches) != 2 {
104+
t.Fatalf("expected 2 matches, got %d", len(matches))
105+
}
106+
})
107+
108+
t.Run("MultipleMatches", func(t *testing.T) {
109+
input := "my email is [email protected], say hello"
110+
matches, err := rp.Scan(input)
111+
if err != nil {
112+
t.Fatalf("scan failed: %v", err)
113+
}
114+
if len(matches) != 2 {
115+
t.Fatalf("expected 2 matches, got %d", len(matches))
116+
}
117+
})
118+
119+
t.Run("NoMatch", func(t *testing.T) {
120+
input := "nothing to see here"
121+
matches, err := rp.Scan(input)
122+
if err != nil {
123+
t.Fatalf("scan failed: %v", err)
124+
}
125+
if len(matches) != 0 {
126+
t.Fatalf("expected 0 matches, got %d", len(matches))
127+
}
128+
})
129+
130+
t.Run("InputTooLong", func(t *testing.T) {
131+
rp.maxInputLength = 5
132+
_, err := rp.Scan("abcdef")
133+
if err == nil {
134+
t.Fatal("expected an error for input too long, got nil")
135+
}
136+
rp.maxInputLength = 1000 // reset
137+
})
138+
139+
t.Run("Timeout", func(t *testing.T) {
140+
cfg := RegexProviderConfig{
141+
MaxPatterns: 1,
142+
MaxPatternLength: 100,
143+
MaxInputLength: 1000,
144+
DefaultTimeoutMs: 10, // 10ms
145+
Patterns: []RegexPattern{
146+
{ID: "any", Pattern: `.`},
147+
},
148+
}
149+
// Create a provider with a 20ms delay, which is longer than the timeout
150+
rp, err := NewRegexProvider(cfg, WithTestDelay(20*time.Millisecond))
151+
if err != nil {
152+
t.Fatalf("failed to create regex provider: %v", err)
153+
}
154+
155+
_, err = rp.Scan("a")
156+
if err == nil {
157+
t.Fatal("expected a timeout error, got nil")
158+
}
159+
if !strings.Contains(err.Error(), "timed out") {
160+
t.Errorf("expected timeout error, got: %v", err)
161+
}
162+
})
163+
164+
t.Run("ReDoSAttackVector", func(t *testing.T) {
165+
// This pattern is a known ReDoS vector for backtracking regex engines.
166+
// Go's engine is not vulnerable, so this should execute quickly.
167+
cfg := RegexProviderConfig{
168+
MaxPatterns: 1,
169+
MaxPatternLength: 100,
170+
MaxInputLength: 1000,
171+
DefaultTimeoutMs: 500, // 500ms timeout
172+
Patterns: []RegexPattern{
173+
{ID: "redos", Pattern: `(a+)+$`},
174+
},
175+
}
176+
rp, err := NewRegexProvider(cfg)
177+
if err != nil {
178+
t.Fatalf("failed to create regex provider: %v", err)
179+
}
180+
181+
// A long string of 'a's followed by a non-matching character.
182+
// In a vulnerable engine, this would cause catastrophic backtracking.
183+
input := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"
184+
185+
_, err = rp.Scan(input)
186+
if err != nil {
187+
t.Fatalf("scan failed for ReDoS pattern: %v", err)
188+
}
189+
})
190+
}

0 commit comments

Comments
 (0)