Skip to content

Commit 3abbf0f

Browse files
authored
Merge branch 'main' into feat/decision-based-routing-with-plugins
2 parents 91171f6 + 775e216 commit 3abbf0f

File tree

22 files changed

+850
-52
lines changed

22 files changed

+850
-52
lines changed

README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,15 @@ For questions, feedback, or to contribute, please join `#semantic-router` channe
130130

131131
We host bi-weekly community meetings to sync up with contributors across different time zones:
132132

133-
- **First Tuesday of the month**: 9:00-10:00 AM EST (accommodates US EST and Asia Pacific contributors)
134-
- **Zoom Link**: [https://nyu.zoom.us/j/95065349917](https://nyu.zoom.us/j/95065349917)
135-
- **Calendar Invite**: [https://calendar.app.google/EeP6xDgCpxte6d1eA](https://calendar.app.google/EeP6xDgCpxte6d1eA)
133+
- **First Tuesday of the month**: 9:00-10:00 AM EST (accommodates US EST, EU, and Asia Pacific contributors)
134+
- [Zoom Link](https://us05web.zoom.us/j/84122485631?pwd=BB88v03mMNLVHn60YzVk4PihuqBV9d.1)
135+
- [Google Calendar Invite](https://us05web.zoom.us/meeting/tZAsdeuspj4sGdVraOOR4UaXSstrH2jjPYFq/calendar/google/add?meetingMasterEventId=4jjzUKSLSLiBHtIKZpGc3g)
136+
- [ics file](https://drive.google.com/file/d/15wO8cg0ZjNxdr8OtGiZyAgkSS8_Wry0J/view?usp=sharing)
136137
- **Third Tuesday of the month**: 1:00-2:00 PM EST (accommodates US EST and California contributors)
137-
- **Zoom Link**: [https://nyu.zoom.us/j/98861585086](https://nyu.zoom.us/j/98861585086)
138-
- **Calendar Invite**: [https://calendar.app.google/oYsmt1Pu46o4gFuP8](https://calendar.app.google/oYsmt1Pu46o4gFuP8)
138+
- [Zoom Link](https://us06web.zoom.us/j/86871492845?pwd=LcTtXm9gtGu23JeWqXxbnLLCCvbumB.1)
139+
- [Google Calendar Invite](https://us05web.zoom.us/meeting/tZIlcOispzkiHtH2dlkWlLym68bEqvuf3MU5/calendar/google/add?meetingMasterEventId=PqWz2vk7TOCszPXqconGAA)
140+
- [ics file](https://drive.google.com/file/d/1T54mwYpXXoV9QfR76I56BFBPNbykSsTw/view?usp=sharing)
141+
- Meeting Recordings: [YouTube](https://www.youtube.com/@vLLMSemanticRouter/videos)
139142

140143
Join us to discuss the latest developments, share ideas, and collaborate on the project!
141144

candle-binding/regex_provider.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
package candle_binding
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"regexp"
7+
"strings"
8+
"time"
9+
)
10+
11+
// RegexProviderConfig holds the configuration for the regex provider.
// The limits are enforced by NewRegexProvider (pattern count and pattern
// length) and by Scan (input length and timeout).
12+
type RegexProviderConfig struct {
13+
// MaxPatterns is the largest number of entries allowed in Patterns;
// NewRegexProvider fails when len(Patterns) exceeds it.
MaxPatterns int `yaml:"max_patterns"`
14+
// MaxPatternLength is the largest allowed length of a single pattern
// string, measured with len (bytes).
MaxPatternLength int `yaml:"max_pattern_length"`
15+
// MaxInputLength is the largest input, measured with len (bytes), that
// Scan accepts.
MaxInputLength int `yaml:"max_input_length"`
16+
// DefaultTimeoutMs is the per-Scan timeout in milliseconds.
DefaultTimeoutMs int `yaml:"default_timeout_ms"`
17+
// Patterns lists the patterns to compile, in the order they are scanned.
Patterns []RegexPattern `yaml:"patterns"`
18+
}
19+
20+
// RegexPattern defines a single regex pattern.
21+
type RegexPattern struct {
22+
// ID identifies the pattern in error messages and in MatchResult.PatternID.
ID string `yaml:"id"`
23+
// Pattern is the expression handed to regexp.Compile.
Pattern string `yaml:"pattern"`
24+
// Flags is a string of flag characters. NewRegexProvider only looks for
// "i", which prefixes the pattern with (?i); other characters are ignored.
Flags string `yaml:"flags"`
25+
// Category is copied unchanged into MatchResult.Category.
Category string `yaml:"category"`
26+
}
27+
28+
// RegexProvider is a ReDoS-safe regex scanner.
29+
// It uses Go's built-in regexp package, which is based on RE2 and is not
30+
// vulnerable to regular expression denial of service attacks.
31+
type RegexProvider struct {
32+
// compiled holds one compiled expression per configured pattern;
// compiled[i] corresponds to patterns[i].
compiled []*regexp.Regexp
33+
// patterns holds the pattern metadata (ID, Category) reported in matches.
patterns []RegexPattern
34+
// timeout bounds how long Scan waits for a result.
timeout time.Duration
35+
// maxInputLength is the largest input length (bytes) Scan accepts.
maxInputLength int
36+
// testDelay, when positive, is slept before each pattern is evaluated in
// Scan. It is set through WithTestDelay.
testDelay time.Duration // For testing purposes
37+
}
38+
39+
// MatchResult represents a single regex match.
40+
type MatchResult struct {
41+
// PatternID is the ID of the pattern that produced the match.
PatternID string
42+
// Category is the Category of the pattern that produced the match.
Category string
43+
// Match is the matched text, i.e. input[StartIndex:EndIndex].
Match string
44+
// StartIndex is the byte offset in the input where the match begins.
StartIndex int
45+
// EndIndex is the byte offset in the input just past the end of the match.
EndIndex int
46+
}
47+
48+
// NewRegexProvider creates a new RegexProvider.
49+
func NewRegexProvider(cfg RegexProviderConfig, options ...func(*RegexProvider)) (*RegexProvider, error) {
50+
if len(cfg.Patterns) > cfg.MaxPatterns {
51+
return nil, fmt.Errorf("number of patterns (%d) exceeds max_patterns (%d)", len(cfg.Patterns), cfg.MaxPatterns)
52+
}
53+
54+
compiled := make([]*regexp.Regexp, 0, len(cfg.Patterns))
55+
for _, p := range cfg.Patterns {
56+
if len(p.Pattern) > cfg.MaxPatternLength {
57+
return nil, fmt.Errorf("pattern length for ID '%s' (%d) exceeds max_pattern_length (%d)", p.ID, len(p.Pattern), cfg.MaxPatternLength)
58+
}
59+
60+
pattern := p.Pattern
61+
if strings.Contains(p.Flags, "i") {
62+
pattern = "(?i)" + pattern
63+
}
64+
65+
re, err := regexp.Compile(pattern)
66+
if err != nil {
67+
return nil, fmt.Errorf("failed to compile pattern ID '%s': %w", p.ID, err)
68+
}
69+
compiled = append(compiled, re)
70+
}
71+
72+
rp := &RegexProvider{
73+
compiled: compiled,
74+
patterns: cfg.Patterns,
75+
timeout: time.Duration(cfg.DefaultTimeoutMs) * time.Millisecond,
76+
maxInputLength: cfg.MaxInputLength,
77+
}
78+
79+
for _, option := range options {
80+
option(rp)
81+
}
82+
83+
return rp, nil
84+
}
85+
86+
// WithTestDelay is a functional option to add a delay for testing timeouts.
87+
func WithTestDelay(d time.Duration) func(*RegexProvider) {
88+
return func(rp *RegexProvider) {
89+
rp.testDelay = d
90+
}
91+
}
92+
93+
// Scan scans the input string for matches.
94+
// The scan is performed in a separate goroutine and is subject to a timeout.
95+
// The timeout check is performed between each pattern, so a single very slow
96+
// pattern can still block for longer than the timeout. However, Go's regex
97+
// engine is very fast and not vulnerable to ReDoS, so this is not a major
98+
// concern in practice.
99+
func (rp *RegexProvider) Scan(input string) ([]MatchResult, error) {
100+
if len(input) > rp.maxInputLength {
101+
return nil, fmt.Errorf("input length (%d) exceeds max_input_length (%d)", len(input), rp.maxInputLength)
102+
}
103+
104+
ctx, cancel := context.WithTimeout(context.Background(), rp.timeout)
105+
defer cancel()
106+
107+
resultChan := make(chan struct {
108+
matches []MatchResult
109+
err error
110+
}, 1)
111+
112+
go func() {
113+
var matches []MatchResult
114+
for i, re := range rp.compiled {
115+
select {
116+
case <-ctx.Done():
117+
// The context was cancelled, so we don't need to continue.
118+
return
119+
default:
120+
// Introduce a delay for testing purposes
121+
if rp.testDelay > 0 {
122+
time.Sleep(rp.testDelay)
123+
}
124+
125+
locs := re.FindAllStringIndex(input, -1)
126+
for _, loc := range locs {
127+
matches = append(matches, MatchResult{
128+
PatternID: rp.patterns[i].ID,
129+
Category: rp.patterns[i].Category,
130+
Match: input[loc[0]:loc[1]],
131+
StartIndex: loc[0],
132+
EndIndex: loc[1],
133+
})
134+
}
135+
}
136+
}
137+
resultChan <- struct {
138+
matches []MatchResult
139+
err error
140+
}{matches, nil}
141+
}()
142+
143+
select {
144+
case res := <-resultChan:
145+
return res.matches, res.err
146+
case <-ctx.Done():
147+
return nil, fmt.Errorf("regex scan timed out after %v", rp.timeout)
148+
}
149+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package candle_binding
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
)
7+
8+
func BenchmarkRegexProvider_Scan(b *testing.B) {
9+
cfg := RegexProviderConfig{
10+
MaxPatterns: 100,
11+
MaxPatternLength: 1000,
12+
MaxInputLength: 10000,
13+
DefaultTimeoutMs: 1000,
14+
Patterns: []RegexPattern{
15+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`},
16+
{ID: "word", Pattern: "hello"},
17+
{ID: "case", Pattern: "World", Flags: "i"},
18+
},
19+
}
20+
rp, err := NewRegexProvider(cfg)
21+
if err != nil {
22+
b.Fatalf("failed to create regex provider: %v", err)
23+
}
24+
25+
input := "my email is [email protected], say hello to the beautiful World"
26+
27+
b.Run("SinglePattern", func(b *testing.B) {
28+
singlePatternCfg := RegexProviderConfig{
29+
MaxPatterns: 1,
30+
MaxPatternLength: 100,
31+
MaxInputLength: 1000,
32+
DefaultTimeoutMs: 100,
33+
Patterns: []RegexPattern{
34+
{ID: "email", Pattern: `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`},
35+
},
36+
}
37+
singleRp, _ := NewRegexProvider(singlePatternCfg)
38+
for i := 0; i < b.N; i++ {
39+
_, _ = singleRp.Scan(input)
40+
}
41+
})
42+
43+
b.Run("MultiPattern", func(b *testing.B) {
44+
for i := 0; i < b.N; i++ {
45+
_, _ = rp.Scan(input)
46+
}
47+
})
48+
49+
b.Run("LargeInput", func(b *testing.B) {
50+
largeInput := ""
51+
for i := 0; i < 100; i++ {
52+
largeInput += fmt.Sprintf("email%[email protected] ", i)
53+
}
54+
for i := 0; i < b.N; i++ {
55+
_, _ = rp.Scan(largeInput)
56+
}
57+
})
58+
}

0 commit comments

Comments
 (0)