Skip to content

Commit 15c3821

Browse files
committed
Merged PR 2668744: Regex backreference support
The regex backreference support works by parsing regexes at initialization to detect the presence of backreferences such as `\1`, `\2`, etc. These are then replaced with the content of the group that they were referencing. So for example `(abc+) \1` becomes `(abc+) (abc+)`. The `tryMatch` function then does the actual comparison to ensure that the two groups actually had the same content. Only groups that are surrounded by `\b` word boundaries are supported, such as `\b(abc+)\b`. This is because otherwise it becomes really complicated with situations like `11=111111`. The CRS also contains a construct that makes use of a negative lookahead to assert that a backreference is _not equal_ to a group (simplified): `\b(\w+)\b != (?!\b(\1)\b)(\w+)`. Support for this special construct is therefore also implemented. The changes to CRS rule 942130 are currently pending check-in: RE2 compatibility for 920120 SpiderLabs/owasp-modsecurity-crs#1663. Because this gets us to 100% of the CRS tests passing, I have removed the code that was preventing the CRS tests from running by default in the CI pipeline. So the CRS tests now always run by default. PR URL: https://msazure.visualstudio.com/DefaultCollection/One/_git/Networking-Azwaf/pullrequest/2668744 Related work items: #5697858
1 parent 45ea1f5 commit 15c3821

File tree

18 files changed

+1039
-87
lines changed

18 files changed

+1039
-87
lines changed

hyperscan/goenginechooser.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ func (g *goRegexpFacade) FindSubmatchIndex(b []byte) []int {
6060
return g.goregexpBin.FindSubmatchIndex(b)
6161
}
6262

63+
// FindAllSubmatchIndex forwards the call to the wrapped goregexp matcher when
// it is set, and to goregexpBin otherwise. The arguments b and n are passed
// through unchanged and the callee's result is returned as-is.
func (g *goRegexpFacade) FindAllSubmatchIndex(b []byte, n int) [][]int {
64+
if g.goregexp != nil {
65+
return g.goregexp.FindAllSubmatchIndex(b, n)
66+
}
67+
return g.goregexpBin.FindAllSubmatchIndex(b, n)
68+
}
69+
6370
var hexEscapeRegexp = regexp.MustCompile(`((^|[^\\])(\\\\)*)\\x([0-9a-fA-F]{2})`)
6471

6572
func containsHexEscapedBytes(s string) bool {

hyperscan/multiregexengine.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ type engineImpl struct {
2020

2121
// Special case for when searching for an empty string, because Hyperscan returns an error if it's given an empty string
2222
emptyStringPatternIDs []int
23+
24+
// Special case for when searching for a regex with a backref, because this is not otherwise directly supported by Hyperscan and Go Regexp.
25+
regexesWithBackref map[int]regexWithBackref
2326
}
2427

2528
type scratchSpaceImpl struct {
@@ -38,6 +41,7 @@ func NewMultiRegexEngineFactory(dbCache DbCache) waf.MultiRegexEngineFactory {
3841
// NewMultiRegexEngine creates a MultiRegexEngine that uses Hyperscan in prefilter mode for the initial scan, and then uses Go regexp to re-validate matches and extract strings.
3942
func (f *engineFactoryImpl) NewMultiRegexEngine(mm []waf.MultiRegexEnginePattern) (engine waf.MultiRegexEngine, err error) {
4043
h := &engineImpl{}
44+
h.regexesWithBackref = make(map[int]regexWithBackref)
4145

4246
patterns := []*hs.Pattern{}
4347
for _, m := range mm {
@@ -47,7 +51,21 @@ func (f *engineFactoryImpl) NewMultiRegexEngine(mm []waf.MultiRegexEnginePattern
4751
continue
4852
}
4953

50-
p := hs.NewPattern(m.Expr, 0)
54+
expr := m.Expr
55+
56+
// Special case for when searching for a regex with a backref, because this is not otherwise directly supported by Hyperscan and Go Regexp.
57+
var hasBackref bool
58+
var r regexWithBackref
59+
hasBackref, r, err = newRegexWithBackref(m.Expr)
60+
if err != nil {
61+
return
62+
}
63+
if hasBackref {
64+
h.regexesWithBackref[m.ID] = r
65+
expr = r.newRegex
66+
}
67+
68+
p := hs.NewPattern(expr, 0)
5169
p.Id = m.ID
5270

5371
// SingleMatch makes Hyperscan only return one match per regex. So if a regex is found multiple times, still only one match is recorded.
@@ -88,6 +106,11 @@ func (f *engineFactoryImpl) NewMultiRegexEngine(mm []waf.MultiRegexEnginePattern
88106
// Make the PCRE regex compatible with Go regexp
89107
expr := removePcrePossessiveQuantifier(m.Expr)
90108

109+
// Special case for when searching for a regex with a backref, because this is not otherwise directly supported by Hyperscan and Go Regexp.
110+
if r, ok := h.regexesWithBackref[m.ID]; ok {
111+
expr = r.newRegex
112+
}
113+
91114
var r *goRegexpFacade
92115
r, err = compileRegexpFacade(expr)
93116
if err != nil {
@@ -140,12 +163,27 @@ func (h *engineImpl) Scan(input []byte, s waf.MultiRegexEngineScratchSpace) (mat
140163

141164
// Re-validate the potential matches using Go regexp
142165
for _, pmID := range potentialMatches {
166+
// Special case for when searching for a regex with a backref, because this is not otherwise directly supported by Hyperscan and Go Regexp.
167+
if rwbh, ok := h.regexesWithBackref[pmID]; ok {
168+
data, captureGroups := rwbh.tryMatch(input, h.goregexes[pmID])
169+
if data != nil {
170+
m := waf.MultiRegexEngineMatch{
171+
ID: pmID,
172+
Data: data,
173+
CaptureGroups: captureGroups,
174+
}
175+
matches = append(matches, m)
176+
}
177+
178+
continue
179+
}
180+
143181
loc := h.goregexes[pmID].FindSubmatchIndex(input)
144182
if loc == nil {
145183
continue
146184
}
147185

148-
// FindSubmatchIndex will always return an even number, because it returns pairs of start-end-locations.
186+
// The length of the array returned by FindSubmatchIndex will always be an even number, because it returns pairs of start-end-locations.
149187
var captureGroups [][]byte
150188
for i := 0; i < len(loc); i = i + 2 {
151189
if loc[i] != -1 {
@@ -161,7 +199,6 @@ func (h *engineImpl) Scan(input []byte, s waf.MultiRegexEngineScratchSpace) (mat
161199
Data: input[loc[0]:loc[1]],
162200
CaptureGroups: captureGroups,
163201
}
164-
165202
matches = append(matches, m)
166203
}
167204

hyperscan/multiregexengine_crsfiles_integration_test.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,6 @@ func TestAllCrsReqRulesIndividually(t *testing.T) {
6767
continue
6868
}
6969

70-
// Skip this rule until we add support for backreferences
71-
// TODO add support for backreferences
72-
if rule.ID == 942130 {
73-
continue
74-
}
75-
7670
for itemIdx, item := range rule.Items {
7771
if item.Predicate.Op != ast.Rx {
7872
continue

0 commit comments

Comments
 (0)