Skip to content

Commit 2516b6a

Browse files
authored
perf: AVX2 Teddy, bidirectional DFA fallback, CompositeSequenceDFA fixes (#114)
perf: enable AVX2 Teddy, fix CompositeSequenceDFA overmatching, add digit-run skip
2 parents 0d64b44 + 5b6e649 commit 2516b6a

File tree

8 files changed

+245
-67
lines changed

8 files changed

+245
-67
lines changed

CHANGELOG.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1212
- ARM NEON SIMD support (waiting for Go 1.26 native SIMD)
1313
- SIMD prefilter for CompositeSequenceDFA (#83)
1414

15+
## [0.12.1] - 2026-02-15
16+
17+
### Performance
18+
- **DFA bidirectional fallback for BoundedBacktracker** — When BoundedBacktracker
19+
can't handle a large input (it exceeds the 32M-entry limit), use forward DFA + reverse
20+
DFA instead of PikeVM. Forward DFA finds match end, reverse DFA finds match
21+
start. O(n) total vs PikeVM's O(n*states). ~3x speedup on `(\w{2,8})+` at 6MB.
22+
- **Digit-run skip optimization** — For `\d+`-leading patterns (IP addresses,
23+
version numbers), skip the entire digit run on DFA failure instead of advancing
24+
one byte at a time. Only enabled when the leading digit class has a greedy
25+
unbounded quantifier.
26+
27+
### Fixed
28+
- **Bounded repetitions blocked ReverseSuffix strategy** (Issue #115) —
29+
`isSafeForReverseSuffix` didn't recognize `OpRepeat{min>=1}` as a wildcard
30+
subexpression, blocking UseReverseSuffix for patterns with bounded repetitions
31+
like `{1,50}{1,10}`. These patterns fell back to an NFA full scan instead of suffix
32+
prefilter + reverse DFA. Fix: **2500ms → 0.5ms** (5000x) on 100KB no-match.
33+
- **CompositeSequenceDFA overmatching for bounded patterns** — Bare character
34+
classes like `\w` (maxMatch=1) were treated as unbounded by the DFA, causing
35+
`\w\w` on "000" to return "000" instead of "00". Now rejects patterns with
36+
bounded maxMatch, falling back to CompositeSearcher backtracking.
37+
- **AVX2 Teddy assembly correctness** (Issue #74) — Fixed `teddySlimAVX2_2`
38+
returning position -1 (not-found sentinel) for valid candidates in short
39+
haystacks, caused by `DECQ SI` executing when there was no prior chunk
40+
boundary to cover. AVX2 dispatch remains disabled by default (SSSE3 is 4x
41+
faster on AMD EPYC due to VZEROUPPER overhead on frequent verification
42+
restarts).
43+
1544
## [0.12.0] - 2026-02-06
1645

1746
### Performance

meta/compile.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,14 @@ func buildOnePassDFA(re *syntax.Regexp, nfaEngine *nfa.NFA, config Config) *onep
9898
// strategyEngines holds all strategy-specific engines built by buildStrategyEngines.
9999
type strategyEngines struct {
100100
dfa *lazy.DFA
101+
reverseDFA *lazy.DFA // Reverse DFA for bidirectional search fallback
101102
reverseSearcher *ReverseAnchoredSearcher
102103
reverseSuffixSearcher *ReverseSuffixSearcher
103104
reverseSuffixSetSearcher *ReverseSuffixSetSearcher
104105
reverseInnerSearcher *ReverseInnerSearcher
105106
multilineReverseSuffixSearcher *MultilineReverseSuffixSearcher // Issue #97
106107
digitPrefilter *prefilter.DigitPrefilter
108+
digitRunSkipSafe bool
107109
ahoCorasick *ahocorasick.Automaton
108110
finalStrategy Strategy
109111
}
@@ -141,7 +143,8 @@ func buildStrategyEngines(
141143
needsDFA := strategy == UseDFA || strategy == UseBoth ||
142144
strategy == UseReverseAnchored || strategy == UseReverseSuffix ||
143145
strategy == UseReverseSuffixSet || strategy == UseReverseInner ||
144-
strategy == UseMultilineReverseSuffix || strategy == UseDigitPrefilter
146+
strategy == UseMultilineReverseSuffix || strategy == UseDigitPrefilter ||
147+
strategy == UseBoundedBacktracker
145148

146149
if !needsDFA {
147150
return result
@@ -164,9 +167,25 @@ func buildStrategyEngines(
164167
}
165168
}
166169

170+
// Build forward+reverse DFA for BoundedBacktracker bidirectional fallback.
171+
// When BoundedBacktracker can't handle large inputs (CanHandle fails),
172+
// bidirectional DFA (forward→end, reverse→start) is O(n) vs PikeVM's O(n*states).
173+
if result.finalStrategy == UseBoundedBacktracker {
174+
fwdDFA, err := lazy.CompileWithPrefilter(nfaEngine, dfaConfig, pf)
175+
if err == nil {
176+
result.dfa = fwdDFA
177+
reverseNFA := nfa.ReverseAnchored(nfaEngine)
178+
revDFA, revErr := lazy.CompileWithConfig(reverseNFA, dfaConfig)
179+
if revErr == nil {
180+
result.reverseDFA = revDFA
181+
}
182+
}
183+
}
184+
167185
// For digit prefilter strategy, create the digit prefilter
168186
if result.finalStrategy == UseDigitPrefilter {
169187
result.digitPrefilter = prefilter.NewDigitPrefilter()
188+
result.digitRunSkipSafe = isDigitRunSkipSafe(re)
170189
}
171190

172191
return result
@@ -490,6 +509,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
490509
asciiNFA: asciiNFAEngine,
491510
asciiBoundedBacktracker: asciiBT,
492511
dfa: engines.dfa,
512+
reverseDFA: engines.reverseDFA,
493513
pikevm: pikevm,
494514
boundedBacktracker: charClassResult.boundedBT,
495515
charClassSearcher: charClassResult.charClassSrch,
@@ -504,6 +524,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
504524
reverseInnerSearcher: engines.reverseInnerSearcher,
505525
multilineReverseSuffixSearcher: engines.multilineReverseSuffixSearcher,
506526
digitPrefilter: engines.digitPrefilter,
527+
digitRunSkipSafe: engines.digitRunSkipSafe,
507528
ahoCorasick: engines.ahoCorasick,
508529
anchoredLiteralInfo: anchoredLiteralInfo,
509530
prefilter: pf,

meta/engine.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ type Engine struct {
9999
// Note: The cache is now stored in pooled SearchState for thread-safety
100100
onepass *onepass.DFA
101101

102+
// reverseDFA is a reverse lazy DFA for bidirectional search fallback.
103+
// When BoundedBacktracker can't handle large inputs, forward DFA finds
104+
// match end and reverseDFA finds match start. O(n) total.
105+
// Placed after onepass to preserve field offsets of hot-path fields
106+
// (charClassSearcher, strategy, etc.) for cache alignment stability.
107+
reverseDFA *lazy.DFA
108+
102109
// statePool provides thread-safe pooling of per-search mutable state.
103110
// This enables concurrent searches on the same Engine instance.
104111
statePool *searchStatePool
@@ -115,6 +122,12 @@ type Engine struct {
115122
// isStartAnchored is true if the pattern is anchored at start (^).
116123
// Used for first-byte prefilter optimization.
117124
isStartAnchored bool
125+
126+
// digitRunSkipSafe is true when the leading digit class has a greedy
127+
// unbounded quantifier (\d+, \d*). On DFA failure, all positions in the
128+
// same digit run produce the same result, so the inner loop can skip
129+
// the entire run instead of trying each digit.
130+
digitRunSkipSafe bool
118131
}
119132

120133
// Stats tracks execution statistics for performance analysis.

meta/find_indices.go

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,25 @@ func (e *Engine) findIndicesMultilineReverseSuffixAt(haystack []byte, at int) (i
483483
return e.multilineReverseSuffixSearcher.FindIndicesAt(haystack, at)
484484
}
485485

486+
// findIndicesBidirectionalDFA uses forward DFA + reverse DFA for exact match bounds.
487+
// Forward DFA finds match end, reverse DFA finds match start. O(n) total.
488+
// Used as fallback when BoundedBacktracker can't handle large inputs.
489+
func (e *Engine) findIndicesBidirectionalDFA(haystack []byte, at int) (int, int, bool) {
490+
atomic.AddUint64(&e.stats.DFASearches, 1)
491+
end := e.dfa.FindAt(haystack, at)
492+
if end == -1 {
493+
return -1, -1, false
494+
}
495+
if end == at {
496+
return at, at, true // Empty match
497+
}
498+
start := e.reverseDFA.SearchReverse(haystack, at, end)
499+
if start < 0 {
500+
return -1, -1, false // Reverse DFA failed (cache full)
501+
}
502+
return start, end, true
503+
}
504+
486505
// findIndicesBoundedBacktracker searches using bounded backtracker - zero alloc.
487506
// Thread-safe: uses pooled state.
488507
func (e *Engine) findIndicesBoundedBacktracker(haystack []byte) (int, int, bool) {
@@ -499,7 +518,10 @@ func (e *Engine) findIndicesBoundedBacktracker(haystack []byte) (int, int, bool)
499518

500519
atomic.AddUint64(&e.stats.NFASearches, 1)
501520
if !e.boundedBacktracker.CanHandle(len(haystack)) {
502-
// Use optimized SlotTable-based search for large inputs
521+
// Bidirectional DFA: O(n) vs PikeVM's O(n*states) for large inputs
522+
if e.dfa != nil && e.reverseDFA != nil {
523+
return e.findIndicesBidirectionalDFA(haystack, 0)
524+
}
503525
return e.pikevm.SearchWithSlotTable(haystack, nfa.SearchModeFind)
504526
}
505527

@@ -529,21 +551,33 @@ func (e *Engine) findIndicesBoundedBacktrackerAt(haystack []byte, at int) (int,
529551
// to search the remaining portion, not the full haystack.
530552
remaining := haystack[at:]
531553

532-
// V11-002 ASCII optimization
533-
if e.asciiBoundedBacktracker != nil && simd.IsASCII(remaining) {
534-
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
535-
// Use optimized SlotTable-based search for large inputs
536-
return e.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
554+
// V11-002 ASCII optimization.
555+
// For start-anchored patterns, limit the IsASCII check to a small prefix
556+
// to avoid O(n) scan of the entire input when only position 0 matters.
557+
if e.asciiBoundedBacktracker != nil {
558+
asciiCheck := remaining
559+
if e.isStartAnchored && len(asciiCheck) > 4096 {
560+
asciiCheck = asciiCheck[:4096]
537561
}
538-
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
539-
if found {
540-
return at + start, at + end, true
562+
if simd.IsASCII(asciiCheck) {
563+
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
564+
if e.dfa != nil && e.reverseDFA != nil {
565+
return e.findIndicesBidirectionalDFA(haystack, at)
566+
}
567+
return e.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
568+
}
569+
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
570+
if found {
571+
return at + start, at + end, true
572+
}
573+
return -1, -1, false
541574
}
542-
return -1, -1, false
543575
}
544576

545577
if !e.boundedBacktracker.CanHandle(len(remaining)) {
546-
// Delegate to NFA path which uses prefilter if available
578+
if e.dfa != nil && e.reverseDFA != nil {
579+
return e.findIndicesBidirectionalDFA(haystack, at)
580+
}
547581
return e.findIndicesNFAAt(haystack, at)
548582
}
549583

@@ -730,6 +764,14 @@ func (e *Engine) findIndicesDigitPrefilter(haystack []byte) (int, int, bool) {
730764
}
731765

732766
pos = digitPos + 1
767+
// When the leading digit class is greedy unbounded (\d+, \d*), all
768+
// positions in the same digit run reach the same DFA state after
769+
// consuming digits, so they all fail identically. Skip the entire run.
770+
if e.digitRunSkipSafe {
771+
for pos < len(haystack) && haystack[pos] >= '0' && haystack[pos] <= '9' {
772+
pos++
773+
}
774+
}
733775
}
734776

735777
return -1, -1, false
@@ -767,6 +809,11 @@ func (e *Engine) findIndicesDigitPrefilterAt(haystack []byte, at int) (int, int,
767809
}
768810

769811
pos = digitPos + 1
812+
if e.digitRunSkipSafe {
813+
for pos < len(haystack) && haystack[pos] >= '0' && haystack[pos] <= '9' {
814+
pos++
815+
}
816+
}
770817
}
771818

772819
return -1, -1, false
@@ -931,46 +978,52 @@ func (e *Engine) findIndicesBoundedBacktrackerAtWithState(haystack []byte, at in
931978
// to search the remaining portion, not the full haystack.
932979
remaining := haystack[at:]
933980

934-
// V11-002 ASCII optimization
935-
if e.asciiBoundedBacktracker != nil && simd.IsASCII(remaining) {
936-
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
937-
// V12 Windowed BoundedBacktracker for ASCII path
938-
maxInput := e.asciiBoundedBacktracker.MaxInputSize()
939-
if maxInput > 0 && len(remaining) > maxInput {
940-
window := remaining[:maxInput]
941-
start, end, found := e.asciiBoundedBacktracker.Search(window)
942-
if found {
943-
return at + start, at + end, true
981+
// V11-002 ASCII optimization.
982+
// For start-anchored patterns, limit the IsASCII check to a small prefix
983+
// to avoid O(n) scan of the entire input when only position 0 matters.
984+
if e.asciiBoundedBacktracker != nil {
985+
asciiCheck := remaining
986+
if e.isStartAnchored && len(asciiCheck) > 4096 {
987+
asciiCheck = asciiCheck[:4096]
988+
}
989+
if simd.IsASCII(asciiCheck) {
990+
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
991+
// Bidirectional DFA: O(n) vs PikeVM's O(n*states)
992+
if e.dfa != nil && e.reverseDFA != nil {
993+
return e.findIndicesBidirectionalDFA(haystack, at)
994+
}
995+
// V12 Windowed BoundedBacktracker for ASCII path
996+
maxInput := e.asciiBoundedBacktracker.MaxInputSize()
997+
if maxInput > 0 && len(remaining) > maxInput {
998+
window := remaining[:maxInput]
999+
start, end, found := e.asciiBoundedBacktracker.Search(window)
1000+
if found {
1001+
return at + start, at + end, true
1002+
}
9441003
}
1004+
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
9451005
}
946-
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
947-
}
948-
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
949-
if found {
950-
return at + start, at + end, true
1006+
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
1007+
if found {
1008+
return at + start, at + end, true
1009+
}
1010+
return -1, -1, false
9511011
}
952-
return -1, -1, false
9531012
}
9541013

9551014
if !e.boundedBacktracker.CanHandle(len(remaining)) {
956-
// V12 Windowed BoundedBacktracker: For large inputs, try searching in a
957-
// window of maxInputSize bytes first. Most patterns produce short matches
958-
// (e.g., word patterns like (\w{2,8})+ match 2-8 chars), so the match
959-
// will be found within the first window. Only fall back to PikeVM if
960-
// no match is found in the window (rare for common patterns).
1015+
// Bidirectional DFA: O(n) vs PikeVM's O(n*states) for large inputs
1016+
if e.dfa != nil && e.reverseDFA != nil {
1017+
return e.findIndicesBidirectionalDFA(haystack, at)
1018+
}
1019+
// V12 Windowed BoundedBacktracker fallback
9611020
maxInput := e.boundedBacktracker.MaxInputSize()
9621021
if maxInput > 0 && len(remaining) > maxInput {
963-
// Search in the first window
9641022
window := remaining[:maxInput]
9651023
start, end, found := e.boundedBacktracker.SearchWithState(window, state.backtracker)
9661024
if found {
967-
// Match found within window - this is the common case
9681025
return at + start, at + end, true
9691026
}
970-
// No match in window - could be:
971-
// 1. No match exists in the full input
972-
// 2. Match exists beyond the window
973-
// Fall back to PikeVM to handle both cases correctly
9741027
}
9751028
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
9761029
}

0 commit comments

Comments
 (0)