Skip to content

Commit fce1691

Browse files
authored
Merge pull request #108 from coregx/feature/pikevm-6mb-optimization
perf: PikeVM 6MB optimization - 1.68x faster than stdlib
2 parents de173be + 92a035d commit fce1691

File tree

11 files changed

+2427
-116
lines changed

11 files changed

+2427
-116
lines changed

.golangci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,12 @@ linters:
163163
- funlen # SIMD wrappers can be long
164164
- gocognit # CPU feature detection complexity
165165

166+
# NFA/PikeVM - hot path with intentional duplication
167+
- path: nfa/pikevm\.go
168+
linters:
169+
- dupl # addThreadForMatch/addThreadToNextForMatch are intentionally duplicated
170+
- nestif # Epsilon closure has nested conditions for performance
171+
166172
# DFA lazy construction - inherently complex
167173
- path: dfa/lazy/.*\.go
168174
linters:

CHANGELOG.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414

1515
---
1616

17+
## [0.11.6] - 2026-02-01
18+
19+
### Performance
20+
- **Windowed BoundedBacktracker for large inputs** (Issue #107)
21+
- V12 optimization: When input exceeds BoundedBacktracker's maxInput (~914KB),
22+
search the first maxInput bytes as a window before falling back to PikeVM
23+
- Most patterns produce short matches found within the first window
24+
- **6MB now 1.68x FASTER than stdlib** (was 2.2x slower!)
25+
- Benchmark for `(\w{2,8})+` on 6MB: 1900ms → 628ms (3x improvement)
26+
- **SlotTable architecture** (Rust-style)
27+
- Per-state slot storage instead of per-thread COW captures
28+
- Dynamic slot sizing: 0 (IsMatch), 2 (Find), full (Captures)
29+
- Lightweight searchThread: 16 bytes (was 40+ bytes)
30+
- **BoundedBacktracker optimizations for word_repeat patterns** (Issue #107)
31+
- Switch from uint32 to uint16 generation tracking (2x memory savings)
32+
- Cache-friendly memory layout: `pos * numStates + state`
33+
- Slice haystack to remaining portion in `findIndicesBoundedBacktrackerAt`
34+
- **PikeVM visited state tracking optimization**
35+
- Consolidate Contains+Insert to single Insert call
36+
- Saves ~8% of SparseSet operations in hot path
37+
- **FindAll/Count sync.Pool overhead elimination**
38+
- Acquire SearchState once, reuse for all iterations
39+
- Allocations reduced from 1.29M to 49 for 6MB input
40+
41+
### Results for `(\w{2,8})+` pattern vs stdlib
42+
| Size | Speedup |
43+
|------|---------|
44+
| 10KB | **1.68x faster** |
45+
| 50KB | **1.88x faster** |
46+
| 100KB | **2.04x faster** |
47+
| 1MB | **1.67x faster** |
48+
| 6MB | **1.68x faster** |
49+
50+
---
51+
1752
## [0.11.5] - 2026-02-01
1853

1954
### Fixed

meta/find_indices.go

Lines changed: 207 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package meta
88
import (
99
"sync/atomic"
1010

11+
"github.com/coregx/coregex/nfa"
1112
"github.com/coregx/coregex/simd"
1213
)
1314

@@ -131,7 +132,7 @@ func (e *Engine) findIndicesNFA(haystack []byte) (int, int, bool) {
131132
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) {
132133
start, end, found = e.boundedBacktracker.SearchAtWithState(haystack, pos, state.backtracker)
133134
} else {
134-
start, end, found = state.pikevm.SearchAt(haystack, pos)
135+
start, end, found = state.pikevm.SearchWithSlotTableAt(haystack, pos, nfa.SearchModeFind)
135136
}
136137
if found {
137138
return start, end, true
@@ -149,7 +150,8 @@ func (e *Engine) findIndicesNFA(haystack []byte) (int, int, bool) {
149150
return e.boundedBacktracker.SearchWithState(haystack, state.backtracker)
150151
}
151152

152-
return state.pikevm.Search(haystack)
153+
// Use optimized SlotTable-based search for large inputs
154+
return state.pikevm.SearchWithSlotTable(haystack, nfa.SearchModeFind)
153155
}
154156

155157
// findIndicesNFAAt searches using NFA starting at position - zero alloc.
@@ -182,7 +184,7 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) {
182184
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) {
183185
start, end, found = e.boundedBacktracker.SearchAtWithState(haystack, pos, state.backtracker)
184186
} else {
185-
start, end, found = state.pikevm.SearchAt(haystack, pos)
187+
start, end, found = state.pikevm.SearchWithSlotTableAt(haystack, pos, nfa.SearchModeFind)
186188
}
187189
if found {
188190
return start, end, true
@@ -200,7 +202,8 @@ func (e *Engine) findIndicesNFAAt(haystack []byte, at int) (int, int, bool) {
200202
return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker)
201203
}
202204

203-
return state.pikevm.SearchAt(haystack, at)
205+
// Use optimized SlotTable-based search for large inputs
206+
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
204207
}
205208

206209
// findIndicesDFA searches using DFA with prefilter - zero alloc.
@@ -474,7 +477,8 @@ func (e *Engine) findIndicesBoundedBacktracker(haystack []byte) (int, int, bool)
474477

475478
atomic.AddUint64(&e.stats.NFASearches, 1)
476479
if !e.boundedBacktracker.CanHandle(len(haystack)) {
477-
return e.pikevm.Search(haystack)
480+
// Use optimized SlotTable-based search for large inputs
481+
return e.pikevm.SearchWithSlotTable(haystack, nfa.SearchModeFind)
478482
}
479483

480484
state := e.getSearchState()
@@ -487,27 +491,47 @@ func (e *Engine) findIndicesBoundedBacktracker(haystack []byte) (int, int, bool)
487491
//
488492
// V11-002 ASCII optimization: When pattern contains '.' and input is ASCII-only,
489493
// uses the faster ASCII NFA.
494+
//
495+
// V11.5 optimization: When searching from position 'at', only check CanHandle for
496+
// the remaining portion haystack[at:], not the full haystack. This allows
497+
// BoundedBacktracker to handle large inputs in FindAll where each successive
498+
// search operates on a smaller remaining portion.
490499
func (e *Engine) findIndicesBoundedBacktrackerAt(haystack []byte, at int) (int, int, bool) {
491500
if e.boundedBacktracker == nil {
492501
return e.findIndicesNFAAt(haystack, at)
493502
}
494503
atomic.AddUint64(&e.stats.NFASearches, 1)
495504

505+
// Slice to remaining portion for more efficient BoundedBacktracker usage.
506+
// This allows BT to handle large inputs in FindAll where we only need
507+
// to search the remaining portion, not the full haystack.
508+
remaining := haystack[at:]
509+
496510
// V11-002 ASCII optimization
497-
if e.asciiBoundedBacktracker != nil && simd.IsASCII(haystack) {
498-
if !e.asciiBoundedBacktracker.CanHandle(len(haystack)) {
499-
return e.pikevm.SearchAt(haystack, at)
511+
if e.asciiBoundedBacktracker != nil && simd.IsASCII(remaining) {
512+
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
513+
// Use optimized SlotTable-based search for large inputs
514+
return e.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
515+
}
516+
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
517+
if found {
518+
return at + start, at + end, true
500519
}
501-
return e.asciiBoundedBacktracker.SearchAt(haystack, at)
520+
return -1, -1, false
502521
}
503522

504-
if !e.boundedBacktracker.CanHandle(len(haystack)) {
505-
return e.pikevm.SearchAt(haystack, at)
523+
if !e.boundedBacktracker.CanHandle(len(remaining)) {
524+
// Delegate to NFA path which uses prefilter if available
525+
return e.findIndicesNFAAt(haystack, at)
506526
}
507527

508528
state := e.getSearchState()
509529
defer e.putSearchState(state)
510-
return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker)
530+
start, end, found := e.boundedBacktracker.SearchWithState(remaining, state.backtracker)
531+
if found {
532+
return at + start, at + end, true
533+
}
534+
return -1, -1, false
511535
}
512536

513537
// findIndicesCharClassSearcher searches using char_class+ searcher - zero alloc.
@@ -753,3 +777,174 @@ func (e *Engine) findIndicesAhoCorasickAt(haystack []byte, at int) (int, int, bo
753777
}
754778
return m.Start, m.End, true
755779
}
780+
781+
// =============================================================================
782+
// Internal state-reusing methods (for findAllIndicesLoop optimization)
783+
// =============================================================================
784+
785+
// findIndicesAtWithState is the state-reusing internal counterpart of FindIndicesAt.
786+
// Used by findAllIndicesLoop and Count, which acquire one SearchState for the whole
// loop instead of paying sync.Pool Get/Put overhead per match.
787+
// It dispatches on e.strategy: the NFA and BoundedBacktracker strategies (and any
788+
// unlisted strategy, which falls back to the NFA path) receive state through their
789+
// *WithState methods; every other strategy is delegated to its existing *At method,
// which does not take state.
//
// Returns the delegate's (start, end, found) result, except that it returns
// (-1, -1, false) immediately when at > 0 and e.nfa.IsAlwaysAnchored() is true.
790+
func (e *Engine) findIndicesAtWithState(haystack []byte, at int, state *SearchState) (start, end int, found bool) {
791+
// Early impossibility check: anchored pattern can only match at position 0
792+
if at > 0 && e.nfa.IsAlwaysAnchored() {
793+
return -1, -1, false
794+
}
795+
796+
switch e.strategy {
797+
case UseNFA:
798+
return e.findIndicesNFAAtWithState(haystack, at, state)
799+
case UseDFA:
800+
// DFA uses e.pikevm (shared) for final bounds, not pooled state
801+
return e.findIndicesDFAAt(haystack, at)
802+
case UseBoth:
803+
// Adaptive uses e.pikevm (shared) or delegates to NFA path
804+
return e.findIndicesAdaptiveAt(haystack, at)
805+
case UseReverseSuffix:
806+
return e.findIndicesReverseSuffixAt(haystack, at)
807+
case UseReverseSuffixSet:
808+
return e.findIndicesReverseSuffixSetAt(haystack, at)
809+
case UseReverseInner:
810+
return e.findIndicesReverseInnerAt(haystack, at)
811+
case UseBoundedBacktracker:
812+
return e.findIndicesBoundedBacktrackerAtWithState(haystack, at, state)
813+
case UseCharClassSearcher:
814+
return e.findIndicesCharClassSearcherAt(haystack, at)
815+
case UseCompositeSearcher:
816+
return e.findIndicesCompositeSearcherAt(haystack, at)
817+
case UseBranchDispatch:
818+
return e.findIndicesBranchDispatchAt(haystack, at)
819+
case UseTeddy:
820+
return e.findIndicesTeddyAt(haystack, at)
821+
case UseDigitPrefilter:
822+
return e.findIndicesDigitPrefilterAt(haystack, at)
823+
case UseAhoCorasick:
824+
return e.findIndicesAhoCorasickAt(haystack, at)
825+
case UseMultilineReverseSuffix:
826+
return e.findIndicesMultilineReverseSuffixAt(haystack, at)
827+
default:
// Any strategy not listed above is handled by the NFA path.
828+
return e.findIndicesNFAAtWithState(haystack, at, state)
829+
}
830+
}
831+
832+
// findIndicesNFAAtWithState searches using NFA starting at position - zero alloc.
833+
// This is the state-reusing version for findAllIndicesLoop optimization.
834+
// It performs no sync.Pool Get/Put itself: it searches with state.backtracker and
// state.pikevm as supplied by the caller.
// NOTE(review): state is presumably not safe to share between goroutines, so the
// caller should own it exclusively for the duration of the call - confirm.
//
// Engine selection: the BoundedBacktracker is used only when e.boundedBacktracker is
// non-nil, e.canMatchEmpty is false, and CanHandle accepts the number of bytes from
// the search position to the end of haystack; otherwise the PikeVM SlotTable search
// runs in nfa.SearchModeFind.
//
// With a prefilter, each candidate returned by e.prefilter.Find is searched in turn,
// and the prefilter path returns (-1, -1, false) without searching when
// at >= len(haystack). Without a prefilter, a single search is run from at.
835+
func (e *Engine) findIndicesNFAAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) {
836+
atomic.AddUint64(&e.stats.NFASearches, 1)
837+
838+
// BoundedBacktracker can be used for Find operations only when safe
839+
useBT := e.boundedBacktracker != nil && !e.canMatchEmpty
840+
841+
// Use prefilter for skip-ahead if available
842+
if e.prefilter != nil {
843+
for at < len(haystack) {
844+
// Find next candidate position via prefilter
845+
pos := e.prefilter.Find(haystack, at)
846+
if pos == -1 {
847+
return -1, -1, false // No more candidates
848+
}
849+
atomic.AddUint64(&e.stats.PrefilterHits, 1)
850+
851+
// Try to match at candidate position
852+
var start, end int
853+
var found bool
854+
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-pos) {
855+
start, end, found = e.boundedBacktracker.SearchAtWithState(haystack, pos, state.backtracker)
856+
} else {
857+
start, end, found = state.pikevm.SearchWithSlotTableAt(haystack, pos, nfa.SearchModeFind)
858+
}
859+
if found {
860+
return start, end, true
861+
}
862+
863+
// Candidate rejected: count the miss and resume the prefilter one byte later
864+
atomic.AddUint64(&e.stats.PrefilterMisses, 1)
865+
at = pos + 1
866+
}
867+
return -1, -1, false
868+
}
869+
870+
// No prefilter: use BoundedBacktracker if available and safe
871+
if useBT && e.boundedBacktracker.CanHandle(len(haystack)-at) {
872+
return e.boundedBacktracker.SearchAtWithState(haystack, at, state.backtracker)
873+
}
874+
875+
// Otherwise (BoundedBacktracker missing, disabled by canMatchEmpty, or unable to
// handle the remaining length) use the PikeVM SlotTable-based search
876+
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
877+
}
878+
879+
// findIndicesBoundedBacktrackerAtWithState searches using bounded backtracker at position.
880+
// This is the state-reusing version for findAllIndicesLoop optimization.
881+
// It performs no sync.Pool Get/Put itself: it searches with state.backtracker and
// state.pikevm as supplied by the caller.
// NOTE(review): state is presumably not safe to share between goroutines, so the
// caller should own it exclusively for the duration of the call - confirm.
882+
//
883+
// V11-002 ASCII optimization: When pattern contains '.' and input is ASCII-only,
884+
// uses the faster ASCII NFA.
885+
//
886+
// V11.5 optimization: When searching from position 'at', only check CanHandle for
887+
// the remaining portion haystack[at:], not the full haystack. This allows
888+
// BoundedBacktracker to handle large inputs in FindAll where each successive
889+
// search operates on a smaller remaining portion.
//
// V12 windowed search: when the remaining portion is still too large for CanHandle,
// the first MaxInputSize() bytes are searched with the backtracker; if that window
// yields no match, the PikeVM SlotTable search runs over haystack from at.
//
// Offsets returned by the sliced searches are relative to haystack[at:] and are
// shifted by at before being returned. haystack[at:] panics if at > len(haystack).
//
// NOTE(review): the sliced searches see no bytes before at; confirm that look-behind
// assertions (e.g. ^ or \b) evaluated at position at still behave as they would with
// the full haystack.
// NOTE(review): a match found inside a window is returned as-is; confirm that a match
// ending at the window boundary cannot be a truncated prefix of a longer match.
890+
func (e *Engine) findIndicesBoundedBacktrackerAtWithState(haystack []byte, at int, state *SearchState) (int, int, bool) {
891+
if e.boundedBacktracker == nil {
892+
return e.findIndicesNFAAtWithState(haystack, at, state)
893+
}
894+
atomic.AddUint64(&e.stats.NFASearches, 1)
895+
896+
// Slice to remaining portion for more efficient BoundedBacktracker usage.
897+
// This allows BT to handle large inputs in FindAll where we only need
898+
// to search the remaining portion, not the full haystack.
899+
remaining := haystack[at:]
900+
901+
// V11-002 ASCII optimization
// NOTE(review): simd.IsASCII(remaining) runs on every call; if it scans the whole
// slice, repeated calls from a FindAll loop may be costly on large inputs - confirm.
902+
if e.asciiBoundedBacktracker != nil && simd.IsASCII(remaining) {
903+
if !e.asciiBoundedBacktracker.CanHandle(len(remaining)) {
904+
// V12 Windowed BoundedBacktracker for ASCII path
905+
maxInput := e.asciiBoundedBacktracker.MaxInputSize()
906+
if maxInput > 0 && len(remaining) > maxInput {
907+
window := remaining[:maxInput]
908+
start, end, found := e.asciiBoundedBacktracker.Search(window)
909+
if found {
910+
return at + start, at + end, true
911+
}
912+
}
// No match in the window (or no usable window): search the full haystack from at
913+
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
914+
}
915+
start, end, found := e.asciiBoundedBacktracker.Search(remaining)
916+
if found {
917+
return at + start, at + end, true
918+
}
919+
return -1, -1, false
920+
}
921+
922+
if !e.boundedBacktracker.CanHandle(len(remaining)) {
923+
// V12 Windowed BoundedBacktracker: For large inputs, try searching in a
924+
// window of maxInputSize bytes first. Most patterns produce short matches
925+
// (e.g., a word pattern like (\w{2,8})+ matches one run of word characters),
926+
// so the match is usually found within the first window. Only fall back to
927+
// PikeVM if no match is found in the window (rare for common patterns).
928+
maxInput := e.boundedBacktracker.MaxInputSize()
929+
if maxInput > 0 && len(remaining) > maxInput {
930+
// Search in the first window
931+
window := remaining[:maxInput]
932+
start, end, found := e.boundedBacktracker.SearchWithState(window, state.backtracker)
933+
if found {
934+
// Match found within window - this is the common case
935+
return at + start, at + end, true
936+
}
937+
// No match in window - could be:
938+
// 1. No match exists in the full input
939+
// 2. Match exists beyond the window
940+
// Fall back to PikeVM to handle both cases correctly
941+
}
942+
return state.pikevm.SearchWithSlotTableAt(haystack, at, nfa.SearchModeFind)
943+
}
944+
945+
start, end, found := e.boundedBacktracker.SearchWithState(remaining, state.backtracker)
946+
if found {
947+
return at + start, at + end, true
948+
}
949+
return -1, -1, false
950+
}

meta/findall.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ func (e *Engine) FindAllIndicesStreaming(haystack []byte, n int, results [][2]in
109109
}
110110

111111
// findAllIndicesLoop is the standard loop-based FindAll for non-streaming strategies.
112+
// Optimized: acquires SearchState once for entire loop to avoid sync.Pool overhead per match.
112113
func (e *Engine) findAllIndicesLoop(haystack []byte, n int, results [][2]int) [][2]int {
113114
if results == nil {
114115
// Smart allocation: anchored patterns have max 1 match, others use capped heuristic.
@@ -132,8 +133,12 @@ func (e *Engine) findAllIndicesLoop(haystack []byte, n int, results [][2]int) []
132133
pos := 0
133134
lastMatchEnd := -1
134135

136+
// Get state ONCE for entire iteration - eliminates 1.29M sync.Pool ops for FindAll
137+
state := e.getSearchState()
138+
defer e.putSearchState(state)
139+
135140
for n <= 0 || len(results) < n {
136-
start, end, found := e.FindIndicesAt(haystack, pos)
141+
start, end, found := e.findIndicesAtWithState(haystack, pos, state)
137142
if !found {
138143
break
139144
}
@@ -169,6 +174,7 @@ func (e *Engine) findAllIndicesLoop(haystack []byte, n int, results [][2]int) []
169174
// This is optimized for counting without allocating result slices.
170175
// Uses early termination for boolean checks at each step.
171176
// If n > 0, counts at most n matches. If n <= 0, counts all matches.
177+
// Optimized: acquires SearchState once for entire loop to avoid sync.Pool overhead per match.
172178
//
173179
// Example:
174180
//
@@ -184,9 +190,13 @@ func (e *Engine) Count(haystack []byte, n int) int {
184190
pos := 0
185191
lastNonEmptyEnd := -1
186192

193+
// Get state ONCE for entire iteration - eliminates sync.Pool overhead per match
194+
state := e.getSearchState()
195+
defer e.putSearchState(state)
196+
187197
for pos <= len(haystack) {
188-
// Use zero-allocation FindIndicesAt
189-
start, end, found := e.FindIndicesAt(haystack, pos)
198+
// Use state-reusing version for zero sync.Pool overhead per match
199+
start, end, found := e.findIndicesAtWithState(haystack, pos, state)
190200
if !found {
191201
break
192202
}

0 commit comments

Comments
 (0)