fix: checkHasWordBoundary 7M x slowdown → 8.6x faster than stdlib (#105) (#106)

kolkov · web-flow · commit de173be713fa · 2026-02-01T12:46:09.000+03:00
* fix: checkHasWordBoundary 7M x slowdown (#105) Root cause: O(N*M) complexity from scanning all NFA states per byte. Fixes applied: - Use NewBuilderWithWordBoundary() to avoid repeated O(N) scans - Add hasWordBoundary guards to skip unnecessary checks - Use anchored search for prefilter verification Result: 3m22s → 30µs (6,600,000x faster, stdlib parity) Closes #105 * perf: replace map with slice for DFA state lookup State IDs are assigned sequentially (0, 1, 2...), so direct slice indexing is faster than map hash lookup. Before: map lookups were 42% of CPU time After: slice indexing is only 3% of CPU time Issue #105 pattern benchmark: - Before: 99,510 ns/op - After: 56,384 ns/op (1.77x faster) * perf: extract literals from capture/repeat groups Literal extractor now looks inside OpRepeat and OpCapture to find literal prefixes for better prefilters. Example: `=($\w...){2}` now extracts `=$` (2 bytes) instead of just `=` (1 byte). Issue #105 pattern benchmark (79KB): - Before: 56.4 µs (2.2x slower than stdlib) - After: 3.6 µs (7.9x FASTER than stdlib) The 2-byte prefilter `=$` is much more selective, reducing false positives.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ---
 
+## [0.11.5] - 2026-02-01
+
+### Fixed
+- **checkHasWordBoundary catastrophic slowdown** (Issue #105)
+  - Patterns with `\w{n,m}` quantifiers were **7,000,000x slower** than stdlib
+  - Root cause: O(N*M) complexity from scanning all NFA states per byte
+  - Fix: Use `NewBuilderWithWordBoundary()`, add `hasWordBoundary` guards, anchored prefilter verification
+  - **Result: 3m22s → 3.6µs** (56,000,000x faster, **7.9x faster than stdlib**)
+
+### Performance
+- **DFA state lookup: map → slice** — 42% CPU time eliminated
+  - State IDs are sequential, so direct slice indexing beats hash lookups
+- **Literal extraction from capture/repeat groups** — better prefilters
+  - `=(\$\w...){2}` now extracts `=$` (2 bytes) instead of just `=` (1 byte)
+  - Reduces false positives in prefilter, massive speedup on selective patterns
+
+### Technical Details
+- Added `searchEarliestMatchAnchored()` for O(1) prefilter verification
+- Replaced `stateByID map[StateID]*State` with `states []*State`
+- Extended `tryExpandConcatSuffix()` to unwrap OpRepeat/OpCapture
+- Credits: @danslo for root cause analysis and fix suggestions
+
+---
+
 ## [0.11.4] - 2026-01-16
 
 ### Fixed
diff --git a/dfa/lazy/builder.go b/dfa/lazy/builder.go
@@ -128,7 +128,7 @@ func (b *Builder) Build() (*DFA, error) {
 		config:           b.config,
 		prefilter:        pf,
 		pikevm:           nfa.NewPikeVM(b.nfa),
-		stateByID:        make(map[StateID]*State, b.config.MaxStates),
+		states:           make([]*State, 0, b.config.MaxStates),
 		startTable:       startTable,
 		byteClasses:      b.nfa.ByteClasses(),
 		freshStartStates: freshStartStates,
diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go
@@ -59,9 +59,10 @@ type DFA struct {
 	prefilter prefilter.Prefilter
 	pikevm    *nfa.PikeVM
 
-	// stateByID provides O(1) lookup of states by ID
-	// This maps StateID → *State for fast access during search
-	stateByID map[StateID]*State
+	// states provides O(1) lookup of states by ID via direct indexing.
+	// StateIDs are sequential (0, 1, 2...), so slice indexing is faster than map.
+	// This is a critical optimization - map lookups were 42% of CPU time!
+	states []*State
 
 	// startTable caches start states for different look-behind contexts
 	// This enables correct handling of assertions (^, \b, etc.) and
@@ -243,7 +244,8 @@ func (d *DFA) SearchAtAnchored(haystack []byte, at int) int {
 	for pos := at; pos < len(haystack); pos++ {
 		b := haystack[pos]
 
-		if d.checkWordBoundaryMatch(currentState, b) {
+		// Skip expensive check for patterns without word boundaries (Issue #105)
+		if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) {
 			return pos
 		}
 
@@ -322,8 +324,9 @@ func (d *DFA) isMatchWithPrefilter(haystack []byte) bool {
 		return false
 	}
 
-	// Try to match at candidate - use early termination
-	if d.searchEarliestMatch(haystack, pos) {
+	// Try to match at candidate - use ANCHORED search to verify match starts here
+	// Issue #105: unanchored search caused catastrophic slowdown
+	if d.searchEarliestMatchAnchored(haystack, pos) {
 		return true
 	}
 
@@ -335,7 +338,7 @@ func (d *DFA) isMatchWithPrefilter(haystack []byte) bool {
 			return false
 		}
 		pos = candidate
-		if d.searchEarliestMatch(haystack, pos) {
+		if d.searchEarliestMatchAnchored(haystack, pos) {
 			return true
 		}
 	}
@@ -393,7 +396,8 @@ func (d *DFA) searchEarliestMatch(haystack []byte, startPos int) bool {
 		// This handles patterns like `test\b` where after matching "test",
 		// the next byte '!' creates a word boundary that satisfies \b.
 		// We need to detect this match before trying to consume '!'.
-		if d.checkWordBoundaryMatch(currentState, b) {
+		// Skip expensive check for patterns without word boundaries (Issue #105)
+		if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) {
 			return true
 		}
 
@@ -444,6 +448,75 @@ func (d *DFA) searchEarliestMatch(haystack []byte, startPos int) bool {
 	return d.checkEOIMatch(currentState)
 }
 
+// searchEarliestMatchAnchored performs ANCHORED DFA search with early termination.
+// Unlike searchEarliestMatch, this requires the match to START exactly at startPos.
+// This is critical for prefilter verification - we need to confirm the match
+// actually starts at the candidate position, not somewhere after it.
+//
+// Issue #105: Using unanchored search for prefilter verification caused
+// catastrophic slowdown because it would re-scan from candidate to end.
+func (d *DFA) searchEarliestMatchAnchored(haystack []byte, startPos int) bool {
+	if startPos > len(haystack) {
+		return false
+	}
+
+	// Get ANCHORED start state (requires match to start exactly at startPos)
+	currentState := d.getStartState(haystack, startPos, true)
+	if currentState == nil {
+		// Fallback to NFA with anchored search
+		start, end, matched := d.pikevm.SearchAt(haystack, startPos)
+		// For anchored: match must start exactly at startPos
+		return matched && start == startPos && end >= start
+	}
+
+	// Check if start state is already a match (e.g., empty pattern)
+	if currentState.IsMatch() {
+		return true
+	}
+
+	// Scan input byte by byte with early termination
+	for pos := startPos; pos < len(haystack); pos++ {
+		b := haystack[pos]
+
+		// Skip expensive check for patterns without word boundaries (Issue #105)
+		if d.hasWordBoundary && d.checkWordBoundaryMatch(currentState, b) {
+			return true
+		}
+
+		// Get next state
+		classIdx := d.byteToClass(b)
+		nextID, ok := currentState.Transition(classIdx)
+		switch {
+		case !ok:
+			nextState, err := d.determinize(currentState, b)
+			if err != nil {
+				start, end, matched := d.pikevm.SearchAt(haystack, startPos)
+				return matched && start == startPos && end >= start
+			}
+			if nextState == nil {
+				return false
+			}
+			currentState = nextState
+
+		case nextID == DeadState:
+			return false
+
+		default:
+			currentState = d.getState(nextID)
+			if currentState == nil {
+				start, end, matched := d.pikevm.SearchAt(haystack, startPos)
+				return matched && start == startPos && end >= start
+			}
+		}
+
+		if currentState.IsMatch() {
+			return true
+		}
+	}
+
+	return d.checkEOIMatch(currentState)
+}
+
 // findWithPrefilterAt searches using prefilter to accelerate unanchored search.
 // This is used by FindAt to correctly handle anchors when searching from non-zero positions.
 func (d *DFA) findWithPrefilterAt(haystack []byte, startAt int) int {
@@ -801,17 +874,23 @@ func (d *DFA) getState(id StateID) *State {
 		return nil
 	}
 
-	// O(1) lookup via stateByID map
-	state, ok := d.stateByID[id]
-	if !ok {
+	// O(1) lookup via direct slice indexing (faster than map!)
+	idx := int(id)
+	if idx >= len(d.states) {
 		return nil
 	}
-	return state
+	return d.states[idx]
 }
 
-// registerState adds a state to the ID-based lookup map
+// registerState adds a state to the states slice for O(1) lookup.
+// StateIDs are assigned sequentially, so we can use direct indexing.
 func (d *DFA) registerState(state *State) {
-	d.stateByID[state.ID()] = state
+	id := int(state.ID())
+	// Grow slice if needed
+	for len(d.states) <= id {
+		d.states = append(d.states, nil)
+	}
+	d.states[id] = state
 }
 
 // checkEOIMatch checks if the current state would match at end-of-input.
@@ -828,7 +907,8 @@ func (d *DFA) checkEOIMatch(state *State) bool {
 	}
 
 	// Create a temporary builder for EOI resolution
-	builder := NewBuilder(d.nfa, d.config)
+	// Use NewBuilderWithWordBoundary to avoid O(states) scan per call (Issue #105)
+	builder := NewBuilderWithWordBoundary(d.nfa, d.config, d.hasWordBoundary)
 	return builder.CheckEOIMatch(state.NFAStates(), state.IsFromWord())
 }
 
@@ -853,7 +933,8 @@ func (d *DFA) checkWordBoundaryMatch(state *State, nextByte byte) bool {
 		return false
 	}
 
-	builder := NewBuilder(d.nfa, d.config)
+	// Use NewBuilderWithWordBoundary to avoid O(states) scan per call (Issue #105)
+	builder := NewBuilderWithWordBoundary(d.nfa, d.config, d.hasWordBoundary)
 	isFromWord := state.IsFromWord()
 	isNextWord := isWordByte(nextByte)
 	wordBoundarySatisfied := isFromWord != isNextWord
@@ -1017,7 +1098,7 @@ func (d *DFA) CacheStats() (size int, capacity uint32, hits, misses uint64, hitR
 // Primarily useful for testing and benchmarking.
 func (d *DFA) ResetCache() {
 	d.cache.Clear()
-	d.stateByID = make(map[StateID]*State, d.config.MaxStates)
+	d.states = make([]*State, 0, d.config.MaxStates)
 
 	// Reset StartTable
 	d.startTable = NewStartTable()
diff --git a/literal/extractor.go b/literal/extractor.go
@@ -576,6 +576,13 @@ func (e *Extractor) tryExpandConcatSuffix(prefixes *Seq, subs []*syntax.Regexp,
 		}
 	}
 
+	// Try Capture/Repeat group expansion: =(\$...){2} → =$...
+	// Extract literal prefix from inside the group and append to our prefixes.
+	// This handles patterns like `=(\$\w{1,10}...){2}` where `=$` is a better prefilter than just `=`.
+	if expanded := e.tryExpandRepeatCapture(prefixes, nextSub, depth); expanded != nil {
+		return expanded
+	}
+
 	// Default: mark prefixes as incomplete since more elements follow
 	lits := make([]Literal, prefixes.Len())
 	for i := 0; i < prefixes.Len(); i++ {
@@ -585,6 +592,49 @@ func (e *Extractor) tryExpandConcatSuffix(prefixes *Seq, subs []*syntax.Regexp,
 	return NewSeq(lits...)
 }
 
+// tryExpandRepeatCapture attempts to extract literals from inside OpRepeat/OpCapture groups.
+// For pattern `=(\$\w...){2}`, this extracts `=$` instead of just `=`.
+// Returns nil if no expansion is possible.
+func (e *Extractor) tryExpandRepeatCapture(prefixes *Seq, nextSub *syntax.Regexp, depth int) *Seq {
+	// Unwrap OpRepeat if present (e.g., {2,2} or {1,10})
+	innerNode := nextSub
+	if innerNode.Op == syntax.OpRepeat && len(innerNode.Sub) > 0 && innerNode.Min >= 1 {
+		innerNode = innerNode.Sub[0]
+	}
+	// Unwrap OpCapture if present
+	if innerNode.Op == syntax.OpCapture && len(innerNode.Sub) > 0 {
+		innerNode = innerNode.Sub[0]
+	}
+	// Only proceed if we actually unwrapped something
+	if innerNode == nextSub {
+		return nil
+	}
+
+	innerPrefixes := e.extractPrefixes(innerNode, depth+1)
+	if innerPrefixes.IsEmpty() {
+		return nil
+	}
+
+	// Append inner prefixes to our prefixes
+	var expanded []Literal
+	for i := 0; i < prefixes.Len(); i++ {
+		prefix := prefixes.Get(i)
+		for j := 0; j < innerPrefixes.Len(); j++ {
+			inner := innerPrefixes.Get(j)
+			combined := append([]byte{}, prefix.Bytes...)
+			combined = append(combined, inner.Bytes...)
+			if len(combined) > e.config.MaxLiteralLen {
+				combined = combined[:e.config.MaxLiteralLen]
+			}
+			expanded = append(expanded, NewLiteral(combined, false))
+		}
+	}
+	if len(expanded) == 0 {
+		return nil
+	}
+	return NewSeq(expanded...)
+}
+
 // expandLiteralAlternate expands Literal + Alternation back into individual complete literals.
 // This handles the case where the regex parser factors common prefixes:
 //
diff --git a/meta/issue105_test.go b/meta/issue105_test.go