Skip to content

Commit f6f6a76

Browse files
authored
Merge pull request #117 from coregx/feature/issue116-reverse-suffix-fix
fix: reject alternation patterns from ReverseSuffixSet (Issue #116)
2 parents 2516b6a + 5c5392b commit f6f6a76

File tree

6 files changed

+136
-11
lines changed

6 files changed

+136
-11
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1212
- ARM NEON SIMD support (waiting for Go 1.26 native SIMD)
1313
- SIMD prefilter for CompositeSequenceDFA (#83)
1414

15+
## [0.12.2] - 2026-02-16
16+
17+
### Fixed
18+
- **Alternation patterns misrouted to ReverseSuffixSet** (Issue #116) —
19+
Patterns like `[cgt]gggtaaa|tttaccc[acg]` (alternation without `.*` prefix)
20+
were incorrectly routed to UseReverseSuffixSet strategy, which assumed
21+
match always starts at position 0. This produced wrong match boundaries
22+
(e.g., `[0, 37976]` instead of `[3, 11]`). Fix: added `isSafeForReverseSuffix`
23+
guard before UseReverseSuffixSet selection.
24+
- **`matchStartZero` optimization too aggressive** — The reverse DFA skip
25+
optimization (`matchStartZero`) was enabled for all unanchored patterns,
26+
but is only correct for `.*` prefix patterns. Patterns like `[^\s]+\.txt`
27+
or `.+\.txt` could return wrong match start positions. Fix: restricted
28+
`matchStartZero` to patterns with explicit `OpStar(AnyChar)` prefix.
29+
1530
## [0.12.1] - 2026-02-15
1631

1732
### Performance

meta/compile.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func buildReverseSearchers(
217217

218218
case UseReverseSuffix:
219219
suffixLiterals := extractor.ExtractSuffixes(re)
220-
searcher, err := NewReverseSuffixSearcher(nfaEngine, suffixLiterals, dfaConfig)
220+
searcher, err := NewReverseSuffixSearcher(nfaEngine, suffixLiterals, dfaConfig, hasDotStarPrefix(re))
221221
if err != nil {
222222
result.finalStrategy = UseDFA
223223
} else {
@@ -226,7 +226,7 @@ func buildReverseSearchers(
226226

227227
case UseReverseSuffixSet:
228228
suffixLiterals := extractor.ExtractSuffixes(re)
229-
searcher, err := NewReverseSuffixSetSearcher(nfaEngine, suffixLiterals, dfaConfig)
229+
searcher, err := NewReverseSuffixSetSearcher(nfaEngine, suffixLiterals, dfaConfig, hasDotStarPrefix(re))
230230
if err != nil {
231231
result.finalStrategy = UseBoth
232232
} else {

meta/reverse_suffix.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ func NewReverseSuffixSearcher(
7272
forwardNFA *nfa.NFA,
7373
suffixLiterals *literal.Seq,
7474
config lazy.Config,
75+
matchStartZero bool,
7576
) (*ReverseSuffixSearcher, error) {
7677
// Get suffix bytes from longest common suffix
7778
var suffixBytes []byte
@@ -111,11 +112,9 @@ func NewReverseSuffixSearcher(
111112
// Create PikeVM for fallback
112113
pikevm := nfa.NewPikeVM(forwardNFA)
113114

114-
// Check if pattern is unanchored (starts matching from position 0)
115-
// For unanchored patterns with .* prefix, match always starts at 0
116-
// This allows us to skip the reverse DFA scan entirely!
117-
matchStartZero := !forwardNFA.IsAlwaysAnchored()
118-
115+
// matchStartZero is true only when pattern has .* prefix (e.g., `.*\.txt`).
116+
// Only OpStar(AnyChar) guarantees match starts at 0/at — skip reverse DFA.
117+
// Other wildcards like .+, [^\s]+, \w{2,8} do NOT guarantee this.
119118
return &ReverseSuffixSearcher{
120119
forwardNFA: forwardNFA,
121120
reverseNFA: reverseNFA,

meta/reverse_suffix_set.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func NewReverseSuffixSetSearcher(
6161
forwardNFA *nfa.NFA,
6262
suffixLiterals *literal.Seq,
6363
config lazy.Config,
64+
matchStartZero bool,
6465
) (*ReverseSuffixSetSearcher, error) {
6566
if suffixLiterals == nil || suffixLiterals.IsEmpty() {
6667
return nil, ErrNoSuffixSet
@@ -103,9 +104,8 @@ func NewReverseSuffixSetSearcher(
103104
// Create PikeVM for fallback
104105
pikevm := nfa.NewPikeVM(forwardNFA)
105106

106-
// Check if pattern is unanchored (starts matching from position 0)
107-
matchStartZero := !forwardNFA.IsAlwaysAnchored()
108-
107+
// matchStartZero is true only when pattern has .* prefix (e.g., `.*\.(txt|log|md)`).
108+
// Only OpStar(AnyChar) guarantees match starts at 0/at — skip reverse DFA.
109109
return &ReverseSuffixSetSearcher{
110110
forwardNFA: forwardNFA,
111111
reverseNFA: reverseNFA,

meta/reverse_suffix_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,3 +391,88 @@ func TestReverseSuffix_EdgeCases(t *testing.T) {
391391
})
392392
}
393393
}
394+
395+
// TestIssue116_AlternationWithoutWildcard tests that alternation patterns without
396+
// wildcard prefix (like `[cgt]gggtaaa|tttaccc[acg]`) are NOT routed to
397+
// UseReverseSuffixSet, which would produce wrong match positions.
398+
// See: https://github.com/coregx/coregex/issues/116
399+
func TestIssue116_AlternationWithoutWildcard(t *testing.T) {
400+
tests := []struct {
401+
name string
402+
pattern string
403+
input string
404+
want []string
405+
}{
406+
{
407+
name: "original bug report pattern",
408+
pattern: `[cgt]gggtaaa|tttaccc[acg]`,
409+
input: "xxxcgggtaaaxxx",
410+
want: []string{"cgggtaaa"},
411+
},
412+
{
413+
name: "original bug report pattern - second alt",
414+
pattern: `[cgt]gggtaaa|tttaccc[acg]`,
415+
input: "xxxtttacccaxxx",
416+
want: []string{"tttaccca"},
417+
},
418+
{
419+
name: "multiple matches",
420+
pattern: `[cgt]gggtaaa|tttaccc[acg]`,
421+
input: "cgggtaaa tttaccca ggggtaaa tttacccg",
422+
want: []string{"cgggtaaa", "tttaccca", "ggggtaaa", "tttacccg"},
423+
},
424+
{
425+
name: "no match",
426+
pattern: `[cgt]gggtaaa|tttaccc[acg]`,
427+
input: "agggtaaa tttacccd",
428+
want: nil,
429+
},
430+
{
431+
name: "simple alternation without char class",
432+
pattern: `foo|bar`,
433+
input: "xxxfooxxxbarxxx",
434+
want: []string{"foo", "bar"},
435+
},
436+
}
437+
438+
for _, tt := range tests {
439+
t.Run(tt.name, func(t *testing.T) {
440+
e, err := Compile(tt.pattern)
441+
if err != nil {
442+
t.Fatalf("Compile(%q) failed: %v", tt.pattern, err)
443+
}
444+
445+
// Verify strategy is NOT UseReverseSuffixSet for these patterns
446+
strategy := e.Strategy()
447+
if strategy == UseReverseSuffixSet {
448+
t.Errorf("pattern %q should NOT use UseReverseSuffixSet strategy", tt.pattern)
449+
}
450+
451+
var got []string
452+
haystack := []byte(tt.input)
453+
at := 0
454+
for at < len(haystack) {
455+
start, end, found := e.FindIndicesAt(haystack, at)
456+
if !found {
457+
break
458+
}
459+
got = append(got, string(haystack[start:end]))
460+
if end > at {
461+
at = end
462+
} else {
463+
at++
464+
}
465+
}
466+
467+
if len(got) != len(tt.want) {
468+
t.Fatalf("FindAll(%q) = %v (%d matches), want %v (%d matches)",
469+
tt.input, got, len(got), tt.want, len(tt.want))
470+
}
471+
for i := range got {
472+
if got[i] != tt.want[i] {
473+
t.Errorf("match[%d] = %q, want %q", i, got[i], tt.want[i])
474+
}
475+
}
476+
})
477+
}
478+
}

meta/strategy.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,30 @@ func isDigitRunSkipSafe(re *syntax.Regexp) bool {
574574
// - Internal anchors: `0?^0`, `a$b` - position constraints don't reverse
575575
// - Star of CharClass: `[^\s]*suffix` - zero-width match edge cases
576576

577+
// hasDotStarPrefix checks if a pattern has `.*` as its first subexpression.
578+
// Only `.*` (OpStar of AnyChar/AnyCharNotNL) guarantees matching from position 0
579+
// to any position without fail. Used to determine if reverse DFA scan can be skipped.
580+
//
581+
// Returns true for: `.*\.txt`, `(.*)\.txt`
582+
// Returns false for: `.+\.txt`, `[^\s]+\.txt`, `\w{2,8}\.txt`, `foo|bar`
583+
func hasDotStarPrefix(re *syntax.Regexp) bool {
584+
// Unwrap captures
585+
for re.Op == syntax.OpCapture && len(re.Sub) > 0 {
586+
re = re.Sub[0]
587+
}
588+
if re.Op != syntax.OpConcat || len(re.Sub) < 2 {
589+
return false
590+
}
591+
first := re.Sub[0]
592+
// Unwrap captures on first sub
593+
for first.Op == syntax.OpCapture && len(first.Sub) > 0 {
594+
first = first.Sub[0]
595+
}
596+
// .* = OpStar(OpAnyChar or OpAnyCharNotNL)
597+
return first.Op == syntax.OpStar && len(first.Sub) > 0 &&
598+
(first.Sub[0].Op == syntax.OpAnyChar || first.Sub[0].Op == syntax.OpAnyCharNotNL)
599+
}
600+
577601
// isWildcardSubexpression checks if a subexpression acts as a "wildcard" that can
578602
// consume variable-length input. Used by isSafeForReverseSuffix to identify patterns
579603
// suitable for reverse suffix search.
@@ -995,7 +1019,9 @@ func selectReverseStrategy(n *nfa.NFA, re *syntax.Regexp, literals *literal.Seq,
9951019

9961020
// No common suffix (LCS empty), but check if multiple suffix literals available
9971021
// for Teddy multi-suffix prefilter. This handles patterns like `.*\.(txt|log|md)`.
998-
if shouldUseReverseSuffixSet(literals, suffixLiterals) {
1022+
// Must also pass isSafeForReverseSuffix to reject patterns without wildcard prefix
1023+
// (e.g., `[cgt]gggtaaa|tttaccc[acg]` — Issue #116).
1024+
if isSafeForReverseSuffix(re) && shouldUseReverseSuffixSet(literals, suffixLiterals) {
9991025
return UseReverseSuffixSet
10001026
}
10011027

0 commit comments

Comments
 (0)