@@ -11,7 +11,7 @@ internal static class FuzzyWordMatcher
1111 private const double DefaultErrorProbability = 0.04 ;
1212 private const double DefaultTailProbability = 0.01 ;
1313
14- public static void Match ( ref MatchState state , int minWordSize , int levenshteinMaxWordSize )
14+ public static void Match ( ref MatchState state , CoverageSetup setup )
1515 {
1616 int qCount = state . QCount ;
1717 int dCount = state . DCount ;
@@ -23,23 +23,30 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
2323
2424 if ( maxQueryLength == 0 ) return ;
2525
26- // Use a principled maximum edit distance based on a simple
27- // Binomial(L, p) error model instead of an uncalibrated
28- // relative distance knob.
29- //
30- // We choose the smallest d such that:
31- // Pr[D ≤ d] ≥ 1 - alpha
32- //
33- // where D ~ Binomial(L, p), p ≈ DefaultErrorProbability and
34- // alpha ≈ DefaultTailProbability.
35- int maxEditDist = EditDistanceModel . GetMaxEditsForLength (
36- maxQueryLength ,
37- DefaultErrorProbability ,
38- DefaultTailProbability ) ;
26+ int maxEditDist ;
27+
28+ if ( maxQueryLength >= setup . MinLengthTwoTypos )
29+ maxEditDist = 2 ;
30+ else if ( maxQueryLength >= setup . MinLengthOneTypo )
31+ maxEditDist = 1 ;
32+ else
33+ maxEditDist = 0 ;
3934
40- // Ensure we allow at least one edit for non-empty queries.
41- if ( maxEditDist < 1 )
35+ // Special handling for len=2 queries when the default thresholds forbid typos (maxEditDist=0).
36+ // If the longest query word has length 2 and would normally get no typo budget,
37+ // we conditionally allow 1 typo (when setup.NumTypos >= 1); the per-token check below requires the first character to match.
38+ // This supports common cases like "te" -> "the" (insertion) or "te" -> "to" (substitution) while avoiding high-noise matches like "at" -> "it" or "at" -> "cat".
39+ bool hasSpecialShortWord = ( maxQueryLength == 2 && maxEditDist == 0 && setup . NumTypos >= 1 ) ;
40+ if ( hasSpecialShortWord )
41+ {
4242 maxEditDist = 1 ;
43+ }
44+
45+ // Respect the global cap
46+ if ( maxEditDist > setup . NumTypos )
47+ maxEditDist = setup . NumTypos ;
48+
49+ if ( maxEditDist == 0 ) return ;
4350
4451 for ( int editDist = 1 ; editDist <= maxEditDist ; editDist ++ )
4552 {
@@ -55,12 +62,32 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
5562 int qLen = qSlice . Length ;
5663
5764 // Skip query tokens that are too short for meaningful fuzzy matching
58- // but allow 2-char tokens since they can fuzzy-match 3-char words (e.g., "te" → "the")
59- if ( qLen < minWordSize ) continue ;
65+ if ( qLen < setup . MinWordSize ) continue ;
66+
67+ // Calculate max edits for THIS token
68+ int tokenMaxEdits = 0 ;
69+ if ( qLen >= setup . MinLengthTwoTypos ) tokenMaxEdits = 2 ;
70+ else if ( qLen >= setup . MinLengthOneTypo ) tokenMaxEdits = 1 ;
71+ else tokenMaxEdits = 0 ;
6072
73+ // Apply special short word logic for individual token
74+ bool isSpecialShortCase = false ;
75+ if ( qLen == 2 && tokenMaxEdits == 0 && setup . NumTypos >= 1 )
76+ {
77+ tokenMaxEdits = 1 ;
78+ isSpecialShortCase = true ;
79+ }
80+
81+ if ( tokenMaxEdits > setup . NumTypos ) tokenMaxEdits = setup . NumTypos ;
82+
83+ if ( editDist > tokenMaxEdits ) continue ;
84+
85+ // For special short case, we only process editDist=1
86+ if ( isSpecialShortCase && editDist != 1 ) continue ;
87+
6188 // Calculate the valid document word length range for this query token and edit distance
62- int minLen = Math . Max ( minWordSize , qLen - editDist ) ;
63- int maxLen = Math . Min ( levenshteinMaxWordSize , qLen + editDist ) ;
89+ int minLen = Math . Max ( setup . MinWordSize , qLen - editDist ) ;
90+ int maxLen = Math . Min ( setup . LevenshteinMaxWordSize , qLen + editDist ) ;
6491 if ( maxLen > 63 ) maxLen = 63 ;
6592
6693 ReadOnlySpan < char > qText = state . QuerySpan . Slice ( qSlice . Offset , qSlice . Length ) ;
@@ -73,6 +100,16 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
73100 if ( dLen > maxLen || dLen < minLen ) continue ;
74101
75102 ReadOnlySpan < char > dText = state . DocSpan . Slice ( dSlice . Offset , dSlice . Length ) ;
103+
104+ // Enforce special short word constraints
105+ if ( isSpecialShortCase )
106+ {
107+ // Special handling for short words: First character MUST match.
108+ // This allows "te" -> "the" (ins) or "te" -> "to" (sub),
109+ // but prevents high-noise matches like "at" -> "cat" (prefix ins) or "at" -> "it" (start sub).
110+ if ( dText . Length == 0 || char . ToLowerInvariant ( dText [ 0 ] ) != char . ToLowerInvariant ( qText [ 0 ] ) )
111+ continue ;
112+ }
76113
77114 int dist = LevenshteinDistance . CalculateDamerau ( qText , dText , editDist , ignoreCase : true ) ;
78115
@@ -107,4 +144,3 @@ public static bool AllTermsFullyMatched(ref MatchState state)
107144 return true ;
108145 }
109146}
110-
0 commit comments