Skip to content

Commit 37753aa

Browse files
committed
improve recall on short documents
1 parent dfffe44 commit 37753aa

6 files changed

Lines changed: 201 additions & 24 deletions

File tree

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
using Microsoft.VisualStudio.TestTools.UnitTesting;
2+
using Infidex.Core;
3+
using Infidex.Api;
4+
using System.Linq;
5+
using System;
6+
7+
namespace Infidex.Tests;
8+
9+
/// <summary>
/// Regression tests for fuzzy ranking when the indexed documents are very short titles
/// that differ from each other by only a few characters.
/// </summary>
[TestClass]
public class FuzzyRegressionTests
{
    private SearchEngine _engine = null!;

    /// <summary>
    /// Creates a default engine and indexes five near-identical short titles before each test.
    /// </summary>
    [TestInitialize]
    public void Setup()
    {
        _engine = SearchEngine.CreateDefault();

        var docs = new[]
        {
            new Document(1L, "The Mat"),
            new Document(2L, "The Matrix"),
            new Document(3L, "The Matriarx"),
            new Document(4L, "The Match"),
            new Document(5L, "The Meatrix")
        };

        _engine.IndexDocuments(docs);
    }

    /// <summary>
    /// Querying "the matrx" must rank "The Matrix" (doc 2) above "The Mat" (doc 1).
    /// </summary>
    [TestMethod]
    public void Search_TheMatrx_RanksMatrixAboveMat()
    {
        // "matrx" is a one-edit typo of "matrix", so doc 2 is a whole-word fuzzy match.
        // "The Mat" can at best match "matrx" partially (shared n-grams), and the intent
        // is that such a partial match is ranked lower or suppressed altogether.
        // "The Matriarx" (doc 3) may also share n-grams with the query, but it is not
        // part of this assertion.
        var result = _engine.Search(new Query("the matrx", 10));

        Console.WriteLine("Results for 'the matrx':");
        foreach (var r in result.Records)
        {
            Console.WriteLine($"[{r.Score:F1}] Doc {r.DocumentId}");
        }

        var doc2 = result.Records.FirstOrDefault(r => r.DocumentId == 2L); // The Matrix

        Assert.IsNotNull(doc2, "The Matrix should be found");

        // "The Mat" may legitimately be missing from the results (suppressed partial match).
        // In that case "The Matrix" trivially outranks it; dereferencing a missing record
        // here would fail the test with a NullReferenceException instead of passing.
        if (!result.Records.Any(r => r.DocumentId == 1L))
        {
            return;
        }

        var doc1 = result.Records.First(r => r.DocumentId == 1L); // The Mat

        // The Matrix should score higher than The Mat
        Assert.IsTrue(doc2.Score > doc1.Score,
            $"The Matrix ({doc2.Score}) should rank higher than The Mat ({doc1.Score})");
    }
}

src/Infidex/Coverage/CoverageEngine.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
289289
PrefixSuffixMatcher.Match(ref state);
290290

291291
if (_setup.CoverFuzzyWords && qCount > 0 && !FuzzyWordMatcher.AllTermsFullyMatched(ref state))
292-
FuzzyWordMatcher.Match(ref state, _setup.MinWordSize, _setup.LevenshteinMaxWordSize);
292+
FuzzyWordMatcher.Match(ref state, _setup);
293293
}
294294
finally
295295
{

src/Infidex/Coverage/CoverageSetup.cs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,24 @@ public class CoverageSetup
1414
/// Maximum word size for Levenshtein fuzzy matching
1515
/// </summary>
1616
public int LevenshteinMaxWordSize { get; set; } = 20;
17+
18+
/// <summary>
19+
/// Maximum number of typographical errors (0, 1 or 2) that would be tolerated.
20+
/// Default: 2.
21+
/// </summary>
22+
public int NumTypos { get; set; } = 2;
23+
24+
/// <summary>
25+
/// Minimum word length for 1-typo correction to be applied.
26+
/// Default: 3.
27+
/// </summary>
28+
public int MinLengthOneTypo { get; set; } = 3;
29+
30+
/// <summary>
31+
/// Minimum word length for 2-typo correction to be applied.
32+
/// Default: 7.
33+
/// </summary>
34+
public int MinLengthTwoTypos { get; set; } = 7;
1735

1836
/// <summary>
1937
/// Minimum absolute number of word matches required
@@ -98,6 +116,9 @@ internal CoverageSetup(CoverageSetup source)
98116
{
99117
MinWordSize = source.MinWordSize;
100118
LevenshteinMaxWordSize = source.LevenshteinMaxWordSize;
119+
NumTypos = source.NumTypos;
120+
MinLengthOneTypo = source.MinLengthOneTypo;
121+
MinLengthTwoTypos = source.MinLengthTwoTypos;
101122
CoverageMinWordHitsAbs = source.CoverageMinWordHitsAbs;
102123
CoverageMinWordHitsRelative = source.CoverageMinWordHitsRelative;
103124
CoverageQLimitForErrorTolerance = source.CoverageQLimitForErrorTolerance;

src/Infidex/Coverage/FuzzyWordMatcher.cs

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ internal static class FuzzyWordMatcher
1111
private const double DefaultErrorProbability = 0.04;
1212
private const double DefaultTailProbability = 0.01;
1313

14-
public static void Match(ref MatchState state, int minWordSize, int levenshteinMaxWordSize)
14+
public static void Match(ref MatchState state, CoverageSetup setup)
1515
{
1616
int qCount = state.QCount;
1717
int dCount = state.DCount;
@@ -23,23 +23,30 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
2323

2424
if (maxQueryLength == 0) return;
2525

26-
// Use a principled maximum edit distance based on a simple
27-
// Binomial(L, p) error model instead of an uncalibrated
28-
// relative distance knob.
29-
//
30-
// We choose the smallest d such that:
31-
// Pr[D ≤ d] ≥ 1 - alpha
32-
//
33-
// where D ~ Binomial(L, p), p ≈ DefaultErrorProbability and
34-
// alpha ≈ DefaultTailProbability.
35-
int maxEditDist = EditDistanceModel.GetMaxEditsForLength(
36-
maxQueryLength,
37-
DefaultErrorProbability,
38-
DefaultTailProbability);
26+
int maxEditDist;
27+
28+
if (maxQueryLength >= setup.MinLengthTwoTypos)
29+
maxEditDist = 2;
30+
else if (maxQueryLength >= setup.MinLengthOneTypo)
31+
maxEditDist = 1;
32+
else
33+
maxEditDist = 0;
3934

40-
// Ensure we allow at least one edit for non-empty queries.
41-
if (maxEditDist < 1)
35+
// Special handling for len=2 queries if default logic forbids typos (maxEditDist=0)
36+
// If we have very short words (len 2) that are disallowed normal typos,
37+
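// we conditionally allow 1 typo, but ONLY against document words that start with the same character (checked per token in the loop below); this is not limited to length-3 insertion targets.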
38+
// This supports common cases like "te" -> "the" while avoiding high-noise substitutions like "at" -> "it".
39+
bool hasSpecialShortWord = (maxQueryLength == 2 && maxEditDist == 0 && setup.NumTypos >= 1);
40+
if (hasSpecialShortWord)
41+
{
4242
maxEditDist = 1;
43+
}
44+
45+
// Respect the global cap
46+
if (maxEditDist > setup.NumTypos)
47+
maxEditDist = setup.NumTypos;
48+
49+
if (maxEditDist == 0) return;
4350

4451
for (int editDist = 1; editDist <= maxEditDist; editDist++)
4552
{
@@ -55,12 +62,32 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
5562
int qLen = qSlice.Length;
5663

5764
// Skip query tokens that are too short for meaningful fuzzy matching
58-
// but allow 2-char tokens since they can fuzzy-match 3-char words (e.g., "te" → "the")
59-
if (qLen < minWordSize) continue;
65+
if (qLen < setup.MinWordSize) continue;
66+
67+
// Calculate max edits for THIS token
68+
int tokenMaxEdits = 0;
69+
if (qLen >= setup.MinLengthTwoTypos) tokenMaxEdits = 2;
70+
else if (qLen >= setup.MinLengthOneTypo) tokenMaxEdits = 1;
71+
else tokenMaxEdits = 0;
6072

73+
// Apply special short word logic for individual token
74+
bool isSpecialShortCase = false;
75+
if (qLen == 2 && tokenMaxEdits == 0 && setup.NumTypos >= 1)
76+
{
77+
tokenMaxEdits = 1;
78+
isSpecialShortCase = true;
79+
}
80+
81+
if (tokenMaxEdits > setup.NumTypos) tokenMaxEdits = setup.NumTypos;
82+
83+
if (editDist > tokenMaxEdits) continue;
84+
85+
// For special short case, we only process editDist=1
86+
if (isSpecialShortCase && editDist != 1) continue;
87+
6188
// Calculate the valid document word length range for this query token and edit distance
62-
int minLen = Math.Max(minWordSize, qLen - editDist);
63-
int maxLen = Math.Min(levenshteinMaxWordSize, qLen + editDist);
89+
int minLen = Math.Max(setup.MinWordSize, qLen - editDist);
90+
int maxLen = Math.Min(setup.LevenshteinMaxWordSize, qLen + editDist);
6491
if (maxLen > 63) maxLen = 63;
6592

6693
ReadOnlySpan<char> qText = state.QuerySpan.Slice(qSlice.Offset, qSlice.Length);
@@ -73,6 +100,16 @@ public static void Match(ref MatchState state, int minWordSize, int levenshteinM
73100
if (dLen > maxLen || dLen < minLen) continue;
74101

75102
ReadOnlySpan<char> dText = state.DocSpan.Slice(dSlice.Offset, dSlice.Length);
103+
104+
// Enforce special short word constraints
105+
if (isSpecialShortCase)
106+
{
107+
// Special handling for short words: First character MUST match.
108+
// This allows "te" -> "the" (ins) or "te" -> "to" (sub),
109+
// but prevents high-noise matches like "at" -> "cat" (prefix ins) or "at" -> "it" (start sub).
110+
if (dText.Length == 0 || char.ToLowerInvariant(dText[0]) != char.ToLowerInvariant(qText[0]))
111+
continue;
112+
}
76113

77114
int dist = LevenshteinDistance.CalculateDamerau(qText, dText, editDist, ignoreCase: true);
78115

@@ -107,4 +144,3 @@ public static bool AllTermsFullyMatched(ref MatchState state)
107144
return true;
108145
}
109146
}
110-

src/Infidex/Indexing/VectorModel.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,10 @@ internal TopKHeap SearchWithMaxScore(string queryText, int topK, Dictionary<int,
400400
term.QueryOccurrences = (byte)shingle.Occurrences;
401401
queryTerms.Add(term);
402402
}
403+
else if (term == null && _fstIndex != null && shingle.Text.Length >= 4)
404+
{
405+
ExpandMissingTerm(shingle, queryTerms);
406+
}
403407
}
404408

405409
if (queryTerms.Count == 0 || _documents.Count == 0)
@@ -412,6 +416,22 @@ internal TopKHeap SearchWithMaxScore(string queryText, int topK, Dictionary<int,
412416
return Bm25Scorer.Search(queryTerms, topK, totalDocs, _docLengths!, _avgDocLength, _stopTermLimit, _documents, bestSegmentsMap, queryIndex, _shortQueryIndex, queryText);
413417
}
414418

419+
/// <summary>
/// Fuzzy fallback for a query shingle that has no exact term in the index.
/// Asks the FST index for term ids within edit distance 1 of the shingle text and
/// appends each resolved term to <paramref name="queryTerms"/>, skipping terms whose
/// document frequency exceeds the stop-term limit.
/// </summary>
/// <param name="shingle">The query shingle that produced no exact term.</param>
/// <param name="queryTerms">The query term list to append fuzzy candidates to.</param>
private void ExpandMissingTerm(Shingle shingle, List<Term> queryTerms)
{
    // Edit distance 1 only.
    List<int> fuzzyOutputs = new List<int>();
    _fstIndex!.GetWithinEditDistance1(shingle.Text.AsSpan(), fuzzyOutputs);

    foreach (int termId in fuzzyOutputs)
    {
        Term? fuzzyTerm = _termCollection.GetTermByIndex(termId);
        if (fuzzyTerm == null || fuzzyTerm.DocumentFrequency > _stopTermLimit)
        {
            continue;
        }

        // Mirror the exact-match path, which records the shingle's occurrence count on the
        // term before adding it. Term instances come from the shared term collection, so
        // without this assignment the term would keep whatever value it already held.
        fuzzyTerm.QueryOccurrences = (byte)shingle.Occurrences;
        queryTerms.Add(fuzzyTerm);
    }
}
434+
415435
public void Save(string filePath)
416436
{
417437
using FileStream stream = File.Create(filePath);

src/Infidex/Tokenization/Tokenizer.cs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,48 @@ public List<Shingle> TokenizeForIndexing(string text, bool isSegmentContinuation
7878
// Add padding
7979
string paddedText = startPad + text + _stopPadding;
8080

81-
return GenerateShingles(paddedText);
81+
List<Shingle> shingles = GenerateShingles(paddedText);
82+
83+
// index full words to support fuzzy correction and exact word matching
84+
// this ensures that words longer than max n-gram size are present in the index/FST
85+
if (TokenizerSetup != null)
86+
{
87+
ReadOnlySpan<char> span = text.AsSpan();
88+
char[] delimiters = TokenizerSetup.Delimiters;
89+
int baseOffset = isSegmentContinuation ? 0 : StartPadSize;
90+
91+
int i = 0;
92+
while (i < span.Length)
93+
{
94+
// Skip delimiters
95+
int start = i;
96+
while (start < span.Length && delimiters.Contains(span[start]))
97+
{
98+
start++;
99+
}
100+
101+
if (start >= span.Length) break;
102+
103+
// Find end of word
104+
int end = start;
105+
while (end < span.Length && !delimiters.Contains(span[end]))
106+
{
107+
end++;
108+
}
109+
110+
int len = end - start;
111+
if (len >= IndexSizes[0])
112+
{
113+
string word = new string(span.Slice(start, len));
114+
// Exact position in padded text
115+
shingles.Add(new Shingle(word, 1, baseOffset + start));
116+
}
117+
118+
i = end;
119+
}
120+
}
121+
122+
return shingles;
82123
}
83124

84125
/// <summary>
@@ -351,4 +392,3 @@ public HashSet<string> GetWordTokensForCoverage(string text, int minWordSize)
351392
return result;
352393
}
353394
}
354-

0 commit comments

Comments
 (0)