Skip to content

Commit 9765840

Browse files
committed
assert exact prefixes are respected
1 parent 5f41a74 commit 9765840

6 files changed

Lines changed: 690 additions & 339 deletions

File tree

src/Infidex.Tests/MovieSearchParityTests.cs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,123 @@ public void Search_MixedTerms_LongAndShort_ReturnsCorrectResults()
658658
}
659659
}
660660

661+
/// <summary>
/// Verifies that for the query "two f" the top-ranked document is a strict prefix match:
/// its text begins with the token "Two", immediately followed by a token starting with "f"
/// (for example "Two for ..." or "Two Faces"), rather than a partial match such as "Flat Two".
/// </summary>
[TestMethod]
public void Search_TwoF_PrefersStrictPrefixMatch()
{
    var engine = GetEngine();

    // Query: "two f"
    // "two" is an exact token match and "f" is a clean prefix of the following token.
    var result = engine.Search(new Query("two f", 10));

    // Dump the ranking so a failure can be diagnosed from the test output alone.
    Console.WriteLine($"Search 'two f' returned {result.Records.Length} results");
    for (int i = 0; i < Math.Min(10, result.Records.Length); i++)
    {
        var doc = engine.GetDocument(result.Records[i].DocumentId);
        Console.WriteLine($" [{i + 1}] [{result.Records[i].Score}] {doc?.IndexedText}");
    }

    Assert.IsTrue(result.Records.Length >= 2, "Should return at least 2 results for 'two f'");

    // First result MUST be a "Two [word starting with 'f']" title, NOT "Flat Two" or similar.
    var doc1 = engine.GetDocument(result.Records[0].DocumentId);
    Assert.IsNotNull(doc1);

    // The pattern is anchored at the start of the text so that the *leading* "Two" is the one
    // followed by the f-token. An unanchored match would also accept titles such as
    // "Two Brothers, Two Fathers", where only a later "Two" precedes an "f" word.
    bool isValidPrefixMatch = System.Text.RegularExpressions.Regex.IsMatch(
        doc1!.IndexedText,
        @"^Two\s+f",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    Assert.IsTrue(isValidPrefixMatch,
        $"Position #1 MUST be a strict prefix match like 'Two for/face/feet...', but was '{doc1.IndexedText}'");
}
695+
696+
/// <summary>
/// Verifies the ranking guarantee for the query "two fo": every returned document whose text
/// starts with "Two Fo" (case-insensitive) must be ranked above every returned document that
/// does not, and the last exact-prefix document must score strictly higher than the first
/// non-prefix document.
/// </summary>
[TestMethod]
public void Search_TwoFo_AllExactPrefixesBeforePartialMatches()
{
    const string exactPrefix = "Two Fo";

    var engine = GetEngine();

    // Query: "two fo"
    var result = engine.Search(new Query("two fo", 20));

    // Dump the ranking so a failure can be diagnosed from the test output alone.
    Console.WriteLine($"Search 'two fo' returned {result.Records.Length} results");
    for (int i = 0; i < Math.Min(15, result.Records.Length); i++)
    {
        var doc = engine.GetDocument(result.Records[i].DocumentId);
        Console.WriteLine($" [{i + 1}] [{result.Records[i].Score}] {doc?.IndexedText}");
    }

    Assert.IsTrue(result.Records.Length >= 5, "Should return at least 5 results for 'two fo'");

    // Resolve every result's text once, failing with a clear message (instead of a
    // NullReferenceException) if a returned record has no backing document.
    var texts = new string[result.Records.Length];
    for (int i = 0; i < texts.Length; i++)
    {
        var doc = engine.GetDocument(result.Records[i].DocumentId);
        Assert.IsNotNull(doc, $"Document at index {i} could not be loaded");
        texts[i] = doc!.IndexedText;
    }

    bool IsExactPrefix(string text) => text.StartsWith(exactPrefix, StringComparison.OrdinalIgnoreCase);

    // Find where the leading run of exact prefix matches ends (-1 when every result is a prefix match).
    int firstNonPrefixIndex = Array.FindIndex(texts, text => !IsExactPrefix(text));

    if (firstNonPrefixIndex >= 0)
    {
        // The actual guarantee: once a non-prefix document has appeared, no exact prefix match
        // may follow it. This also covers the case where the very first result is a non-prefix
        // document, which must not let the test pass vacuously.
        for (int i = firstNonPrefixIndex + 1; i < texts.Length; i++)
        {
            Assert.IsFalse(IsExactPrefix(texts[i]),
                $"Exact prefix '{texts[i]}' at index {i} MUST rank above " +
                $"non-prefix '{texts[firstNonPrefixIndex]}' at index {firstNonPrefixIndex}");
        }

        if (firstNonPrefixIndex > 0)
        {
            var lastPrefixRecord = result.Records[firstNonPrefixIndex - 1];
            var firstNonPrefixRecord = result.Records[firstNonPrefixIndex];

            // The key assertion: prefix match MUST score higher than non-prefix
            Assert.IsTrue(lastPrefixRecord.Score > firstNonPrefixRecord.Score,
                $"Exact prefix '{texts[firstNonPrefixIndex - 1]}' (score {lastPrefixRecord.Score}) " +
                $"MUST score higher than non-prefix '{texts[firstNonPrefixIndex]}' (score {firstNonPrefixRecord.Score})");
        }
    }

    // Additional check: "Tea for Two" should appear AFTER the "Two for..." variants
    int teaForTwoIndex = Array.FindIndex(
        texts,
        text => text.Equals("Tea for Two", StringComparison.OrdinalIgnoreCase));

    if (teaForTwoIndex >= 0)
    {
        // Count how many "Two for..." documents appear BEFORE "Tea for Two"
        int twoForCount = 0;
        for (int i = 0; i < teaForTwoIndex; i++)
        {
            if (IsExactPrefix(texts[i]))
            {
                twoForCount++;
            }
        }

        Assert.IsTrue(twoForCount > 0,
            "'Tea for Two' should appear AFTER at least one 'Two for...' variant, " +
            $"but it appeared at index {teaForTwoIndex} with no 'Two for...' variants before it");
    }
}
777+
661778
[TestMethod]
662779
public void FellowshipOfTheRing_PrefersCorrectLotrMovie()
663780
{

src/Infidex/Coverage/CoverageEngine.cs

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public void SetCorpusStatistics(TermCollection termCollection, int totalDocument
3333
public byte CalculateCoverageScore(string query, string documentText, double lcsSum, out int wordHits)
3434
{
3535
CoverageResult result = CalculateCoverageInternal(query, documentText, lcsSum, out wordHits,
36-
out _, out _, out _, out _, out _, out _, out _, out _, out _, out _);
36+
out _, out _, out _, out _, out _, out _, out _, out _, out _, out _, out _);
3737
return result.CoverageScore;
3838
}
3939

@@ -49,7 +49,8 @@ public ushort CalculateRankedScore(string query, string documentText, double lcs
4949
out _, // suffixPrefixRun
5050
out _, // phraseSpan
5151
out _, // precedingStrictCount
52-
out _); // lastTokenHasPrefix
52+
out _, // lastTokenHasPrefix
53+
out _); // fusionSignals
5354

5455
return CoverageScorer.CalculateRankedScore(
5556
result,
@@ -78,7 +79,8 @@ public CoverageFeatures CalculateFeatures(string query, string documentText, dou
7879
out int suffixPrefixRun,
7980
out int phraseSpan,
8081
out int precedingStrictCount,
81-
out bool lastTokenHasPrefix);
82+
out bool lastTokenHasPrefix,
83+
out var fusionSignals);
8284

8385
return new CoverageFeatures(
8486
result.CoverageScore,
@@ -101,7 +103,8 @@ public CoverageFeatures CalculateFeatures(string query, string documentText, dou
101103
result.LastTermIsTypeAhead,
102104
result.IdfCoverage,
103105
result.TotalIdf,
104-
result.MissingIdf);
106+
result.MissingIdf,
107+
fusionSignals);
105108
}
106109

107110
private CoverageResult CalculateCoverageInternal(string query, string documentText, double lcsSum,
@@ -115,7 +118,8 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
115118
out int suffixPrefixRun,
116119
out int phraseSpan,
117120
out int precedingStrictCount,
118-
out bool lastTokenHasPrefix)
121+
out bool lastTokenHasPrefix,
122+
out FusionSignals fusionSignals)
119123
{
120124
wordHits = 0;
121125
docTokenCount = 0;
@@ -128,6 +132,7 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
128132
phraseSpan = 0;
129133
precedingStrictCount = 0;
130134
lastTokenHasPrefix = false;
135+
fusionSignals = default;
131136

132137
if (query.Length == 0)
133138
return new CoverageResult(0, 0, -1, 0);
@@ -147,6 +152,7 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
147152
if (qCountRaw == 0)
148153
{
149154
if (queryTokenArray != null) ArrayPool<StringSlice>.Shared.Return(queryTokenArray);
155+
fusionSignals = default;
150156
return new CoverageResult(0, 0, -1, 0);
151157
}
152158

@@ -306,7 +312,7 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
306312

307313
wordHits = state.WordHits;
308314

309-
return CoverageScorer.CalculateFinalScore(
315+
CoverageResult coverageResult = CoverageScorer.CalculateFinalScore(
310316
ref state,
311317
queryLen,
312318
lcsSum,
@@ -320,6 +326,42 @@ private CoverageResult CalculateCoverageInternal(string query, string documentTe
320326
out phraseSpan,
321327
out precedingStrictCount,
322328
out lastTokenHasPrefix);
329+
330+
// Fusion signals need all tokens (no MinWordSize filtering)
331+
int maxFusionQueryTokens = queryLen / 2 + 1;
332+
StringSlice[]? fusionQueryTokenArray = null;
333+
Span<StringSlice> fusionQueryTokens = maxFusionQueryTokens <= CoverageTokenizer.MaxStackTerms
334+
? stackalloc StringSlice[maxFusionQueryTokens]
335+
: (fusionQueryTokenArray = ArrayPool<StringSlice>.Shared.Rent(maxFusionQueryTokens));
336+
337+
int fusionQCount = CoverageTokenizer.TokenizeToSpan(query, fusionQueryTokens, minWordSize: 0, delimiters);
338+
339+
int maxFusionDocTokens = docLen / 2 + 1;
340+
StringSlice[]? fusionDocTokenArray = null;
341+
Span<StringSlice> fusionDocTokens = maxFusionDocTokens <= CoverageTokenizer.MaxStackTerms
342+
? stackalloc StringSlice[maxFusionDocTokens]
343+
: fusionDocTokenArray = ArrayPool<StringSlice>.Shared.Rent(maxFusionDocTokens);
344+
345+
int fusionDCount = CoverageTokenizer.TokenizeToSpan(documentText, fusionDocTokens, minWordSize: 0, delimiters);
346+
347+
try
348+
{
349+
fusionSignals = FusionSignalComputer.ComputeSignals(
350+
querySpan,
351+
docSpan,
352+
fusionQueryTokens[..fusionQCount],
353+
fusionDocTokens[..fusionDCount],
354+
fusionQCount,
355+
fusionDCount,
356+
_setup.MinWordSize);
357+
}
358+
finally
359+
{
360+
if (fusionQueryTokenArray != null) ArrayPool<StringSlice>.Shared.Return(fusionQueryTokenArray);
361+
if (fusionDocTokenArray != null) ArrayPool<StringSlice>.Shared.Return(fusionDocTokenArray);
362+
}
363+
364+
return coverageResult;
323365
}
324366

325367
/// <summary>

src/Infidex/Coverage/CoverageFeatures.cs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ public readonly struct CoverageFeatures
2323
public readonly float IdfCoverage; // Information-weighted coverage (IDF-based)
2424
public readonly float TotalIdf; // Total information content of query
2525
public readonly float MissingIdf; // Information content of unmatched terms
26+
27+
// Precomputed fusion signals (Lucene-style: no string ops in fusion layer)
28+
public readonly FusionSignals FusionSignals;
2629

2730
public CoverageFeatures(
2831
byte coverageScore,
@@ -45,7 +48,8 @@ public CoverageFeatures(
4548
bool lastTermIsTypeAhead = false,
4649
float idfCoverage = 0f,
4750
float totalIdf = 0f,
48-
float missingIdf = 0f)
51+
float missingIdf = 0f,
52+
FusionSignals fusionSignals = default)
4953
{
5054
CoverageScore = coverageScore;
5155
TermsCount = termsCount;
@@ -68,5 +72,6 @@ public CoverageFeatures(
6872
IdfCoverage = idfCoverage;
6973
TotalIdf = totalIdf;
7074
MissingIdf = missingIdf;
75+
FusionSignals = fusionSignals;
7176
}
7277
}

0 commit comments

Comments
 (0)