Skip to content

Commit 083e283

Browse files
committed
tfidf optimizations pt1
1 parent 28269ae commit 083e283

18 files changed

Lines changed: 2015 additions & 451 deletions

src/Infidex.Example/Program.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ static void Main(string[] args)
88
{
99
ExampleMode mode = GetModeFromArgs(args);
1010
int? dataset = GetDatasetFromArgs(args);
11+
1112
if (dataset.HasValue)
1213
{
1314
switch (dataset.Value)
@@ -23,6 +24,8 @@ static void Main(string[] args)
2324
return;
2425
}
2526
}
27+
28+
2629

2730
while (true)
2831
{

src/Infidex.Tests/SegmentTests.cs

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
using Infidex.Core;
2+
using Infidex.Indexing.Segments;
3+
using Microsoft.VisualStudio.TestTools.UnitTesting;
4+
5+
namespace Infidex.Tests;
6+
7+
[TestClass]
public class SegmentTests
{
    /// <summary>
    /// Round-trips a small two-term index through <see cref="SegmentWriter"/> and
    /// <see cref="SegmentReader"/> and verifies term count, doc count, posting
    /// doc-ids/weights, and a miss for an unindexed term.
    /// </summary>
    [TestMethod]
    public void WriteAndReadSegment_ShouldWork()
    {
        var terms = new TermCollection();

        // Term "apple": Doc 1 (wt 10), Doc 3 (wt 20)
        var t1 = terms.CountTermUsage("apple", 100);
        t1.FirstCycleAdd(1, 100, false, 10.0f);
        t1.FirstCycleAdd(3, 100, false, 20.0f);

        // Term "banana": Doc 2 (wt 5)
        var t2 = terms.CountTermUsage("banana", 100);
        t2.FirstCycleAdd(2, 100, false, 5.0f);

        string path = "test_segment.seg";
        if (File.Exists(path)) File.Delete(path);

        try
        {
            var writer = new SegmentWriter();
            writer.WriteSegment(terms, 5, 0, path); // 5 docs total

            using (var reader = new SegmentReader(path))
            {
                Assert.AreEqual(2, reader.FstIndex.TermCount);
                Assert.AreEqual(5, reader.DocCount);

                var applePostings = reader.GetPostings("apple");
                Assert.IsNotNull(applePostings);
                Assert.AreEqual(2, applePostings.Value.DocIds.Length);
                Assert.AreEqual(1, applePostings.Value.DocIds[0]);
                Assert.AreEqual(3, applePostings.Value.DocIds[1]);
                Assert.AreEqual((byte)10, applePostings.Value.Weights[0]);

                var bananaPostings = reader.GetPostings("banana");
                Assert.IsNotNull(bananaPostings);
                Assert.AreEqual(1, bananaPostings.Value.DocIds.Length);
                Assert.AreEqual(2, bananaPostings.Value.DocIds[0]);

                Assert.IsNull(reader.GetPostings("orange"));
            }
        }
        finally
        {
            // Delete even when an assertion throws, so a failed run does not
            // leave a stale segment behind for the next test execution.
            if (File.Exists(path)) File.Delete(path);
        }
    }

    /// <summary>
    /// Writes two independent segments (5 docs each), merges them, and verifies
    /// that the merged segment remaps segment-2 doc ids by +5 and unions the
    /// term dictionaries ("common" appears in both inputs).
    /// </summary>
    [TestMethod]
    public void MergeSegments_ShouldWork()
    {
        string seg1Path = "seg1.seg";
        string seg2Path = "seg2.seg";
        string mergedPath = "merged.seg";

        try
        {
            // Segment 1 (Docs 0-4)
            var terms1 = new TermCollection();
            var t1 = terms1.CountTermUsage("common", 100);
            t1.FirstCycleAdd(1, 100, false, 10f);
            var t2 = terms1.CountTermUsage("unique1", 100);
            t2.FirstCycleAdd(2, 100, false, 20f);

            var writer = new SegmentWriter();
            writer.WriteSegment(terms1, 5, 0, seg1Path);

            // Segment 2 (Docs 0-4 -> mapped to 5-9)
            var terms2 = new TermCollection();
            var t3 = terms2.CountTermUsage("common", 100);
            t3.FirstCycleAdd(0, 100, false, 30f); // Becomes Doc 5
            var t4 = terms2.CountTermUsage("unique2", 100);
            t4.FirstCycleAdd(3, 100, false, 40f); // Becomes Doc 8

            writer.WriteSegment(terms2, 5, 0, seg2Path);

            // Merge
            var merger = new SegmentMerger();
            var readers = new List<SegmentReader>
            {
                new SegmentReader(seg1Path),
                new SegmentReader(seg2Path)
            };

            try
            {
                merger.MergeSegments(readers, mergedPath);
            }
            finally
            {
                // Dispose the input readers even if the merge throws,
                // otherwise the open handles block the file deletes below.
                foreach (var r in readers) r.Dispose();
            }

            // Verify Merged
            using (var reader = new SegmentReader(mergedPath))
            {
                Assert.AreEqual(3, reader.FstIndex.TermCount); // common, unique1, unique2
                Assert.AreEqual(10, reader.DocCount);

                var common = reader.GetPostings("common");
                Assert.IsNotNull(common);
                Assert.AreEqual(2, common.Value.DocIds.Length);
                Assert.AreEqual(1, common.Value.DocIds[0]); // From Seg1
                Assert.AreEqual(5, common.Value.DocIds[1]); // From Seg2 (0 + 5)
                Assert.AreEqual((byte)10, common.Value.Weights[0]);
                Assert.AreEqual((byte)30, common.Value.Weights[1]);

                var unique1 = reader.GetPostings("unique1");
                Assert.IsNotNull(unique1); // guard before .Value below
                Assert.AreEqual(1, unique1.Value.DocIds.Length);
                Assert.AreEqual(2, unique1.Value.DocIds[0]);

                var unique2 = reader.GetPostings("unique2");
                Assert.IsNotNull(unique2); // guard before .Value below
                Assert.AreEqual(1, unique2.Value.DocIds.Length);
                Assert.AreEqual(8, unique2.Value.DocIds[0]); // From Seg2 (3 + 5)
            }
        }
        finally
        {
            // Clean up all three segment files regardless of test outcome.
            if (File.Exists(seg1Path)) File.Delete(seg1Path);
            if (File.Exists(seg2Path)) File.Delete(seg2Path);
            if (File.Exists(mergedPath)) File.Delete(mergedPath);
        }
    }
}

0 commit comments

Comments
 (0)