Skip to content

Commit bae2f24

Browse files
committed
Make DeltaData optional
1 parent 35a7a8a commit bae2f24

File tree

5 files changed

+71
-54
lines changed

5 files changed

+71
-54
lines changed

Src/FastData.Tests/KeyAnalyzerTests.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,13 @@ public void GetStringProperties_LengthMap_Test(string[] data)
6767
[Theory]
6868
[InlineData(new[] { "item1", "item2", "item3", "item4" }, 4, 0)]
6969
[InlineData(new[] { "1item", "2item", "3item", "4item" }, 0, 4)]
70-
[InlineData(new[] { "a", "aa", "aaa", "aaaaa" }, 1, 1)]
71-
public void GetStringProperties_EntropyData_Test(string[] data, int leftZero, int rightZero)
70+
[InlineData(new[] { "a", "ab", "abc" }, 0, 0)] // The shortest string would become empty, so we don't support it
71+
[InlineData(new[] { "aa", "aaa", "aaaaa" }, 0, 0)] // If all strings consist of the same character, they will be reduced to nothing, so we don't support it
72+
[InlineData(new[] { "hello world" }, 0, 0)] // One key should result in no prefix/suffix calculation
73+
public void GetStringProperties_DeltaData_Test(string[] data, int leftZero, int rightZero)
7274
{
7375
StringProperties res = GetStringProperties(data);
74-
Assert.Equal(res.DeltaData.LeftZeroCount, leftZero);
75-
Assert.Equal(res.DeltaData.RightZeroCount, rightZero);
76+
Assert.Equal(leftZero, res.DeltaData.LeftZeroCount);
77+
Assert.Equal(rightZero, res.DeltaData.RightZeroCount);
7678
}
7779
}

Src/FastData.Tests/SegmentGeneratorTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ public void DeltaGeneratorPatternTest(string[] input, uint offset, int length)
117117
o.WriteLine($"{segment}. res: {string.Join(",", input.Select(x => SegmentHelper.InsertSegmentBounds(x, segment)))}");
118118
}
119119

120-
Assert.Equal(res[0], expected);
120+
Assert.Equal(expected, res[0]);
121121
}
122122

123123
[Theory]

Src/FastData/Internal/Analysis/Data/DeltaData.cs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
using System.Runtime.InteropServices;
2-
using Genbox.FastData.Internal.Enums;
3-
using Genbox.FastData.Internal.Misc;
42

53
namespace Genbox.FastData.Internal.Analysis.Data;
64

75
[StructLayout(LayoutKind.Auto)]
8-
internal readonly record struct DeltaData(int[] Left, int[] Right)
6+
internal readonly record struct DeltaData(int[]? Left, int[]? Right)
97
{
10-
internal int LeftZeroCount => CountZero(Left);
11-
internal int RightZeroCount => CountZero(Right);
8+
internal int LeftZeroCount => Left == null ? 0 : CountZero(Left);
9+
internal int RightZeroCount => Right == null ? 0 : CountZero(Right);
1210

1311
//TODO: See the todo in KeyAnalyzer about supporting prefix/suffix only
1412
// internal IEnumerable<ArraySegment> GetSegments()
@@ -35,11 +33,8 @@ internal readonly record struct DeltaData(int[] Left, int[] Right)
3533
// }
3634
// }
3735

38-
private static int CountZero(int[]? data)
36+
internal static int CountZero(int[] data)
3937
{
40-
if (data == null)
41-
throw new InvalidOperationException("Cannot count map data");
42-
4338
int count;
4439
for (count = 0; count < data.Length; count++)
4540
{

Src/FastData/Internal/Analysis/KeyAnalyzer.cs

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ internal static StringProperties GetStringProperties(string[] keys)
3535
int minUtf16ByteLength = int.MaxValue;
3636
int maxUtf16ByteLength = int.MinValue;
3737
bool uniq = true;
38+
bool allAscii = true;
3839

3940
foreach (string str in keys)
4041
{
@@ -51,6 +52,12 @@ internal static StringProperties GetStringProperties(string[] keys)
5152

5253
minLength = Math.Min(minLength, str.Length); //Track the smallest string. It might be more than what lengthmap supports
5354
uniq &= !lengthMap.SetTrue(str.Length);
55+
56+
foreach (char c in str)
57+
{
58+
if (c > 127)
59+
allAscii = false;
60+
}
5461
}
5562

5663
// The code beneath here calculates entropy maps that can be used to derive the longest common substrings or longest prefix/suffix strings.
@@ -62,28 +69,29 @@ internal static StringProperties GetStringProperties(string[] keys)
6269
int[]? left = null;
6370
int[]? right = null;
6471

65-
bool allAscii = true;
66-
67-
// Special case: If all strings have the same length, we can build an entropy map in O(n) with O(1) memory
68-
// TODO: For now FastData only supports prefix/suffix
69-
// if (minLength == maxStr.Length)
70-
// {
71-
// map = new int[minLength];
72-
//
73-
// foreach (string str in keys)
74-
// {
75-
// for (int i = 0; i < str.Length; i++)
76-
// {
77-
// char c = str[i];
78-
// map[i] ^= c;
79-
//
80-
// if (c > 127)
81-
// allAscii = false;
82-
// }
83-
// }
84-
// }
85-
// else
86-
// {
72+
// Prefix/suffix tracking only makes sense when there are multiple keys, and they are long enough
73+
if (keys.Length > 1 && minLength > 1)
74+
{
75+
// Special case: If all strings have the same length, we can build an entropy map in O(n) with O(1) memory
76+
// TODO: For now FastData only supports prefix/suffix
77+
// if (minLength == maxStr.Length)
78+
// {
79+
// map = new int[minLength];
80+
//
81+
// foreach (string str in keys)
82+
// {
83+
// for (int i = 0; i < str.Length; i++)
84+
// {
85+
// char c = str[i];
86+
// map[i] ^= c;
87+
//
88+
// if (c > 127)
89+
// allAscii = false;
90+
// }
91+
// }
92+
// }
93+
// else
94+
// {
8795
//Build a forward and reverse map of merged entropy
8896
//We can derive common prefix/suffix from it that can be used later for high-entropy hash/equality functions
8997
left = new int[maxStr.Length];
@@ -98,9 +106,6 @@ internal static StringProperties GetStringProperties(string[] keys)
98106

99107
left[i] ^= lc;
100108
right[i] ^= rc;
101-
102-
if (lc > 127)
103-
allAscii = false;
104109
}
105110
}
106111

@@ -119,7 +124,16 @@ internal static StringProperties GetStringProperties(string[] keys)
119124
//We do not add to characterMap here since it does not need the duplicate
120125
}
121126
}
122-
// }
127+
128+
// Make sure that we handle the case where all characters in the inputs are the same
129+
if (DeltaData.CountZero(left) == minLength || DeltaData.CountZero(right) == minLength)
130+
{
131+
left = null;
132+
right = null;
133+
}
134+
135+
// }
136+
}
123137

124138
return new StringProperties(new LengthData((uint)minLength, (uint)maxStr.Length, (uint)minUtf8ByteLength, (uint)maxUtf8ByteLength, (uint)minUtf16ByteLength, (uint)maxUtf16ByteLength, uniq, lengthMap), new DeltaData(left, right), new CharacterData(allAscii));
125139
}

Src/FastData/Internal/Analysis/SegmentGenerators/DeltaGenerator.cs

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,32 @@ ... repeats ...
6767

6868
public IEnumerable<ArraySegment> Generate(StringProperties props)
6969
{
70-
// We start from the left, which is faster due to not having to do right-align checks
71-
foreach (ArraySegment segment in CalculateSegments(props.DeltaData.Left))
70+
if (props.DeltaData.Left != null)
7271
{
73-
// Left Alignment: offset + length <= Min
74-
int maxLength = (int)(props.LengthData.Min - segment.Offset);
75-
int length = maxLength < 0 ? 0 : Math.Min(segment.Length, maxLength);
72+
// We start from the left, which is faster due to not having to do right-align checks
73+
foreach (ArraySegment segment in CalculateSegments(props.DeltaData.Left))
74+
{
75+
// Left Alignment: offset + length <= Min
76+
int maxLength = (int)(props.LengthData.Min - segment.Offset);
77+
int length = maxLength < 0 ? 0 : Math.Min(segment.Length, maxLength);
7678

77-
if (length > 0)
78-
yield return new ArraySegment(segment.Offset, length, Alignment.Left);
79+
if (length > 0)
80+
yield return new ArraySegment(segment.Offset, length, Alignment.Left);
81+
}
7982
}
8083

81-
// Process right-aligned segments
82-
foreach (ArraySegment segment in CalculateSegments(props.DeltaData.Right))
84+
if (props.DeltaData.Right != null)
8385
{
84-
// Right Alignment: offset + length <= Min
85-
int maxLength = (int)(props.LengthData.Min - segment.Offset);
86-
int length = maxLength < 0 ? 0 : Math.Min(segment.Length, maxLength);
86+
// Process right-aligned segments
87+
foreach (ArraySegment segment in CalculateSegments(props.DeltaData.Right))
88+
{
89+
// Right Alignment: offset + length <= Min
90+
int maxLength = (int)(props.LengthData.Min - segment.Offset);
91+
int length = maxLength < 0 ? 0 : Math.Min(segment.Length, maxLength);
8792

88-
if (length > 0)
89-
yield return new ArraySegment(segment.Offset, length, Alignment.Right);
93+
if (length > 0)
94+
yield return new ArraySegment(segment.Offset, length, Alignment.Right);
95+
}
9096
}
9197
}
9298

0 commit comments

Comments
 (0)