Skip to content

Commit bfada96

Browse files
committed
Let's do xor instead of add/subtract mixing
Also add incomplete support for subsegments
1 parent 1521a97 commit bfada96

File tree

2 files changed

+91
-33
lines changed

2 files changed

+91
-33
lines changed

Src/FastData/Internal/Analysis/Data/DeltaData.cs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
using System.Runtime.InteropServices;
2+
using Genbox.FastData.Internal.Enums;
3+
using Genbox.FastData.Internal.Misc;
24

35
namespace Genbox.FastData.Internal.Analysis.Data;
46

@@ -8,8 +10,36 @@ internal readonly record struct DeltaData(int[] Left, int[] Right)
810
internal int LeftZeroCount => CountZero(Left);
911
internal int RightZeroCount => CountZero(Right);
1012

11-
private static int CountZero(int[] data)
13+
//TODO: See the todo in KeyAnalyzer about supporting prefix/suffix only
14+
// internal IEnumerable<ArraySegment> GetSegments()
15+
// {
16+
// if (Map == null)
17+
// throw new InvalidOperationException("Cannot get map data");
18+
//
19+
// int index = 0;
20+
//
21+
// while (index < Map.Length)
22+
// {
23+
// while (index < Map.Length && Map[index] != 0)
24+
// index++;
25+
//
26+
// if (index >= Map.Length)
27+
// break;
28+
//
29+
// int start = index;
30+
//
31+
// while (index < Map.Length && Map[index] == 0)
32+
// index++;
33+
//
34+
// yield return new ArraySegment((uint)start, index, Alignment.Left);
35+
// }
36+
// }
37+
38+
private static int CountZero(int[]? data)
1239
{
40+
if (data == null)
41+
throw new InvalidOperationException("Cannot count map data");
42+
1343
int count;
1444
for (count = 0; count < data.Length; count++)
1545
{

Src/FastData/Internal/Analysis/KeyAnalyzer.cs

Lines changed: 60 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -53,45 +53,73 @@ internal static StringProperties GetStringProperties(string[] keys)
5353
uniq &= !lengthMap.SetTrue(str.Length);
5454
}
5555

56-
//Build a forward and reverse map of merged entropy
57-
//We can derive common substrings from it, as well as high-entropy substring hash functions
58-
int[] left = new int[maxStr.Length];
59-
int[] right = new int[maxStr.Length];
60-
bool flag = true;
56+
// The code below calculates entropy maps that can be used to derive the longest common substrings or the longest prefix/suffix strings.
57+
// It works by XOR-ing each character into a per-offset accumulator, so that identical characters at the same offset cancel each other out in pairs.
58+
// If the accumulator for an offset contains 0 after all strings have been accumulated, it is highly likely that all the characters were the same.
59+
// However, there is a risk that an accumulator is 0, even if the characters are not the same. So we do a sanity check at the end to ensure we did it right.
60+
61+
// int[]? map = null;
62+
int[]? left = null;
63+
int[]? right = null;
64+
6165
bool allAscii = true;
6266

63-
foreach (string str in keys)
64-
{
65-
for (int i = 0; i < str.Length; i++)
67+
// Special case: If all strings have the same length, we can build an entropy map in O(n) with O(1) memory
68+
// TODO: For now FastData only supports prefix/suffix
69+
// if (minLength == maxStr.Length)
70+
// {
71+
// map = new int[minLength];
72+
//
73+
// foreach (string str in keys)
74+
// {
75+
// for (int i = 0; i < str.Length; i++)
76+
// {
77+
// char c = str[i];
78+
// map[i] ^= c;
79+
//
80+
// if (c > 127)
81+
// allAscii = false;
82+
// }
83+
// }
84+
// }
85+
// else
86+
// {
87+
//Build a forward and reverse map of merged entropy
88+
//We can derive common prefix/suffix from it that can be used later for high-entropy hash/equality functions
89+
left = new int[maxStr.Length];
90+
right = new int[maxStr.Length];
91+
92+
foreach (string str in keys)
6693
{
67-
char c = str[i];
68-
char rc = str[str.Length - 1 - i];
94+
for (int i = 0; i < str.Length; i++)
95+
{
96+
char lc = str[i];
97+
char rc = str[str.Length - 1 - i];
6998

70-
left[i] += flag ? c : -c;
71-
right[i] += flag ? rc : -rc;
99+
left[i] ^= lc;
100+
right[i] ^= rc;
72101

73-
if (c > 127)
74-
allAscii = false;
102+
if (lc > 127)
103+
allAscii = false;
104+
}
75105
}
76106

77-
flag = !flag;
78-
}
79-
80-
//Odd number of items. We need it to be even
81-
if (keys.Length % 2 != 0)
82-
{
83-
for (int i = 0; i < maxStr.Length; i++)
107+
//Odd number of items. We need it to be even
108+
if (keys.Length % 2 != 0)
84109
{
85-
//For best mixing, we take the longest string
86-
char c = maxStr[i];
87-
char rc = maxStr[maxStr.Length - 1 - i];
110+
for (int i = 0; i < maxStr.Length; i++)
111+
{
112+
//For best mixing, we take the longest string
113+
char lc = maxStr[i];
114+
char rc = maxStr[maxStr.Length - 1 - i];
88115

89-
left[i] += flag ? c : -c;
90-
right[i] += flag ? rc : -rc;
116+
left[i] ^= lc;
117+
right[i] ^= rc;
91118

92-
//We do not add to characterMap here since it does not need the duplicate
119+
//We do not add to characterMap here since it does not need the duplicate
120+
}
93121
}
94-
}
122+
// }
95123

96124
return new StringProperties(new LengthData((uint)minLength, (uint)maxStr.Length, (uint)minUtf8ByteLength, (uint)maxUtf8ByteLength, (uint)minUtf16ByteLength, (uint)maxUtf16ByteLength, uniq, lengthMap), new DeltaData(left, right), new CharacterData(allAscii));
97125
}
@@ -133,7 +161,7 @@ private static KeyProperties<float> GetSingleProperties(float[] keys)
133161
}
134162

135163
ulong range = ClampRangeToUInt64(max - min);
136-
return new KeyProperties<float>(min, max, range, hasZeroOrNaN, IsFloatContiguous(keys, min, max, hasNaNOrInfinity));
164+
return new KeyProperties<float>(min, max, range, hasZeroOrNaN, IsFloatConsecutive(keys, min, max, hasNaNOrInfinity));
137165
}
138166

139167
private static KeyProperties<double> GetDoubleProperties(double[] keys)
@@ -159,7 +187,7 @@ private static KeyProperties<double> GetDoubleProperties(double[] keys)
159187
}
160188

161189
ulong range = ClampRangeToUInt64(max - min);
162-
return new KeyProperties<double>(min, max, range, hasZeroOrNaN, IsDoubleContiguous(keys, min, max, hasNaNOrInfinity));
190+
return new KeyProperties<double>(min, max, range, hasZeroOrNaN, IsDoubleConsecutive(keys, min, max, hasNaNOrInfinity));
163191
}
164192

165193
private static KeyProperties<byte> GetByteProperties(byte[] keys)
@@ -275,7 +303,7 @@ private static KeyProperties<ulong> GetUInt64Properties(ulong[] keys)
275303
return new KeyProperties<ulong>(min, max, max - min, false, keys.Length <= 1 || max - min == (ulong)(keys.Length - 1));
276304
}
277305

278-
private static bool IsFloatContiguous(float[] keys, float min, float max, bool hasNaNOrInfinity)
306+
private static bool IsFloatConsecutive(float[] keys, float min, float max, bool hasNaNOrInfinity)
279307
{
280308
if (hasNaNOrInfinity)
281309
return false;
@@ -301,7 +329,7 @@ private static bool IsFloatContiguous(float[] keys, float min, float max, bool h
301329
return true;
302330
}
303331

304-
private static bool IsDoubleContiguous(double[] keys, double min, double max, bool hasNaNOrInfinity)
332+
private static bool IsDoubleConsecutive(double[] keys, double min, double max, bool hasNaNOrInfinity)
305333
{
306334
if (hasNaNOrInfinity)
307335
return false;

0 commit comments

Comments
 (0)