Skip to content

Commit 18f18d1

Browse files
committed
Track min/max in LengthBitArray to make consecutive check cheaper
Also fix a bug with word boundaries
1 parent 0354142 commit 18f18d1

File tree

2 files changed

+26
-15
lines changed

2 files changed

+26
-15
lines changed

Src/FastData/Internal/Analysis/Data/LengthBitArray.cs

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
using System.Runtime.CompilerServices;
2-
using Genbox.FastData.Internal.Helpers;
32

43
namespace Genbox.FastData.Internal.Analysis.Data;
54

@@ -8,20 +7,20 @@ internal sealed class LengthBitArray(int length = 64)
87
private int _length = length;
98
private ulong[] _values = new ulong[GetLength(length)];
109

10+
public int Min { get; private set; } = int.MaxValue;
11+
public int Max { get; private set; } = int.MinValue;
12+
1113
internal ulong[] Values => _values;
1214
internal int BitCount { get; private set; }
1315

1416
internal bool Consecutive
1517
{
1618
get
1719
{
18-
foreach (ulong val in _values)
19-
{
20-
if (!BitHelper.AreBitsConsecutive(val))
21-
return false;
22-
}
20+
if (BitCount == 0)
21+
return false;
2322

24-
return true;
23+
return BitCount == Max - Min + 1;
2524
}
2625
}
2726

@@ -31,7 +30,8 @@ internal bool Get(int index)
3130
if (unchecked((uint)index >= (uint)_length))
3231
throw new ArgumentException("Index out of range: " + index, nameof(index));
3332

34-
return (_values[index >> 6] & (1UL << ((index & 63) - 1))) != 0; //-1 because we want a length of 1 to set the 0th bit
33+
GetPosition(index, out int wordIndex, out ulong mask);
34+
return (_values[wordIndex] & mask) != 0;
3535
}
3636

3737
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -45,12 +45,16 @@ internal bool SetTrue(int index)
4545

4646
unchecked
4747
{
48-
ulong mask = 1UL << ((index & 63) - 1); //-1 because we want a length of 1 to set the 0th bit
49-
ref ulong slot = ref _values[index >> 6];
48+
GetPosition(index, out int wordIndex, out ulong mask);
49+
ref ulong slot = ref _values[wordIndex];
5050
bool alreadySet = (slot & mask) != 0;
5151

5252
if (!alreadySet)
53+
{
5354
BitCount++;
55+
Min = Math.Min(Min, index);
56+
Max = Math.Max(Max, index);
57+
}
5458

5559
slot |= mask;
5660
return alreadySet;
@@ -74,4 +78,13 @@ private static int GetLength(int n)
7478

7579
return (n + 63) >> 6;
7680
}
81+
82+
// Translates a length index into the position of its bit inside the ulong[] backing store:
//   wordIndex - which 64-bit word holds the bit (index / 64)
//   mask      - a ulong with exactly one bit set, selecting the bit inside that word
// Resulting layout: indexes 1..63 use bits 0..62 of word 0, index 64 uses bit 63 of word 1,
// indexes 65..127 use bits 0..62 of word 1, and so on. Index 0 uses bit 63 of word 0.
// No range validation happens here; the word index is used as-is to index the array by the caller.
// NOTE(review): a multiple of 64 lands in bit 63 of word (index / 64), not in the last bit of the
// previous word as a contiguous 1-based layout would - confirm that readers of Values expect this.
private static void GetPosition(int index, out int wordIndex, out ulong mask)
83+
{
84+
// Position within the word (0..63); 0 means index is a multiple of 64.
int remainder = index & 63;
85+
// A remainder of 0 is mapped explicitly to bit 63 so the shift count below is never negative.
int bitIndex = remainder == 0 ? 63 : remainder - 1; //-1 because we want a length of 1 to set the 0th bit
86+
87+
wordIndex = index >> 6;
88+
mask = 1UL << bitIndex;
89+
}
7790
}

Src/FastData/Internal/Analysis/KeyAnalyzer.cs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ internal static StringProperties GetStringProperties(string[] keys, bool enableT
2929

3030
//We need to know the longest string for optimal mixing. Probably not 100% necessary.
3131
string maxStr = keys[0];
32-
int minLength = int.MaxValue;
3332
int minUtf8ByteLength = int.MaxValue;
3433
int maxUtf8ByteLength = int.MinValue;
3534
int minUtf16ByteLength = int.MaxValue;
@@ -50,7 +49,6 @@ internal static StringProperties GetStringProperties(string[] keys, bool enableT
5049
minUtf16ByteLength = Math.Min(utf16ByteCount, minUtf16ByteLength);
5150
maxUtf16ByteLength = Math.Max(utf16ByteCount, maxUtf16ByteLength);
5251

53-
minLength = Math.Min(minLength, str.Length); //Track the smallest string. It might be more than what lengthmap supports
5452
uniq &= !lengthMap.SetTrue(str.Length);
5553

5654
foreach (char c in str)
@@ -70,7 +68,7 @@ internal static StringProperties GetStringProperties(string[] keys, bool enableT
7068
int[]? right = null;
7169

7270
// Prefix/suffix tracking only makes sense when there are multiple keys, and they are long enough
73-
if (enableTrimming && keys.Length > 1 && minLength > 1)
71+
if (enableTrimming && keys.Length > 1 && lengthMap.Min > 1)
7472
{
7573
// Special case: If all strings have the same length, we can build an entropy map in O(n) with O(1) memory
7674
// TODO: For now FastData only supports prefix/suffix
@@ -126,7 +124,7 @@ internal static StringProperties GetStringProperties(string[] keys, bool enableT
126124
}
127125

128126
// Make sure that we handle the case where all characters in the inputs are the same
129-
if (DeltaData.CountZero(left) == minLength || DeltaData.CountZero(right) == minLength)
127+
if (DeltaData.CountZero(left) == lengthMap.Min || DeltaData.CountZero(right) == lengthMap.Min)
130128
{
131129
left = null;
132130
right = null;
@@ -135,7 +133,7 @@ internal static StringProperties GetStringProperties(string[] keys, bool enableT
135133
// }
136134
}
137135

138-
return new StringProperties(new LengthData((uint)minLength, (uint)maxStr.Length, (uint)minUtf8ByteLength, (uint)maxUtf8ByteLength, (uint)minUtf16ByteLength, (uint)maxUtf16ByteLength, uniq, lengthMap), new DeltaData(left, right), new CharacterData(allAscii));
136+
return new StringProperties(new LengthData((uint)lengthMap.Min, (uint)maxStr.Length, (uint)minUtf8ByteLength, (uint)maxUtf8ByteLength, (uint)minUtf16ByteLength, (uint)maxUtf16ByteLength, uniq, lengthMap), new DeltaData(left, right), new CharacterData(allAscii));
139137
}
140138

141139
private static KeyProperties<char> GetCharProperties(char[] keys)

0 commit comments

Comments
 (0)