Skip to content

Commit 181e21a

Browse files
committed
Introduce SwitchingBitSet, which encapsulates the logic for going from a bitset to a hashset
1 parent 27f4fca commit 181e21a

File tree

5 files changed

+153
-52
lines changed

5 files changed

+153
-52
lines changed

Src/FastData.Tests/LengthBitArrayTests.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ public void CustomLength_AllBitsFalseWithinRange()
2929
[Theory]
3030
[InlineData(-1)]
3131
[InlineData(-100)]
32-
[InlineData(64)]
3332
public void Get_OutOfRange_Throws(int index)
3433
{
3534
LengthBitArray bits = new LengthBitArray();

Src/FastData/Generators/GeneratorConfig.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ private static IEnumerable<IEarlyExit> GetEarlyExits(KeyProperties<T> props, uin
127127

128128
private static bool ShouldUseBitSet(LengthData lengthData)
129129
{
130+
if (!lengthData.LengthMap.HasBitSet)
131+
return false;
132+
130133
if (lengthData.LengthMap.Consecutive)
131134
return false;
132135

Lines changed: 8 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
using System.Runtime.CompilerServices;
2+
using Genbox.FastData.Internal.Misc;
23

34
namespace Genbox.FastData.Internal.Analysis.Data;
45

56
internal sealed class LengthBitArray(int length = 64)
67
{
7-
private int _length = length;
8-
private ulong[] _values = new ulong[GetLength(length)];
8+
private readonly SwitchingBitSet _tracker = new SwitchingBitSet(length, true);
99

1010
public int Min { get; private set; } = int.MaxValue;
1111
public int Max { get; private set; } = int.MinValue;
1212

13-
internal ulong[] Values => _values;
13+
internal ulong[] Values => _tracker.BitSet;
1414
internal int BitCount { get; private set; }
15+
internal bool HasBitSet => _tracker.IsBitSet;
1516

1617
internal bool Consecutive
1718
{
@@ -25,66 +26,26 @@ internal bool Consecutive
2526
}
2627

2728
[MethodImpl(MethodImplOptions.AggressiveInlining)]
28-
internal bool Get(int index)
29-
{
30-
if (unchecked((uint)index >= (uint)_length))
31-
throw new ArgumentException("Index out of range: " + index, nameof(index));
32-
33-
GetPosition(index, out int wordIndex, out ulong mask);
34-
return (_values[wordIndex] & mask) != 0;
35-
}
29+
internal bool Get(int index) => _tracker.Contains(index);
3630

3731
[MethodImpl(MethodImplOptions.AggressiveInlining)]
3832
internal bool SetTrue(int index)
3933
{
4034
if (index < 0)
4135
throw new ArgumentException("Index must be non-negative: " + index, nameof(index));
4236

43-
if (index >= _length)
44-
Expand(index + 1);
45-
4637
unchecked
4738
{
48-
GetPosition(index, out int wordIndex, out ulong mask);
49-
ref ulong slot = ref _values[wordIndex];
50-
bool alreadySet = (slot & mask) != 0;
39+
bool added = _tracker.Add(index);
5140

52-
if (!alreadySet)
41+
if (added)
5342
{
5443
BitCount++;
5544
Min = Math.Min(Min, index);
5645
Max = Math.Max(Max, index);
5746
}
5847

59-
slot |= mask;
60-
return alreadySet;
48+
return !added;
6149
}
6250
}
63-
64-
private void Expand(int newLength)
65-
{
66-
int newSize = GetLength(newLength);
67-
68-
if (newSize > _values.Length)
69-
Array.Resize(ref _values, newSize);
70-
71-
_length = newLength;
72-
}
73-
74-
private static int GetLength(int n)
75-
{
76-
if (n == 0)
77-
throw new InvalidOperationException("Length must be greater than zero.");
78-
79-
return (n + 63) >> 6;
80-
}
81-
82-
private static void GetPosition(int index, out int wordIndex, out ulong mask)
83-
{
84-
int remainder = index & 63;
85-
int bitIndex = remainder == 0 ? 63 : remainder - 1; //-1 because we want a length of 1 to set the 0th bit
86-
87-
wordIndex = index >> 6;
88-
mask = 1UL << bitIndex;
89-
}
9051
}

Src/FastData/Internal/Misc/HashData.cs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@ internal static HashData Create<T>(T[] data, int capacityFactor, HashFunc<T> fun
1515
if (size == 0)
1616
throw new InvalidOperationException("HashCapacityFactor results in zero-sized hash table.");
1717

18+
if (size > int.MaxValue)
19+
throw new InvalidOperationException("HashCapacityFactor results in a hash table that is too large.");
20+
1821
ulong[] hashCodes = new ulong[size];
1922
HashSet<ulong> uniqSet = new HashSet<ulong>();
20-
HashSet<ulong> perfectSet = new HashSet<ulong>(); //TODO: Use direct addressing
23+
SwitchingBitSet perfectTracker = new SwitchingBitSet((int)size, false);
2124

2225
bool uniq = true;
2326
bool perfect = true;
@@ -29,13 +32,13 @@ internal static HashData Create<T>(T[] data, int capacityFactor, HashFunc<T> fun
2932
ulong hash = func(data[i]);
3033
hashCodes[i] = hash;
3134

32-
minHashCode = hash < minHashCode ? hash : minHashCode;
33-
maxHashCode = hash > maxHashCode ? hash : maxHashCode;
35+
minHashCode = Math.Min(minHashCode, hash);
36+
maxHashCode = Math.Max(maxHashCode, hash);
3437

3538
if (uniq && !uniqSet.Add(hash)) //The unique check is first so that when it is false, we don't try the other conditions
3639
uniq = false;
3740

38-
if (perfect && !perfectSet.Add(hash % size))
41+
if (perfect && !perfectTracker.Add((int)(hash % size)))
3942
perfect = false;
4043
}
4144

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
namespace Genbox.FastData.Internal.Misc;
2+
3+
/// <summary>This class starts out as a bitset, but when the required capacity exceeds the maximum number of 64-bit words, it switches to a HashSet</summary>
4+
internal sealed class SwitchingBitSet
5+
{
6+
private readonly int _maxBitSetWords;
7+
private readonly bool _offByOneMode; // "Off by one mode" is needed for length bitarrays as we occupy the first bit as length = 1
8+
private ulong[]? _bits;
9+
private HashSet<int>? _set;
10+
11+
internal SwitchingBitSet(int length, bool offByOneMode, int maxBitSetWords = 131072)
12+
{
13+
if (length <= 0)
14+
throw new InvalidOperationException("Length must be greater than zero.");
15+
16+
_offByOneMode = offByOneMode;
17+
_maxBitSetWords = maxBitSetWords;
18+
19+
int wordLength = GetWordLength(length);
20+
21+
if (wordLength <= maxBitSetWords)
22+
_bits = new ulong[wordLength];
23+
else
24+
_set = new HashSet<int>();
25+
}
26+
27+
internal bool IsBitSet => _bits != null;
28+
internal ulong[] BitSet => _bits ?? [];
29+
30+
internal bool Add(int index)
31+
{
32+
if (index < 0)
33+
throw new ArgumentException("Index must be non-negative: " + index, nameof(index));
34+
35+
if (_bits != null)
36+
{
37+
EnsureCapacityForIndex(index);
38+
39+
if (_bits == null)
40+
return _set!.Add(index);
41+
42+
GetPosition(index, out int wordIndex, out ulong mask);
43+
44+
if ((_bits[wordIndex] & mask) != 0)
45+
return false;
46+
47+
_bits[wordIndex] |= mask;
48+
return true;
49+
}
50+
51+
return _set!.Add(index);
52+
}
53+
54+
internal bool Contains(int index)
55+
{
56+
if (index < 0)
57+
throw new ArgumentException("Index must be non-negative: " + index, nameof(index));
58+
59+
if (_bits != null)
60+
{
61+
GetPosition(index, out int wordIndex, out ulong mask);
62+
63+
if (wordIndex >= _bits.Length)
64+
return false;
65+
66+
return (_bits[wordIndex] & mask) != 0;
67+
}
68+
69+
return _set!.Contains(index);
70+
}
71+
72+
private static int GetWordLength(int length) => (int)(((long)length + 63) >> 6);
73+
74+
private void GetPosition(int index, out int wordIndex, out ulong mask)
75+
{
76+
wordIndex = index >> 6;
77+
78+
if (_offByOneMode)
79+
{
80+
ulong remainder = (ulong)(index & 63);
81+
int bitIndex = remainder == 0 ? 63 : (int)remainder - 1;
82+
mask = 1UL << bitIndex;
83+
}
84+
else
85+
{
86+
mask = 1UL << (index & 63);
87+
}
88+
}
89+
90+
private void SwitchToSet()
91+
{
92+
HashSet<int> set = new HashSet<int>();
93+
94+
for (int wordIndex = 0; wordIndex < _bits!.Length; wordIndex++)
95+
{
96+
ulong word = _bits[wordIndex];
97+
if (word == 0)
98+
continue;
99+
100+
for (int bitIndex = 0; bitIndex < 64; bitIndex++)
101+
{
102+
if ((word & (1UL << bitIndex)) == 0)
103+
continue;
104+
105+
set.Add(GetIndex(wordIndex, bitIndex));
106+
}
107+
}
108+
109+
_set = set;
110+
_bits = null;
111+
}
112+
113+
private int GetIndex(int wordIndex, int bitIndex)
114+
{
115+
if (_offByOneMode)
116+
return (int)(((ulong)wordIndex << 6) + (bitIndex == 63 ? 0UL : (uint)bitIndex + 1));
117+
118+
return (int)(((ulong)wordIndex << 6) + (uint)bitIndex);
119+
}
120+
121+
private void EnsureCapacityForIndex(int index)
122+
{
123+
int wordLength = (index >> 6) + 1;
124+
if (wordLength <= _bits!.Length)
125+
return;
126+
127+
if (wordLength > _maxBitSetWords)
128+
{
129+
SwitchToSet();
130+
return;
131+
}
132+
133+
Array.Resize(ref _bits, wordLength);
134+
}
135+
}

0 commit comments

Comments
 (0)