Skip to content
This repository was archived by the owner on Jan 5, 2025. It is now read-only.

Commit cd47ec8

Browse files
committed
bit packed integer encoding
1 parent 1032020 commit cd47ec8

File tree

4 files changed

+327
-14
lines changed

4 files changed

+327
-14
lines changed

src/stripe/encode/bit_packed_bool.zig

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@ comptime {
1515
}
1616

1717
pub const Validator = struct {
18-
const Self = @This();
19-
2018
count: u32,
2119

20+
const Self = @This();
2221
pub const Encoder = bit_packed_bool.Encoder;
2322

2423
pub fn init() Self {
@@ -43,13 +42,11 @@ pub const Validator = struct {
4342
};
4443

4544
pub const Encoder = struct {
46-
const Self = @This();
47-
48-
const BitIndexInt = u6;
49-
5045
word: Word,
5146
bit_index: BitIndexInt,
5247

48+
const Self = @This();
49+
const BitIndexInt = u6;
5350
const Value = bool;
5451

5552
fn init() Self {
@@ -83,11 +80,11 @@ pub const Encoder = struct {
8380
};
8481

8582
pub const Decoder = struct {
86-
const Self = @This();
87-
8883
index: usize,
8984
current_word: ?Word,
9085

86+
const Self = @This();
87+
9188
pub fn init() Self {
9289
return .{
9390
.index = 0,
@@ -118,10 +115,6 @@ pub const Decoder = struct {
118115
return (self.current_word.? >> bit_index) & 1 > 0;
119116
}
120117

121-
pub fn readAll(_: *Self, _: []bool, _: anytype) !void {
122-
@panic("todo");
123-
}
124-
125118
fn loadWord(self: *Self, blob: anytype, word_index: usize) !void {
126119
var buf: [@sizeOf(Word)]u8 = undefined;
127120
const byte_index = word_index * @sizeOf(Word);
Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
//! Packs zig-zag encoded integers, least significant bit first, into unsigned 64-bit words.
2+
//! A single byte holding the bit width comes first; each word is then written as little endian bytes.
3+
4+
const std = @import("std");
5+
const debug = std.debug;
6+
const mem = std.mem;
7+
const testing = std.testing;
8+
9+
const Encoding = @import("../encoding.zig").Encoding;
10+
const Error = @import("../error.zig").Error;
11+
const Valid = @import("../validator.zig").Valid;
12+
13+
const bit_packed_int = @This();
14+
15+
/// Scans a sequence of values to decide whether they can be bit packed and, if so, how many bits
/// each zig-zag encoded value needs.
pub const Validator = struct {
    /// Largest number of significant bits seen in any zig-zag encoded value so far.
    bit_width: u8,
    /// Number of values seen so far.
    count: u32,

    const Self = @This();
    pub const Encoder = bit_packed_int.Encoder;

    /// Returns a validator that has seen no values.
    pub fn init() Self {
        return .{
            .bit_width = 0,
            .count = 0,
        };
    }

    /// Records one value, widening `bit_width` if the value needs more bits than any seen before.
    pub fn next(self: *Self, value: i64) void {
        // Always zig-zag encode the value
        // TODO is it faster to skip zig zag if all inputs are >= 0?
        // TODO is it faster to calculate zig zag bit width without doing the full encoding?
        const value_zz = zig_zag.encode(value);
        const sig_bits: u8 = 64 - @clz(value_zz);
        self.bit_width = @max(self.bit_width, sig_bits);

        self.count += 1;
    }

    /// Finishes validation and returns the encoded byte length together with a ready encoder.
    ///
    /// Returns `Error.NotEncodable` when some value needs all 64 bits, or when the encoded
    /// length does not fit in a `u32`.
    pub fn end(self: Self) !Valid(Self.Encoder) {
        // Do not support 64 bit bit-packing because then there isn't any point to packing
        if (self.bit_width == 64) {
            return Error.NotEncodable;
        }

        // Widen before multiplying: `count * bit_width` can exceed the range of a u32 even
        // though both operands fit comfortably.
        const total_bits = @as(u64, self.count) * self.bit_width;

        // The number of octets written is always a multiple of 8 (64 bits) so that reads and
        // writes operate on a 64-bit integer in memory and loads and stores operate on that
        // same integer.
        const word_count = (total_bits + 64 - 1) / 64;

        // Add 1 to the length to account for the byte that stores the bit width
        const byte_len = std.math.cast(u32, word_count * 8 + 1) orelse
            return Error.NotEncodable;

        return .{
            .meta = .{
                .byte_len = byte_len,
                .encoding = Encoding.BitPacked,
            },
            .encoder = Self.Encoder.init(@intCast(self.bit_width)),
        };
    }
};
61+
62+
/// Packs zig-zag encoded values into 64-bit words and writes each completed word to a writer.
pub const Encoder = struct {
    /// Number of bits stored per value.
    bit_width: u6,

    /// Bit position inside `word` at which the next value starts.
    word_bit_index: u6,
    /// Word currently being filled.
    word: u64,

    const Self = @This();

    fn init(bit_width: u6) Self {
        return Self{
            .bit_width = bit_width,
            .word_bit_index = 0,
            .word = 0,
        };
    }

    pub fn deinit(_: *Self) void {}

    /// Writes the single header byte holding the bit width. Always returns `true`.
    pub fn begin(self: *Self, writer: anytype) !bool {
        const header: u8 = self.bit_width;
        try writer.writeByte(header);
        return true;
    }

    /// Zig-zag encodes `value` and appends it to the current word, writing the word out once
    /// it is full. Bits that did not fit are carried over into the next word.
    pub fn write(self: *Self, writer: anytype, value: i64) !void {
        const encoded = zig_zag.encode(value);
        const offset = self.word_bit_index;
        // Bits still free in the current word before this value is added (1..64)
        const bits_free: u8 = @as(u8, 64) - offset;

        self.word |= encoded << offset;
        // Wrapping add: the index is the position inside whichever word is current afterwards
        self.word_bit_index = offset +% self.bit_width;

        // The word still has room after this value: nothing to flush yet
        if (bits_free > self.bit_width) return;

        try writer.writeInt(u64, self.word, .little);

        // Start the next word with whatever was cut off above. The cast is safe because
        // bits_free <= bit_width <= 63 here.
        self.word = encoded >> @intCast(bits_free);
    }

    /// Writes the partially filled final word, if there is one.
    pub fn end(self: *Self, writer: anytype) !void {
        if (self.word_bit_index == 0) return;
        try writer.writeInt(u64, self.word, .little);
    }
};
103+
104+
/// Reads bit packed values back out of a blob that provides `readAt(dst, offset)` and `len()`.
pub const Decoder = struct {
    /// Number of bits stored per value, read from the header byte by `begin`.
    bit_width: u6,

    /// Global bit index of the current value, counted from the first bit after the header byte
    bit_index: usize,
    /// Cached window of words, or null when the window must be (re)loaded before reading.
    current_words: ?struct {
        // Keep a window of 2 words because values can span at most two words
        buf: [2]u64,
        /// Index of the word held in `buf[0]`.
        word_index: usize,
    },

    const Self = @This();

    /// Returns a decoder positioned at the first value. Call `begin` before reading.
    pub fn init() Self {
        return .{
            .bit_width = 0,
            .bit_index = 0,
            .current_words = null,
        };
    }

    /// Reads the bit width from the first byte of the blob.
    pub fn begin(self: *Self, blob: anytype) !void {
        var buf: [1]u8 = undefined;
        try blob.readAt(buf[0..], 0);
        debug.assert(buf[0] < 64);
        self.bit_width = @intCast(buf[0]);
    }

    /// Advances by `n` values, dropping the cached window if the new value ends beyond it.
    pub fn next(self: *Self, n: u32) void {
        self.bit_index += @as(usize, self.bit_width) * n;

        if (self.current_words) |words| {
            if (self.currEndWordIndex() - words.word_index > 1) {
                self.current_words = null;
            }
        }
    }

    /// Decodes the value at the current position without advancing.
    pub fn read(self: *Self, blob: anytype) !i64 {
        // A zero bit width means every value is zero and the encoder wrote no words at all,
        // so there is nothing to load: the blob holds only the header byte.
        if (self.bit_width == 0) {
            return 0;
        }

        if (self.current_words == null) {
            try self.loadWords(blob);
        }

        const curr_words = self.current_words.?;
        const word_bit_index: u6 = @intCast(self.bit_index % 64);
        const value_mask: u64 = (@as(u64, 1) << self.bit_width) - 1;

        if (curr_words.word_index == self.currWordIndex()) {
            // Value starts in the buf[0] word. The value may be spread across two words
            const lower: u64 = (curr_words.buf[0] >> word_bit_index) & value_mask;

            const end_bit_index: u8 = @as(u8, word_bit_index) + self.bit_width;
            if (end_bit_index > 64) {
                // The remaining high bits of the value sit at the bottom of buf[1]
                const upper_mask: u64 =
                    (@as(u64, 1) << @as(u6, @intCast(end_bit_index - 64))) - 1;
                const upper: u64 = curr_words.buf[1] & upper_mask;
                // word_bit_index > 0 here, so the shift amount is at most 63
                const full = (upper << @intCast(@as(u8, 64) - word_bit_index)) | lower;
                return zig_zag.decode(full);
            }

            return zig_zag.decode(lower);
        }

        // Value starts in the buf[1] word. The entire value must fit in the buf[1] word
        debug.assert(self.currWordIndex() == curr_words.word_index + 1);
        const value: u64 = (curr_words.buf[1] >> word_bit_index) & value_mask;
        return zig_zag.decode(value);
    }

    /// Loads the word containing the current value and, when one exists, the word after it.
    fn loadWords(self: *Self, blob: anytype) !void {
        var buf: [16]u8 = undefined;
        const word_index = self.currWordIndex();
        // Add 1 to account for the byte that stores the bit width
        const byte_index = word_index * 8 + 1;

        if (byte_index + 8 == blob.len()) {
            // Only the last word remains; buf[1] is left undefined and must not be read
            try blob.readAt(buf[0..8], byte_index);
            self.current_words = .{
                .buf = [2]u64{
                    mem.readInt(u64, buf[0..8], .little),
                    undefined,
                },
                .word_index = word_index,
            };
            return;
        }

        debug.assert(byte_index + 8 < blob.len());
        try blob.readAt(buf[0..], byte_index);
        self.current_words = .{
            .buf = [2]u64{
                mem.readInt(u64, buf[0..8], .little),
                mem.readInt(u64, buf[8..], .little),
            },
            .word_index = word_index,
        };
    }

    /// Index of the word in which the current value starts.
    fn currWordIndex(self: *Self) usize {
        return self.bit_index / 64;
    }

    /// Index of the word in which the current value ends.
    /// Only called while a window is cached, which never happens for a zero bit width.
    fn currEndWordIndex(self: *Self) usize {
        return (self.bit_index + self.bit_width - 1) / 64;
    }
};
211+
212+
/// Zig-zag mapping between signed and unsigned 64-bit integers, so that values of small
/// magnitude (positive or negative) map to small unsigned values: 0, -1, 1, -2, ... become
/// 0, 1, 2, 3, ...
const zig_zag = struct {
    /// Maps `value` to its zig-zag representation. Defined for every `i64`.
    pub fn encode(value: i64) u64 {
        // Double on the unsigned bit pattern: `2 * value` on an i64 traps on overflow for
        // magnitudes of 2^62 and above.
        const bits: u64 = @bitCast(value);
        // Arithmetic shift: all ones for negative values, all zeros otherwise
        const sign_mask: u64 = @bitCast(value >> (8 * 8 - 1));
        return (bits << 1) ^ sign_mask;
    }

    /// Inverse of `encode`.
    pub fn decode(value: u64) i64 {
        return @as(i64, @bitCast(value >> 1)) ^ (-@as(i64, @bitCast(value & 1)));
    }
};
221+
222+
/// Expected words for the integers -10 through 10 packed at a bit width of 5, without the
/// leading bit width byte. 21 values at 5 bits each need 105 bits, i.e. two 64-bit words.
const neg_10_to_10_encoded_bytes = [16]u8{
    // First word
    0x33, 0xBE, 0xB6, 0xD2,
    0x29, 0x23, 0x00, 0x41,
    // Second word; the unused high bits are zero
    0x0C, 0x52, 0xCC, 0x41,
    0x49, 0x01, 0x00, 0x00,
};
226+
227+
test "bit packed int: encode" {
    var data = try std.ArrayList(u8).initCapacity(testing.allocator, 17);
    defer data.deinit();
    const writer = data.writer();

    // Encode -10 through 10 at 5 bits per value
    var encoder = Encoder.init(5);
    try testing.expectEqual(true, try encoder.begin(writer));

    var value: i64 = -10;
    while (value <= 10) : (value += 1) {
        try encoder.write(writer, value);
    }
    try encoder.end(writer);

    // The first byte is the bit width; the packed words follow
    const bit_width_header = data.items[0];
    const packed_words = data.items[1..];
    try testing.expectEqual(5, bit_width_header);
    try testing.expectEqualSlices(u8, &neg_10_to_10_encoded_bytes, packed_words);
}
247+
248+
test "bit packed int: decode" {
    const MemoryBlob = @import("../../MemoryBlob.zig");

    // Bit width byte (5) followed by the packed words
    var blob_data = [_]u8{5} ++ neg_10_to_10_encoded_bytes;
    var blob = MemoryBlob{ .data = &blob_data };

    var decoder = Decoder.init();
    try decoder.begin(&blob);

    // The blob must decode back to -10 through 10, in order
    var expected: i64 = -10;
    while (expected <= 10) : (expected += 1) {
        const actual = try decoder.read(&blob);
        try testing.expectEqual(expected, actual);
        decoder.next(1);
    }
}
266+
267+
test "bit packed int: round trip" {
    const MemoryBlob = @import("../../MemoryBlob.zig");
    const allocator = testing.allocator;

    const bit_widths = [_]u6{ 1, 2, 5, 7, 8, 30, 32, 63 };

    for (bit_widths) |bit_width| {
        // TODO the bounds below are the range of values encodable at this bit width (centered
        // at 0), clamped to +/- 10_000; they are not the largest values of the integer type
        const half_range: i128 = @divTrunc(std.math.pow(i128, 2, @as(i128, bit_width)), 2);
        const min_value: i64 = @intCast(@max(-10_000, -half_range));
        const max_value: i64 = @intCast(@min(10_000, half_range - 1));

        var data = try std.ArrayList(u8).initCapacity(allocator, 17);
        defer data.deinit();
        const writer = data.writer();

        // Encode every value in [min_value, max_value]
        var encoder = Encoder.init(bit_width);
        try testing.expectEqual(true, try encoder.begin(writer));

        var to_encode: i64 = min_value;
        while (to_encode <= max_value) : (to_encode += 1) {
            try encoder.write(writer, to_encode);
        }
        try encoder.end(writer);

        // Decode the bytes just written and check each value comes back unchanged
        var blob = MemoryBlob{ .data = data.items };

        var decoder = Decoder.init();
        try decoder.begin(&blob);

        var expected: i64 = min_value;
        while (expected <= max_value) : (expected += 1) {
            const actual = try decoder.read(&blob);
            try testing.expectEqual(expected, actual);
            decoder.next(1);
        }
    }
}

src/stripe/encode/direct.zig

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,16 @@ pub fn Decoder(
9696
return;
9797
}
9898

99-
// TODO more testing needed
10099
var bytes_dest: []u8 = undefined;
101100
bytes_dest.len = dst.len * @sizeOf(Value);
102101
bytes_dest.ptr = @ptrCast(dst.ptr);
103102
try blob.readAt(bytes_dest[0..], self.index * @sizeOf(Value));
104103
for (dst, 0..) |*v, idx| {
105104
const start = idx * @sizeOf(Value);
106-
v.* = fromBytes(@as(*const [@sizeOf(Value)]u8, @ptrCast(bytes_dest[start..(start + @sizeOf(Value))])));
105+
v.* = fromBytes(@as(
106+
*const [@sizeOf(Value)]u8,
107+
@ptrCast(bytes_dest[start..(start + @sizeOf(Value))]),
108+
));
107109
}
108110
}
109111
};

0 commit comments

Comments
 (0)