Skip to content

Commit e0213d1

Browse files
horgh and claude committed
Use only bucket-size bits for IPv6 bucket integers
Previously, IPv6BucketToInt64 always extracted 60 bits regardless of the configured bucket size. This produced very large integers (e.g., ~144 quadrillion for 2001:: with /16 buckets).

Now it extracts only the configured bucketSize bits, producing much smaller integers that may be stored more efficiently in BigQuery.

Examples with bucket size 16:

- 2001:: → 8193 (was ~144 quadrillion)
- abcd:: → 43981 (was ~773 quadrillion)
- ffff:: → 65535 (was ~1.15 quintillion)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 62e7a0d commit e0213d1

File tree

6 files changed

+85
-116
lines changed

6 files changed

+85
-116
lines changed

internal/network/utils.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,34 @@ func IPv4ToUint32(addr netip.Addr) uint32 {
1919
return binary.BigEndian.Uint32(bytes[:])
2020
}
2121

22-
// IPv6BucketToInt64 converts the first 60 bits of an IPv6 address to int64.
22+
// IPv6BucketToInt64 converts the first bucketSize bits of an IPv6 address to int64.
2323
//
2424
// NOTE: The address must already be masked to the appropriate bucket (i.e., if
2525
// you have a bucket size of /16, you must provide 2001:: as opposed to
2626
// something like 2001:abcd::).
2727
//
28-
// We use 60 bits (not 64) because 60-bit values always fit in a positive
29-
// int64, which simplifies queries (no two's complement handling needed).
30-
//
31-
// We use 60 bits in particular as that is what 15 hex characters provides.
32-
// This is already more bits than we'd typically need.
28+
// Only the first bucketSize bits are kept, which produces smaller integers
29+
// that may be stored more efficiently in analytics platforms like BigQuery.
3330
//
3431
// In BigQuery, you can compute the same value using:
3532
//
3633
// CAST(CONCAT('0x', SUBSTR(
37-
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), bucket_size)), 1, 15
34+
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), bucket_size)),
35+
// 1,
36+
// CAST(CEILING(bucket_size / 4) AS INT64)
3837
// )) AS INT64)
3938
//
4039
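// where bucket_size is the prefix length used for bucketing. The SUBSTR form
// matches the Go result only when bucket_size is a multiple of 4; otherwise it
// keeps up to 3 extra (zero) low-order bits.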
41-
func IPv6BucketToInt64(addr netip.Addr) (int64, error) {
40+
func IPv6BucketToInt64(addr netip.Addr, bucketSize int) (int64, error) {
4241
if !addr.Is6() {
4342
return 0, errors.New("IPv6BucketToInt64 called with non-IPv6 address")
4443
}
4544
bytes := addr.As16()
46-
// Read first 64 bits, then right-shift by 4 to get top 60 bits
4745
val := binary.BigEndian.Uint64(bytes[:8])
48-
//nolint:gosec // 60-bit value always fits in positive int64
49-
return int64(val >> 4), nil
46+
// Right-shift to keep only bucketSize bits
47+
shift := 64 - bucketSize
48+
//nolint:gosec // bucketSize is validated to be <= 60, so result fits in positive int64
49+
return int64(val >> shift), nil
5050
}
5151

5252
// IsAdjacent checks if two IP addresses are consecutive (no gap between them).

internal/network/utils_test.go

Lines changed: 56 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -41,37 +41,37 @@ func TestIPv4ToUint32(t *testing.T) {
4141
}
4242
}
4343

44-
// TestIPv6BucketToInt64 tests conversion of IPv6 addresses to 60-bit integers.
44+
// TestIPv6BucketToInt64 tests conversion of IPv6 addresses to bucket integers.
4545
//
4646
// Expected values can be verified with this BigQuery query (using bucket size 16):
4747
//
4848
// SELECT
4949
// ip,
5050
// expected,
5151
// CAST(CONCAT('0x', SUBSTR(
52-
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), 16)), 1, 15
52+
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), 16)), 1, 4
5353
// )) AS INT64) as bucket_int,
5454
// CAST(CONCAT('0x', SUBSTR(
55-
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), 16)), 1, 15
55+
// TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), 16)), 1, 4
5656
// )) AS INT64) = expected as matches
5757
// FROM UNNEST([
5858
// STRUCT('::' AS ip, 0 AS expected),
59-
// ('2001::', 144132780261900288),
60-
// ('2001:db8::', 144132780261900288),
61-
// ('abcc::', 773704342233415680),
62-
// ('abcd::', 773721934419460096),
63-
// ('ffff:ffff:ffff:ffff::', 1152903912420802560),
64-
// ('8000::', 576460752303423488),
65-
// ('7fff:ffff:ffff:ffff::', 576443160117379072),
59+
// ('2001::', 8193),
60+
// ('2001:db8::', 8193),
61+
// ('abcc::', 43980),
62+
// ('abcd::', 43981),
63+
// ('ffff:ffff:ffff:ffff::', 65535),
64+
// ('8000::', 32768),
65+
// ('7fff:ffff:ffff:ffff::', 32767),
6666
// ('0:0:0:f::', 0),
6767
// ('0:0:0:10::', 0),
6868
// ('0:0:0:1f::', 0),
6969
// ('0:0:0:20::', 0),
70-
// ('2001:db8::1', 144132780261900288),
71-
// ('2001:db8:0:0:ffff:ffff:ffff:ffff', 144132780261900288),
72-
// ('ffff:ffff:ffff:fff0::', 1152903912420802560),
73-
// ('ffff:ffff:ffff:ffef::', 1152903912420802560),
74-
// ('1::', 17592186044416),
70+
// ('2001:db8::1', 8193),
71+
// ('2001:db8:0:0:ffff:ffff:ffff:ffff', 8193),
72+
// ('ffff:ffff:ffff:fff0::', 65535),
73+
// ('ffff:ffff:ffff:ffef::', 65535),
74+
// ('1::', 1),
7575
// ('::ffff:192.168.1.1', 0)
7676
// ]);
7777
func TestIPv6BucketToInt64(t *testing.T) {
@@ -91,43 +91,43 @@ func TestIPv6BucketToInt64(t *testing.T) {
9191
name: "2001:: (common prefix)",
9292
ip: "2001::",
9393
bucketSize: 16,
94-
expected: 144132780261900288,
94+
expected: 0x2001, // 8193
9595
},
9696
{
9797
name: "2001:db8:: (documentation prefix, truncated to /16)",
9898
ip: "2001:db8::",
9999
bucketSize: 16,
100-
expected: 144132780261900288,
100+
expected: 0x2001, // 8193
101101
},
102102
{
103103
name: "abcc:: (test bucket boundary)",
104104
ip: "abcc::",
105105
bucketSize: 16,
106-
expected: 773704342233415680,
106+
expected: 0xabcc, // 43980
107107
},
108108
{
109109
name: "abcd:: (adjacent to abcc::)",
110110
ip: "abcd::",
111111
bucketSize: 16,
112-
expected: 773721934419460096,
112+
expected: 0xabcd, // 43981
113113
},
114114
{
115115
name: "ffff:ffff:ffff:ffff:: (truncated to ffff::)",
116116
ip: "ffff:ffff:ffff:ffff::",
117117
bucketSize: 16,
118-
expected: 1152903912420802560,
118+
expected: 0xffff, // 65535
119119
},
120120
{
121-
name: "8000:: (high bit set, still positive in 60-bit)",
121+
name: "8000:: (high bit set)",
122122
ip: "8000::",
123123
bucketSize: 16,
124-
expected: 576460752303423488,
124+
expected: 0x8000, // 32768
125125
},
126126
{
127127
name: "7fff:ffff:ffff:ffff:: (truncated to 7fff::)",
128128
ip: "7fff:ffff:ffff:ffff::",
129129
bucketSize: 16,
130-
expected: 576443160117379072,
130+
expected: 0x7fff, // 32767
131131
},
132132
// Beyond /16 boundary tests - all truncate to ::
133133
{
@@ -159,33 +159,33 @@ func TestIPv6BucketToInt64(t *testing.T) {
159159
name: "2001:db8::1 (truncated to 2001::)",
160160
ip: "2001:db8::1",
161161
bucketSize: 16,
162-
expected: 144132780261900288,
162+
expected: 0x2001, // 8193
163163
},
164164
{
165165
name: "2001:db8:0:0:ffff:ffff:ffff:ffff (truncated to 2001::)",
166166
ip: "2001:db8:0:0:ffff:ffff:ffff:ffff",
167167
bucketSize: 16,
168-
expected: 144132780261900288,
168+
expected: 0x2001, // 8193
169169
},
170170
// Same bucket when truncated to /16
171171
{
172172
name: "ffff:ffff:ffff:fff0:: (truncated to ffff::)",
173173
ip: "ffff:ffff:ffff:fff0::",
174174
bucketSize: 16,
175-
expected: 1152903912420802560,
175+
expected: 0xffff, // 65535
176176
},
177177
{
178178
name: "ffff:ffff:ffff:ffef:: (truncated to ffff::)",
179179
ip: "ffff:ffff:ffff:ffef::",
180180
bucketSize: 16,
181-
expected: 1152903912420802560,
181+
expected: 0xffff, // 65535
182182
},
183183
// Single-bit position test
184184
{
185-
name: "1:: (high bit position)",
185+
name: "1:: (low value)",
186186
ip: "1::",
187187
bucketSize: 16,
188-
expected: 17592186044416,
188+
expected: 1,
189189
},
190190
// IPv4-mapped address - truncated to ::
191191
{
@@ -194,14 +194,39 @@ func TestIPv6BucketToInt64(t *testing.T) {
194194
bucketSize: 16,
195195
expected: 0,
196196
},
197+
// Different bucket sizes
198+
{
199+
name: "2001:db80:: with /32 bucket",
200+
ip: "2001:db80::",
201+
bucketSize: 32,
202+
expected: 0x2001db80, // 536928128
203+
},
204+
{
205+
name: "2001:db8a:bcde:: with /48 bucket",
206+
ip: "2001:db8a:bcde::",
207+
bucketSize: 48,
208+
expected: 0x2001db8abcde, // 35343878970590
209+
},
210+
{
211+
name: "ffff:: with /8 bucket",
212+
ip: "ffff::",
213+
bucketSize: 8,
214+
expected: 0xff, // 255
215+
},
216+
{
217+
name: "dead:beef:: with /24 bucket",
218+
ip: "dead:beef::",
219+
bucketSize: 24,
220+
expected: 0xdeadbe, // 14593470
221+
},
197222
}
198223

199224
for _, tt := range tests {
200225
t.Run(tt.name, func(t *testing.T) {
201226
ip := netip.MustParseAddr(tt.ip)
202227
prefix := netip.PrefixFrom(ip, tt.bucketSize)
203228
maskedIP := prefix.Masked().Addr()
204-
result, err := IPv6BucketToInt64(maskedIP)
229+
result, err := IPv6BucketToInt64(maskedIP, tt.bucketSize)
205230
require.NoError(t, err)
206231
assert.Equal(t, tt.expected, result)
207232
})
@@ -210,7 +235,7 @@ func TestIPv6BucketToInt64(t *testing.T) {
210235

211236
func TestIPv6BucketToInt64_IPv4Error(t *testing.T) {
212237
ip := netip.MustParseAddr("192.168.1.1")
213-
_, err := IPv6BucketToInt64(ip)
238+
_, err := IPv6BucketToInt64(ip, 16)
214239
require.Error(t, err)
215240
assert.Contains(t, err.Error(), "non-IPv6")
216241
}

internal/writer/csv.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ func (w *CSVWriter) generateNetworkColumnValue(
321321
if w.config.Output.CSV.IPv6BucketType != config.IPv6BucketTypeInt {
322322
return fmt.Sprintf("%x", bucketAddr.As16()), nil
323323
}
324-
val, err := network.IPv6BucketToInt64(bucketAddr)
324+
val, err := network.IPv6BucketToInt64(bucketAddr, w.getBucketSize(true))
325325
if err != nil {
326326
return "", fmt.Errorf("converting IPv6 bucket to int64: %w", err)
327327
}

internal/writer/csv_test.go

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ func TestCSVWriter_NetworkBucket_IPv6_Int(t *testing.T) {
763763
network: "2001:0d00::/24",
764764
bucketSize: 16,
765765
expectedRowCount: 1,
766-
expectedBuckets: []string{csvIPv6BucketToInt("2001::")},
766+
expectedBuckets: []string{"8193"}, // 0x2001
767767
expectedStartInt: csvIPv6ToInt("2001:0d00::"),
768768
expectedEndInt: csvIPv6ToInt("2001:0dff:ffff:ffff:ffff:ffff:ffff:ffff"),
769769
},
@@ -772,10 +772,7 @@ func TestCSVWriter_NetworkBucket_IPv6_Int(t *testing.T) {
772772
network: "abcc::/15",
773773
bucketSize: 16,
774774
expectedRowCount: 2,
775-
expectedBuckets: []string{
776-
csvIPv6BucketToInt("abcc::"),
777-
csvIPv6BucketToInt("abcd::"),
778-
},
775+
expectedBuckets: []string{"43980", "43981"}, // 0xabcc, 0xabcd
779776
expectedStartInt: csvIPv6ToInt("abcc::"),
780777
expectedEndInt: csvIPv6ToInt("abcd:ffff:ffff:ffff:ffff:ffff:ffff:ffff"),
781778
},
@@ -785,22 +782,10 @@ func TestCSVWriter_NetworkBucket_IPv6_Int(t *testing.T) {
785782
bucketSize: 16,
786783
expectedRowCount: 16,
787784
expectedBuckets: []string{
788-
csvIPv6BucketToInt("2000::"),
789-
csvIPv6BucketToInt("2001::"),
790-
csvIPv6BucketToInt("2002::"),
791-
csvIPv6BucketToInt("2003::"),
792-
csvIPv6BucketToInt("2004::"),
793-
csvIPv6BucketToInt("2005::"),
794-
csvIPv6BucketToInt("2006::"),
795-
csvIPv6BucketToInt("2007::"),
796-
csvIPv6BucketToInt("2008::"),
797-
csvIPv6BucketToInt("2009::"),
798-
csvIPv6BucketToInt("200a::"),
799-
csvIPv6BucketToInt("200b::"),
800-
csvIPv6BucketToInt("200c::"),
801-
csvIPv6BucketToInt("200d::"),
802-
csvIPv6BucketToInt("200e::"),
803-
csvIPv6BucketToInt("200f::"),
785+
"8192", "8193", "8194", "8195", // 0x2000-0x2003
786+
"8196", "8197", "8198", "8199", // 0x2004-0x2007
787+
"8200", "8201", "8202", "8203", // 0x2008-0x200b
788+
"8204", "8205", "8206", "8207", // 0x200c-0x200f
804789
},
805790
expectedStartInt: csvIPv6ToInt("2000::"),
806791
expectedEndInt: csvIPv6ToInt("200f:ffff:ffff:ffff:ffff:ffff:ffff:ffff"),
@@ -810,10 +795,7 @@ func TestCSVWriter_NetworkBucket_IPv6_Int(t *testing.T) {
810795
network: "2001:0000::/23",
811796
bucketSize: 24,
812797
expectedRowCount: 2,
813-
expectedBuckets: []string{
814-
csvIPv6BucketToInt("2001::"),
815-
csvIPv6BucketToInt("2001:100::"),
816-
},
798+
expectedBuckets: []string{"2097408", "2097409"}, // 0x200100, 0x200101
817799
expectedStartInt: csvIPv6ToInt("2001::"),
818800
expectedEndInt: csvIPv6ToInt("2001:01ff:ffff:ffff:ffff:ffff:ffff:ffff"),
819801
},
@@ -822,7 +804,7 @@ func TestCSVWriter_NetworkBucket_IPv6_Int(t *testing.T) {
822804
network: "2001:db8::1/128",
823805
bucketSize: 16,
824806
expectedRowCount: 1,
825-
expectedBuckets: []string{csvIPv6BucketToInt("2001::")},
807+
expectedBuckets: []string{"8193"}, // 0x2001
826808
expectedStartInt: csvIPv6ToInt("2001:db8::1"),
827809
expectedEndInt: csvIPv6ToInt("2001:db8::1"),
828810
},
@@ -909,13 +891,3 @@ func csvIPv6ToInt(s string) string {
909891
i.SetBytes(b[:])
910892
return i.String()
911893
}
912-
913-
// csvIPv6BucketToInt converts an IPv6 bucket address string to its 60-bit decimal string.
914-
func csvIPv6BucketToInt(s string) string {
915-
ip := netip.MustParseAddr(s)
916-
val, err := network.IPv6BucketToInt64(ip)
917-
if err != nil {
918-
panic(err)
919-
}
920-
return strconv.FormatInt(val, 10)
921-
}

internal/writer/parquet.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ func (w *ParquetWriter) generateNetworkColumnValue(
268268
if w.config.Output.Parquet.IPv6BucketType != config.IPv6BucketTypeInt {
269269
return fmt.Sprintf("%x", bucketAddr.As16()), nil
270270
}
271-
val, err := network.IPv6BucketToInt64(bucketAddr)
271+
val, err := network.IPv6BucketToInt64(bucketAddr, w.getBucketSize(true))
272272
if err != nil {
273273
return nil, fmt.Errorf("converting IPv6 bucket to int64: %w", err)
274274
}

0 commit comments

Comments
 (0)