Support integer IPv6 buckets

horgh · horgh · commit 32bc23a34178 · 2025-12-24T17:24:46.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,12 +18,14 @@ and this project adheres to
 - New `network_bucket` network column type for Parquet output, enabling
   efficient IP lookups in BigQuery and other analytics platforms. When a network
   spans multiple buckets, rows are duplicated with different bucket values while
-  preserving original network info. For IPv4, the bucket is integer (matching
-  `start_int`/`end_int`). For IPv6, the bucket is a hex string (e.g.,
-  "200f0000000000000000000000000000"). Requires split output files (`ipv4_file`
-  and `ipv6_file`).
+  preserving original network info. For IPv4, the bucket is an integer. For
+  IPv6, the bucket is either a hex string (e.g.,
+  "200f0000000000000000000000000000") or an integer depending on
+  `ipv6_bucket_type`. Requires split output files (`ipv4_file` and `ipv6_file`).
 - New Parquet options `ipv4_bucket_size` and `ipv6_bucket_size` to configure
   bucket prefix lengths (default: 16).
+- New Parquet option `ipv6_bucket_type` to configure the IPv6 network bucket
+  column format (default: string).
 
 ## [0.1.0] - 2025-11-07
 
diff --git a/README.md b/README.md
@@ -440,8 +440,9 @@ ipv4_file = "geoip-v4.parquet"
 ipv6_file = "geoip-v6.parquet"
 
 [output.parquet]
-ipv4_bucket_size = 16  # Optional, defaults to 16
-ipv6_bucket_size = 16  # Optional, defaults to 16
+ipv4_bucket_size = 16     # Optional, defaults to 16
+ipv6_bucket_size = 16     # Optional, defaults to 16
+ipv6_bucket_type = "int"  # Optional: "string" (default) or "int"
 
 [[network.columns]]
 name = "start_int"
@@ -456,9 +457,9 @@ name = "network_bucket"
 type = "network_bucket"
 ```
 
-For IPv4, the bucket is an integer (matching `start_int`/`end_int`). For IPv6,
-the bucket is a hex string (e.g., "200f0000000000000000000000000000"). This
-requires split output files. See
+For IPv4, the bucket is an integer. For IPv6, the bucket is either a hex string
+(default) or an integer when `ipv6_bucket_type = "int"` is configured. Using
+`network_bucket` requires split output files. See
 [docs/parquet-queries.md](docs/parquet-queries.md) for BigQuery query examples.
 
 **Note:** When a network is larger than the bucket size (e.g., a /15 with /16
diff --git a/docs/config.md b/docs/config.md
@@ -84,14 +84,16 @@ compression = "snappy"    # Compression: "none", "snappy", "gzip", "lz4", "zstd"
 row_group_size = 500000   # Rows per row group (default: 500000)
 ipv4_bucket_size = 16     # Bucket prefix length for IPv4 (default: 16)
 ipv6_bucket_size = 16     # Bucket prefix length for IPv6 (default: 16)
+ipv6_bucket_type = "string"  # IPv6 bucket value type: "string" or "int" (default: "string")
 ```
 
-| Option             | Description                                                        | Default  |
-| ------------------ | ------------------------------------------------------------------ | -------- |
-| `compression`      | Compression codec: "none", "snappy", "gzip", "lz4", "zstd"         | "snappy" |
-| `row_group_size`   | Number of rows per row group                                       | 500000   |
-| `ipv4_bucket_size` | Prefix length for IPv4 buckets (when `network_bucket` column used) | 16       |
-| `ipv6_bucket_size` | Prefix length for IPv6 buckets (when `network_bucket` column used) | 16       |
+| Option             | Description                                                                | Default  |
+| ------------------ | -------------------------------------------------------------------------- | -------- |
+| `compression`      | Compression codec: "none", "snappy", "gzip", "lz4", "zstd"                 | "snappy" |
+| `row_group_size`   | Number of rows per row group                                               | 500000   |
+| `ipv4_bucket_size` | Prefix length for IPv4 buckets (1-32, when `network_bucket` column used)   | 16       |
+| `ipv6_bucket_size` | Prefix length for IPv6 buckets (1-60, when `network_bucket` column used)   | 16       |
+| `ipv6_bucket_type` | IPv6 bucket value type: "string" (hex) or "int" (first 60 bits as integer) | "string" |
 
 #### MMDB Options
 
@@ -144,14 +146,14 @@ type = "cidr"       # Output type
 
 **Available types:**
 
-| Type             | Description                                                                                                                                 |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `cidr`           | CIDR notation (e.g., "203.0.113.0/24")                                                                                                      |
-| `start_ip`       | Starting IP address (e.g., "203.0.113.0")                                                                                                   |
-| `end_ip`         | Ending IP address (e.g., "203.0.113.255")                                                                                                   |
-| `start_int`      | Starting IP as integer                                                                                                                      |
-| `end_int`        | Ending IP as integer                                                                                                                        |
-| `network_bucket` | Bucket for efficient lookups. For IPv4: integer (same as `start_int`/`end_int`). For IPv6: hex string. Requires split files (Parquet only). |
+| Type             | Description                                                                                                                                                |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `cidr`           | CIDR notation (e.g., "203.0.113.0/24")                                                                                                                     |
+| `start_ip`       | Starting IP address (e.g., "203.0.113.0")                                                                                                                  |
+| `end_ip`         | Ending IP address (e.g., "203.0.113.255")                                                                                                                  |
+| `start_int`      | Starting IP as integer                                                                                                                                     |
+| `end_int`        | Ending IP as integer                                                                                                                                       |
+| `network_bucket` | Bucket for efficient lookups. IPv4: integer. IPv6: hex string (default) or integer (with `ipv6_bucket_type = "int"`). Requires split files (Parquet only). |
 
 **Default behavior:** If no `[[network.columns]]` sections are defined:
 
diff --git a/docs/parquet-queries.md b/docs/parquet-queries.md
@@ -345,8 +345,9 @@ ipv4_file = "geoip-v4.parquet"
 ipv6_file = "geoip-v6.parquet"
 
 [output.parquet]
-ipv4_bucket_size = 16  # Default: /16 prefix
-ipv6_bucket_size = 16  # Default: /16 prefix
+ipv4_bucket_size = 16    # Default: /16 prefix
+ipv6_bucket_size = 16    # Default: /16 prefix
+ipv6_bucket_type = "string"  # Default: "string" (hex), or "int" (60-bit integer)
 
 [[network.columns]]
 name = "start_int"
@@ -386,7 +387,9 @@ AND NET.IPV4_TO_INT64(NET.IP_FROM_STRING('203.0.113.100')) BETWEEN start_int AND
 
 #### IPv6 Lookup
 
-For IPv6, the bucket is a hex string. Use `TO_HEX()` with `NET.IP_TRUNC()`:
+The query depends on your `ipv6_bucket_type` configuration.
+
+**Using default `ipv6_bucket_type = "string"` (hex string):**
 
 ```sql
 -- Using default ipv6_bucket_size = 16
@@ -396,6 +399,21 @@ WHERE network_bucket = TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING('2001:db8::1'), 16
 AND NET.IP_FROM_STRING('2001:db8::1') BETWEEN start_int AND end_int;
 ```
 
+**Using `ipv6_bucket_type = "int"` (60-bit int64):**
+
+```sql
+-- Using default ipv6_bucket_size = 16
+SELECT *
+FROM `project.dataset.geoip_v6`
+WHERE network_bucket = CAST(CONCAT('0x', SUBSTR(
+    TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING('2001:db8::1'), 16)), 1, 15
+  )) AS INT64)
+AND NET.IP_FROM_STRING('2001:db8::1') BETWEEN start_int AND end_int;
+```
+
+The int type expression extracts the first 60 bits (15 hex chars) of the
+truncated IPv6 address as an integer.
+
 ### Why Bucketing Helps
 
 Without bucketing, BigQuery must scan every row to check the range condition.
@@ -422,13 +440,32 @@ Both rows have the same `start_int`/`end_int` (the full /15 range), but
 different `network_bucket` values (2.0.0.0 = 33554432, 2.1.0.0 = 33619968).
 Queries for IPs in either bucket will find the network.
 
-### Why Bucket is a Hex String
+### IPv6 Bucket Type Options
+
+IPv6 buckets can be stored as either hex strings (default) or int64 values:
+
+**String type (default):**
+
+- Format: 32-character hex string (e.g., "20010db8000000000000000000000000")
+- Storage: 32 bytes per value
+
+**Int type (`ipv6_bucket_type = "int"`):**
+
+- Format: First 60 bits of the bucket address as int64
+- Storage: 8 bytes per value (4x smaller than string)
+
+We use 60 bits (not 64) because 60-bit values always fit in a non-negative
+int64, which simplifies BigQuery queries by avoiding two's complement handling.
+
+**When to use each type:**
+
+- Use **string** (default) for databases where hex string representations are
+  simpler to work with.
+- Use **int** for reduced storage cost at the price of more complicated queries.
 
-BigQuery cannot cluster on the `bytes` type, so we can't use the same type as we
-do for `start_int` and `end_int`. Using `int` to include the prefix or using
-`bignumeric` would be an option, but both are more complicated to query with.
-Another reason to use a hex string is Snowflake's `PARSE_IP()` function provides
-the address in this format.
+We do not provide a `bytes` type for the IPv6 bucket, primarily because there
+has not been a need for one so far. For example, BigQuery cannot cluster on
+`bytes`, so it would not be helpful there.
 
 ## Common Query Patterns
 
diff --git a/internal/config/config.go b/internal/config/config.go
@@ -16,6 +16,11 @@ const (
 	formatCSV     = "csv"
 	formatParquet = "parquet"
 	formatMMDB    = "mmdb"
+
+	// IPv6BucketTypeString stores IPv6 bucket values as hex strings.
+	IPv6BucketTypeString = "string"
+	// IPv6BucketTypeInt stores IPv6 bucket values as int64 (first 60 bits).
+	IPv6BucketTypeInt = "int"
 )
 
 // Config represents the complete configuration file structure.
@@ -51,6 +56,7 @@ type ParquetConfig struct {
 	RowGroupSize   int    `toml:"row_group_size"`   // Rows per row group (default: 500000)
 	IPv4BucketSize int    `toml:"ipv4_bucket_size"` // Bucket prefix length for IPv4 (default: 16)
 	IPv6BucketSize int    `toml:"ipv6_bucket_size"` // Bucket prefix length for IPv6 (default: 16)
+	IPv6BucketType string `toml:"ipv6_bucket_type"` // "string" or "int" (default: "string")
 }
 
 // MMDBConfig defines MMDB output options.
@@ -182,6 +188,9 @@ func applyDefaults(config *Config) {
 	if config.Output.Parquet.IPv6BucketSize == 0 {
 		config.Output.Parquet.IPv6BucketSize = 16
 	}
+	if config.Output.Parquet.IPv6BucketType == "" {
+		config.Output.Parquet.IPv6BucketType = IPv6BucketTypeString
+	}
 
 	// MMDB defaults
 	if config.Output.Format == formatMMDB {
@@ -373,13 +382,24 @@ func validate(config *Config) error {
 				config.Output.Parquet.IPv4BucketSize,
 			)
 		}
+		// IPv6 bucket size capped at 60 to support int type (60-bit values fit in
+		// positive int64, simplifying BigQuery queries)
 		if config.Output.Parquet.IPv6BucketSize < 1 ||
-			config.Output.Parquet.IPv6BucketSize > 128 {
+			config.Output.Parquet.IPv6BucketSize > 60 {
 			return fmt.Errorf(
-				"ipv6_bucket_size must be between 1 and 128, got %d",
+				"ipv6_bucket_size must be between 1 and 60, got %d",
 				config.Output.Parquet.IPv6BucketSize,
 			)
 		}
+
+		// Validate IPv6 bucket type
+		if config.Output.Parquet.IPv6BucketType != IPv6BucketTypeString &&
+			config.Output.Parquet.IPv6BucketType != IPv6BucketTypeInt {
+			return fmt.Errorf(
+				"ipv6_bucket_type must be 'string' or 'int', got '%s'",
+				config.Output.Parquet.IPv6BucketType,
+			)
+		}
 	}
 
 	// Validate data columns
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
@@ -322,6 +322,50 @@ path = ["country", "iso_code"]
 				}
 			},
 		},
+		{
+			name: "parquet with ipv6_bucket_type int",
+			toml: `
+[output]
+format = "parquet"
+ipv4_file = "output-v4.parquet"
+ipv6_file = "output-v6.parquet"
+
+[output.parquet]
+ipv6_bucket_type = "int"
+ipv6_bucket_size = 48
+
+[[network.columns]]
+name = "start_int"
+type = "start_int"
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket"
+
+[[databases]]
+name = "geo"
+path = "/path/to/geo.mmdb"
+
+[[columns]]
+name = "country"
+database = "geo"
+path = ["country", "iso_code"]
+`,
+			validate: func(t *testing.T, cfg *Config) {
+				if cfg.Output.Parquet.IPv6BucketType != IPv6BucketTypeInt {
+					t.Errorf(
+						"expected IPv6BucketType=int, got %s",
+						cfg.Output.Parquet.IPv6BucketType,
+					)
+				}
+				if cfg.Output.Parquet.IPv6BucketSize != 48 {
+					t.Errorf(
+						"expected IPv6BucketSize=48, got %d",
+						cfg.Output.Parquet.IPv6BucketSize,
+					)
+				}
+			},
+		},
 	}
 
 	for _, tt := range tests {
@@ -818,7 +862,33 @@ ipv4_file = "output-v4.parquet"
 ipv6_file = "output-v6.parquet"
 
 [output.parquet]
-ipv6_bucket_size = 129
+ipv6_bucket_size = 61
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket"
+
+[[databases]]
+name = "geo"
+path = "/path/to/geo.mmdb"
+
+[[columns]]
+name = "country"
+database = "geo"
+path = ["country", "iso_code"]
+`,
+			expectError: "ipv6_bucket_size must be between 1 and 60",
+		},
+		{
+			name: "invalid ipv6_bucket_type",
+			toml: `
+[output]
+format = "parquet"
+ipv4_file = "output-v4.parquet"
+ipv6_file = "output-v6.parquet"
+
+[output.parquet]
+ipv6_bucket_type = "invalid"
 
 [[network.columns]]
 name = "network_bucket"
@@ -833,7 +903,7 @@ name = "country"
 database = "geo"
 path = ["country", "iso_code"]
 `,
-			expectError: "ipv6_bucket_size must be between 1 and 128",
+			expectError: "ipv6_bucket_type must be 'string' or 'int'",
 		},
 	}
 
diff --git a/internal/network/utils.go b/internal/network/utils.go
@@ -3,6 +3,7 @@ package network
 
 import (
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"net/netip"
 
@@ -18,6 +19,36 @@ func IPv4ToUint32(addr netip.Addr) uint32 {
 	return binary.BigEndian.Uint32(bytes[:])
 }
 
+// IPv6BucketToInt64 converts the first 60 bits of an IPv6 address to int64.
+//
+// This is used for IPv6 bucket values where the address has been masked to
+// the bucket boundary (trailing bits are zero).
+//
+// NOTE: The address must already be masked to the appropriate bucket (i.e.,
+// if you have a bucket size of /16, you must provide 2001:: as opposed to
+// something like 2001:abcd::).
+//
+// We use 60 bits (not 64) because 60-bit values always fit in a positive int64,
+// which simplifies BigQuery queries (no two's complement handling needed).
+//
+// In BigQuery, you can compute the same value using:
+//
+//	CAST(CONCAT('0x', SUBSTR(
+//	  TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING(ip), bucket_size)), 1, 15
+//	)) AS INT64)
+//
+// where bucket_size is the prefix length used for bucketing.
+func IPv6BucketToInt64(addr netip.Addr) (int64, error) {
+	if !addr.Is6() {
+		return 0, errors.New("IPv6BucketToInt64 called with non-IPv6 address")
+	}
+	bytes := addr.As16()
+	// Read first 64 bits, then right-shift by 4 to get top 60 bits
+	val := binary.BigEndian.Uint64(bytes[:8])
+	//nolint:gosec // 60-bit value always fits in positive int64
+	return int64(val >> 4), nil
+}
+
 // IsAdjacent checks if two IP addresses are consecutive (no gap between them).
 func IsAdjacent(endIP, startIP netip.Addr) bool {
 	if endIP.Is4() != startIP.Is4() {
diff --git a/internal/network/utils_test.go b/internal/network/utils_test.go
diff --git a/internal/writer/parquet.go b/internal/writer/parquet.go
diff --git a/internal/writer/parquet_test.go b/internal/writer/parquet_test.go