
Commit 965783e

horgh and claude committed

Add network_bucket column type for efficient BigQuery lookups

Add a new `network_bucket` network column type for Parquet output that
enables efficient IP lookups in BigQuery and other analytics platforms.
When a network spans multiple buckets, rows are duplicated with different
bucket values while preserving the original network info in
start_int/end_int.

Key changes:

- Add SplitPrefix() function to split prefixes into bucket-sized pieces
- Add IPv4BucketSize and IPv6BucketSize config options (default: 16)
- Implement row duplication in Parquet writer for networks spanning buckets
- Bucket type matches start_int/end_int: int64 for IPv4, 16-byte for IPv6
- Require split files (ipv4_file + ipv6_file) for network_bucket column

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent cd2f895 commit 965783e

File tree

11 files changed: +994 −11 lines


CHANGELOG.md — 8 additions, 0 deletions

@@ -15,6 +15,14 @@ and this project adheres to
   Parquet file declaring that rows are sorted by start_int in ascending order.
   This enables query engines like DuckDB, Spark, and Trino to use the sort
   order for potential optimizations like binary search.
+- New `network_bucket` network column type for Parquet output, enabling
+  efficient IP lookups in BigQuery and other analytics platforms. When a
+  network spans multiple buckets, rows are duplicated with different bucket
+  values while preserving original network info. The bucket type matches
+  `start_int`/`end_int`. Requires split output files (`ipv4_file` and
+  `ipv6_file`).
+- New Parquet options `ipv4_bucket_size` and `ipv6_bucket_size` to
+  configure bucket prefix lengths (default: 16).

 ## [0.1.0] - 2025-11-07


README.md — 45 additions, 0 deletions

@@ -406,6 +406,10 @@ type = "start_int" # e.g., 3405803776 (IPv4 only)
 [[network.columns]]
 name = "end_int"
 type = "end_int" # e.g., 3405804031 (IPv4 only)
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket" # Bucket (int for IPv4, bytes for IPv6). Requires split files.
 ```

 **Default network columns:** If you don't define any `[[network.columns]]`,

@@ -419,6 +423,47 @@ split your output into separate IPv4/IPv6 files via `output.ipv4_file` and
 `output.ipv6_file`. For single-file outputs that include IPv6 data, use string
 columns (`start_ip`, `end_ip`, `cidr`).

+**Note:** `network_bucket` is currently only supported for Parquet output.
+
+### Network Bucketing for Analytics (BigQuery, etc.)
+
+When loading network data into analytics platforms like BigQuery, range queries
+can be slow due to full table scans. The `network_bucket` column provides a
+join key that enables efficient queries by first filtering to a specific bucket.
+
+**Configuration:**
+
+```toml
+[output]
+format = "parquet"
+ipv4_file = "geoip-v4.parquet"
+ipv6_file = "geoip-v6.parquet"
+
+[output.parquet]
+ipv4_bucket_size = 16 # Optional, defaults to 16
+ipv6_bucket_size = 16 # Optional, defaults to 16
+
+[[network.columns]]
+name = "start_int"
+type = "start_int"
+
+[[network.columns]]
+name = "end_int"
+type = "end_int"
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket"
+```
+
+The bucket type matches `start_int`/`end_int`: int64 for IPv4, 16-byte array
+for IPv6. This requires split output files. See
+[docs/parquet-queries.md](docs/parquet-queries.md) for BigQuery query examples.
+
+**Note:** When a network is larger than the bucket size (e.g., a /15 with /16
+buckets), the row is duplicated for each bucket it spans. This ensures queries
+find the correct network regardless of which bucket the IP falls into.
+
 ### Data Type Hints

 Parquet supports native types for efficient storage and queries:

docs/config.md — 19 additions, 6 deletions

@@ -80,9 +80,19 @@ When `format = "parquet"`, you can specify Parquet-specific options:

 ```toml
 [output.parquet]
-compression = "snappy" # Compression: "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
+compression = "snappy"   # Compression: "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
+row_group_size = 500000  # Rows per row group (default: 500000)
+ipv4_bucket_size = 16    # Bucket prefix length for IPv4 (default: 16)
+ipv6_bucket_size = 16    # Bucket prefix length for IPv6 (default: 16)
 ```

+| Option | Description | Default |
+|--------|-------------|---------|
+| `compression` | Compression codec: "none", "snappy", "gzip", "lz4", "zstd" | "snappy" |
+| `row_group_size` | Number of rows per row group | 500000 |
+| `ipv4_bucket_size` | Prefix length for IPv4 buckets (when `network_bucket` column used) | 16 |
+| `ipv6_bucket_size` | Prefix length for IPv6 buckets (when `network_bucket` column used) | 16 |
+
 #### MMDB Options

 When `format = "mmdb"`, you can specify MMDB-specific options:

@@ -134,11 +144,14 @@ type = "cidr" # Output type

 **Available types:**

-- `cidr` - CIDR notation (e.g., "203.0.113.0/24")
-- `start_ip` - Starting IP address (e.g., "203.0.113.0")
-- `end_ip` - Ending IP address (e.g., "203.0.113.255")
-- `start_int` - Starting IP as integer
-- `end_int` - Ending IP as integer
+| Type | Description |
+|------|-------------|
+| `cidr` | CIDR notation (e.g., "203.0.113.0/24") |
+| `start_ip` | Starting IP address (e.g., "203.0.113.0") |
+| `end_ip` | Ending IP address (e.g., "203.0.113.255") |
+| `start_int` | Starting IP as integer (int64 for IPv4, 16-byte for IPv6) |
+| `end_int` | Ending IP as integer (int64 for IPv4, 16-byte for IPv6) |
+| `network_bucket` | Bucket for efficient lookups (int64 for IPv4, 16-byte for IPv6). Same types as `start_int`/`end_int`. Requires split files (Parquet only). |

 **Default behavior:** If no `[[network.columns]]` sections are defined:
docs/parquet-queries.md — 80 additions, 0 deletions

@@ -330,6 +330,86 @@ ipv4_file = "geo_ipv4.parquet"
 ipv6_file = "geo_ipv6.parquet"
 ```

+## BigQuery with Network Bucketing
+
+BigQuery performs full table scans for range queries like `WHERE start_int <= ip
+AND end_int >= ip`. Use the `network_bucket` column to enable efficient lookups.
+
+### Configuration
+
+```toml
+[output]
+format = "parquet"
+ipv4_file = "geoip-v4.parquet"
+ipv6_file = "geoip-v6.parquet"
+
+[output.parquet]
+ipv4_bucket_size = 16 # Default: /16 prefix
+ipv6_bucket_size = 16 # Default: /16 prefix
+
+[[network.columns]]
+name = "start_int"
+type = "start_int"
+
+[[network.columns]]
+name = "end_int"
+type = "end_int"
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket"
+```
+
+### BigQuery Query Patterns
+
+#### IPv4 Lookup
+
+For IPv4, the bucket is int64. Use `NET.IP_TRUNC()` to get the bucket and
+`NET.IPV4_TO_INT64()` to convert to the integer type:
+
+```sql
+SELECT *
+FROM `project.dataset.geoip_v4`
+WHERE network_bucket = NET.IPV4_TO_INT64(NET.IP_TRUNC(NET.IP_FROM_STRING('203.0.113.100'), 16))
+  AND NET.IPV4_TO_INT64(NET.IP_FROM_STRING('203.0.113.100')) BETWEEN start_int AND end_int;
+```
+
+#### IPv6 Lookup
+
+For IPv6, the bucket is bytes. Use `NET.IP_TRUNC()` to get the bucket:
+
+```sql
+SELECT *
+FROM `project.dataset.geoip_v6`
+WHERE network_bucket = NET.IP_TRUNC(NET.IP_FROM_STRING('2001:db8::1'), 16)
+  AND NET.IP_FROM_STRING('2001:db8::1') BETWEEN start_int AND end_int;
+```
+
+### Why Bucketing Helps
+
+Without bucketing, BigQuery must scan every row to check the range condition.
+With bucketing:
+
+1. BigQuery first filters by exact match on `network_bucket`
+2. Only rows in the matching bucket are checked for the range condition
+3. Result: the query reads only the rows in one bucket, a small fraction of
+   the table
+
+### Row Duplication
+
+Networks larger than the bucket size are duplicated. For example, a /15 network
+spans two /16 buckets:
+
+**IPv4 example (int64 bucket values):**
+
+| network | start_int | end_int | network_bucket |
+|---------|-----------|---------|----------------|
+| 2.0.0.0/15 | 33554432 | 33685503 | 33554432 |
+| 2.0.0.0/15 | 33554432 | 33685503 | 33619968 |
+
+Both rows have the same `start_int`/`end_int` (the full /15 range), but different
+`network_bucket` values (2.0.0.0 = 33554432, 2.1.0.0 = 33619968). Queries for IPs
+in either bucket will find the network.
+
## Common Query Patterns

### Single IP Lookup

internal/config/config.go — 30 additions, 3 deletions

@@ -47,8 +47,10 @@ type CSVConfig struct {

 // ParquetConfig defines Parquet output options.
 type ParquetConfig struct {
-	Compression  string `toml:"compression"`    // "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
-	RowGroupSize int    `toml:"row_group_size"` // Rows per row group (default: 500000)
+	Compression    string `toml:"compression"`      // "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
+	RowGroupSize   int    `toml:"row_group_size"`   // Rows per row group (default: 500000)
+	IPv4BucketSize int    `toml:"ipv4_bucket_size"` // Bucket prefix length for IPv4 (default: 16)
+	IPv6BucketSize int    `toml:"ipv6_bucket_size"` // Bucket prefix length for IPv6 (default: 16)
 }

 // MMDBConfig defines MMDB output options.

@@ -174,6 +176,12 @@ func applyDefaults(config *Config) {
 	if config.Output.Parquet.RowGroupSize == 0 {
 		config.Output.Parquet.RowGroupSize = 500000
 	}
+	if config.Output.Parquet.IPv4BucketSize == 0 {
+		config.Output.Parquet.IPv4BucketSize = 16
+	}
+	if config.Output.Parquet.IPv6BucketSize == 0 {
+		config.Output.Parquet.IPv6BucketSize = 16
+	}

 	// MMDB defaults
 	if config.Output.Format == formatMMDB {

@@ -315,8 +323,10 @@ func validate(config *Config) error {
 	// Validate network columns
 	validNetworkTypes := map[string]bool{
 		"cidr": true, "start_ip": true, "end_ip": true, "start_int": true, "end_int": true,
+		"network_bucket": true,
 	}
 	networkColNames := map[mmdbtype.String]bool{}
+	hasBucketColumn := false
 	for _, col := range config.Network.Columns {
 		if col.Name == "" {
 			return errors.New("network column name is required")

@@ -326,17 +336,34 @@ func validate(config *Config) error {
 		}
 		if !validNetworkTypes[col.Type] {
 			return fmt.Errorf(
-				"invalid network column type '%s' for column '%s', must be one of: cidr, start_ip, end_ip, start_int, end_int",
+				"invalid network column type '%s' for column '%s', must be one of: cidr, start_ip, end_ip, start_int, end_int, network_bucket",
 				col.Type,
 				col.Name,
 			)
 		}
+		if col.Type == "network_bucket" {
+			hasBucketColumn = true
+		}
 		if networkColNames[col.Name] {
 			return fmt.Errorf("duplicate network column name '%s'", col.Name)
 		}
 		networkColNames[col.Name] = true
 	}

+	// network_bucket column requires split files (different types for IPv4 vs IPv6)
+	if hasBucketColumn {
+		if config.Output.Format != formatParquet {
+			return errors.New(
+				"network_bucket column type is only supported for Parquet output",
+			)
+		}
+		if config.Output.IPv4File == "" || config.Output.IPv6File == "" {
+			return errors.New(
+				"network_bucket column requires split files (ipv4_file and ipv6_file)",
+			)
+		}
+	}
+
 	// Validate data columns
 	validDataTypes := map[string]bool{
 		"": true, "string": true, "int64": true, "float64": true, "bool": true, "binary": true,
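Given the checks added to validate() above, a config fragment like this hypothetical one would be rejected:

```toml
[output]
format = "csv"  # rejected: network_bucket is only supported for Parquet output

[[network.columns]]
name = "network_bucket"
type = "network_bucket"
```

Even with `format = "parquet"`, the same column would still be rejected unless both `ipv4_file` and `ipv6_file` are set, since the bucket column has different types for IPv4 and IPv6.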
