Commit 62e7a0d

Merge pull request #12 from maxmind/wstorey/bucket-column
Add network_bucket column type for efficient BigQuery lookups
2 parents 6b176e7 + e498ee4 commit 62e7a0d

14 files changed: +2361 −30 lines

.precious.toml

Lines changed: 5 additions & 0 deletions

```diff
@@ -5,6 +5,7 @@ lint-flags = "--diff"
 ok-exit-codes = 0
 invoke = "once"
 include = ["**/*.go"]
+exclude = ["testdata/MaxMind-DB/**"]

 [commands.golangci-lint]
 type = "both"
@@ -15,12 +16,16 @@ expect-stderr = true
 invoke = "once"
 path-args = "dir"
 include = ["**/*.go"]
+exclude = ["testdata/MaxMind-DB/**"]

 [commands.prettier-markdown]
 type = "both"
 include = [
     "**/*.md"
 ]
+exclude = [
+    "testdata/MaxMind-DB/**"
+]
 cmd = [
     "npx",
     "prettier",
```

CHANGELOG.md

Lines changed: 16 additions & 5 deletions

```diff
@@ -10,11 +10,22 @@ and this project adheres to

 ### Added

-- Parquet sorting column metadata for query optimization. When start_int
-  columns are configured, mmdbconvert now writes sorting metadata to the
-  Parquet file declaring that rows are sorted by start_int in ascending order.
-  This enables query engines like DuckDB, Spark, and Trino to use the sort
-  order for potential optimizations like binary search.
+- Parquet sorting column metadata for query optimization. When start_int columns
+  are configured, mmdbconvert now writes sorting metadata to the Parquet file
+  declaring that rows are sorted by start_int in ascending order. This enables
+  query engines like DuckDB, Spark, and Trino to use the sort order for
+  potential optimizations like binary search.
+- New `network_bucket` network column type for CSV and Parquet output, enabling
+  efficient IP lookups in BigQuery and other analytics platforms. When a network
+  spans multiple buckets, rows are duplicated with different bucket values while
+  preserving original network info. For IPv4, the bucket is an integer. For
+  IPv6, the bucket is either a hex string (e.g.,
+  "200f0000000000000000000000000000") or an integer depending on
+  `ipv6_bucket_type`. Requires split output files (`ipv4_file` and `ipv6_file`).
+- New CSV and Parquet options `ipv4_bucket_size` and `ipv6_bucket_size` to
+  configure bucket prefix lengths (default: 16).
+- New CSV and Parquet option `ipv6_bucket_type` to configure the IPv6 network
+  bucket column format (default: string).

 ## [0.1.0] - 2025-11-07
```

README.md

Lines changed: 51 additions & 0 deletions

````diff
@@ -406,6 +406,10 @@ type = "start_int" # e.g., 3405803776 (IPv4 only)
 [[network.columns]]
 name = "end_int"
 type = "end_int" # e.g., 3405804031 (IPv4 only)
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket" # Bucket for efficient lookups. Requires split files.
 ```

 **Default network columns:** If you don't define any `[[network.columns]]`,
@@ -419,6 +423,52 @@ split your output into separate IPv4/IPv6 files via `output.ipv4_file` and
 `output.ipv6_file`. For single-file outputs that include IPv6 data, use string
 columns (`start_ip`, `end_ip`, `cidr`).

+### Network Bucketing for Analytics (BigQuery, etc.)
+
+When loading network data into analytics platforms like BigQuery, range queries
+can be slow due to full table scans. The `network_bucket` column provides a join
+key that enables efficient queries by first filtering to a specific bucket.
+
+**Configuration:**
+
+```toml
+[output]
+format = "parquet"
+ipv4_file = "geoip-v4.parquet"
+ipv6_file = "geoip-v6.parquet"
+
+[output.parquet]
+ipv4_bucket_size = 16    # Optional, defaults to 16
+ipv6_bucket_size = 16    # Optional, defaults to 16
+ipv6_bucket_type = "int" # Optional: "string" (default) or "int"
+
+[[network.columns]]
+name = "start_int"
+type = "start_int"
+
+[[network.columns]]
+name = "end_int"
+type = "end_int"
+
+[[network.columns]]
+name = "network_bucket"
+type = "network_bucket"
+```
+
+For IPv4, the bucket is a 32-bit integer. For IPv6, the bucket is either a hex
+string (default) or a 60-bit integer when `ipv6_bucket_type = "int"` is
+configured.
+
+Using `network_bucket` requires split output files.
+
+See [docs/bigquery.md](docs/bigquery.md) for BigQuery query examples.
+
+**Note:** When a network is larger than the bucket size (e.g., a /15 with /16
+buckets), the row is duplicated for each bucket it spans. This ensures queries
+find the correct network regardless of which bucket the IP falls into.
+
+**Note:** `network_bucket` is supported for CSV and Parquet output.
+
 ### Data Type Hints

 Parquet supports native types for efficient storage and queries:
@@ -527,6 +577,7 @@ This ensures accurate IP lookups with no ambiguity.

 - [Configuration Reference](docs/config.md) - Complete config file documentation
 - [Parquet Query Guide](docs/parquet-queries.md) - Optimizing IP lookup queries
+- [BigQuery Guide](docs/bigquery.md) - Network bucketing for BigQuery

 ## Requirements
````
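
The duplication note above is easiest to see with concrete numbers. Below is a
minimal, self-contained Go sketch (our illustration, not code from this commit;
`ipv4Buckets` is a hypothetical helper) that enumerates the bucket values a
network overlaps:

```go
package main

import (
	"fmt"
	"net/netip"
)

// ipv4Buckets returns the integer bucket values an IPv4 network overlaps
// for a given bucket prefix length. A network wider than one bucket maps
// to several values, which is why its output row is duplicated.
func ipv4Buckets(network netip.Prefix, bucketBits int) []uint32 {
	a := network.Masked().Addr().As4()
	start := uint32(a[0])<<24 | uint32(a[1])<<16 | uint32(a[2])<<8 | uint32(a[3])
	step := uint32(1) << (32 - bucketBits)            // addresses per bucket; 65536 for /16
	end := start + uint32(1)<<(32-network.Bits()) - 1 // last address in the network

	base := start &^ (step - 1) // bucket containing the first address
	n := (end-base)/step + 1    // number of buckets the network overlaps
	buckets := make([]uint32, 0, n)
	for i := uint32(0); i < n; i++ {
		buckets = append(buckets, base+i*step)
	}
	return buckets
}

func main() {
	// A /15 spans two /16 buckets, so its row is emitted twice:
	// 3405774848 (203.0.0.0) and 3405840384 (203.1.0.0).
	fmt.Println(ipv4Buckets(netip.MustParsePrefix("203.0.0.0/15"), 16))
}
```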

docs/bigquery.md

Lines changed: 69 additions & 0 deletions (new file)

# BigQuery with Network Bucketing

BigQuery performs full table scans for range queries like
`WHERE start_int <= ip AND end_int >= ip`. Use a `network_bucket` column to
enable efficient lookups.

**Note:** The BigQuery table must be clustered on the `network_bucket` column
for efficient querying.

**Important:** The bucket size in your queries must match the configured bucket
size. The examples below use the default `/16` bucket size. If you configured a
different `ipv4_bucket_size` or `ipv6_bucket_size`, adjust the second argument
to `NET.IP_TRUNC()` accordingly.

## IPv4 Lookup

For IPv4, the bucket is an int64. Use `NET.IP_TRUNC()` to get the bucket and
`NET.IPV4_TO_INT64()` to convert it to an integer:

```sql
-- Using default ipv4_bucket_size = 16
SELECT *
FROM `project.dataset.geoip_v4`
WHERE network_bucket = NET.IPV4_TO_INT64(NET.IP_TRUNC(NET.IP_FROM_STRING('203.0.113.100'), 16))
  AND NET.IPV4_TO_INT64(NET.IP_FROM_STRING('203.0.113.100')) BETWEEN start_int AND end_int;
```

## IPv6 Lookup

The query depends on your `ipv6_bucket_type` configuration.

**Note:** For IPv6 files, `start_int` and `end_int` columns are stored as
16-byte binary values, not integers. The comparison with `NET.IP_FROM_STRING()`
works because it also returns BYTES.

**Using default `ipv6_bucket_type = "string"` (hex string):**

```sql
-- Using default ipv6_bucket_size = 16
SELECT *
FROM `project.dataset.geoip_v6`
WHERE network_bucket = TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING('2001:db8::1'), 16))
  AND NET.IP_FROM_STRING('2001:db8::1') BETWEEN start_int AND end_int;
```

**Using `ipv6_bucket_type = "int"` (60-bit int64):**

```sql
-- Using default ipv6_bucket_size = 16
SELECT *
FROM `project.dataset.geoip_v6`
WHERE network_bucket = CAST(CONCAT('0x', SUBSTR(
    TO_HEX(NET.IP_TRUNC(NET.IP_FROM_STRING('2001:db8::1'), 16)), 1, 15
  )) AS INT64)
  AND NET.IP_FROM_STRING('2001:db8::1') BETWEEN start_int AND end_int;
```

The int-type expression extracts the first 60 bits (15 hex characters) of the
truncated IPv6 address as an integer.

## Why Bucketing Helps

Without bucketing, BigQuery must scan every row to check the range condition.
With bucketing:

1. BigQuery first filters by exact match on `network_bucket`
2. Only rows in the matching bucket are checked against the range condition
3. Result: the query scans only rows in the matching bucket instead of the
   entire table
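
If you want to sanity-check the bucket expressions above without running
BigQuery, the following standalone Go sketch (our illustration, not part of
mmdbconvert; the `truncate` helper is hypothetical) computes the same three
bucket forms for the example addresses:

```go
package main

import (
	"encoding/hex"
	"fmt"
	"net/netip"
	"strconv"
)

// truncate zeroes every bit of addr beyond the given prefix length,
// mirroring BigQuery's NET.IP_TRUNC.
func truncate(addr netip.Addr, bits int) netip.Addr {
	p, _ := addr.Prefix(bits)
	return p.Addr()
}

func main() {
	// IPv4: the bucket is the truncated address as an integer, like
	// NET.IPV4_TO_INT64(NET.IP_TRUNC(ip, 16)).
	v4 := truncate(netip.MustParseAddr("203.0.113.100"), 16).As4()
	v4Bucket := uint32(v4[0])<<24 | uint32(v4[1])<<16 | uint32(v4[2])<<8 | uint32(v4[3])
	fmt.Println(v4Bucket) // 3405774848, i.e. 203.0.0.0

	// IPv6 string bucket: the 16-byte truncated address as hex, like
	// TO_HEX(NET.IP_TRUNC(ip, 16)).
	v6 := truncate(netip.MustParseAddr("2001:db8::1"), 16).As16()
	hexBucket := hex.EncodeToString(v6[:])
	fmt.Println(hexBucket) // 20010000000000000000000000000000

	// IPv6 int bucket: the first 60 bits (15 hex chars) as an int64, like
	// CAST(CONCAT('0x', SUBSTR(TO_HEX(...), 1, 15)) AS INT64).
	intBucket, _ := strconv.ParseInt(hexBucket[:15], 16, 64)
	fmt.Println(intBucket) // always non-negative: 60 bits fit in a positive int64
}
```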

docs/config.md

Lines changed: 59 additions & 6 deletions

````diff
@@ -72,17 +72,40 @@ When `format = "csv"`, you can specify CSV-specific options:
 [output.csv]
 delimiter = "," # Field delimiter (default: ",")
 include_header = true # Include column headers (default: true)
+ipv4_bucket_size = 16 # Bucket prefix length for IPv4 (default: 16)
+ipv6_bucket_size = 16 # Bucket prefix length for IPv6 (default: 16)
+ipv6_bucket_type = "string" # IPv6 bucket value type: "string" or "int" (default: "string")
 ```

+| Option             | Description                                                                 | Default  |
+| ------------------ | --------------------------------------------------------------------------- | -------- |
+| `delimiter`        | Field delimiter character                                                    | ","      |
+| `include_header`   | Include column headers in output                                             | true     |
+| `ipv4_bucket_size` | Prefix length for IPv4 buckets (1-32, when `network_bucket` column used)     | 16       |
+| `ipv6_bucket_size` | Prefix length for IPv6 buckets (1-60, when `network_bucket` column used)     | 16       |
+| `ipv6_bucket_type` | IPv6 bucket value type: "string" (hex) or "int" (first 60 bits as integer)   | "string" |
+
 #### Parquet Options

 When `format = "parquet"`, you can specify Parquet-specific options:

 ```toml
 [output.parquet]
-compression = "snappy" # Compression: "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
+compression = "snappy"      # Compression: "none", "snappy", "gzip", "lz4", "zstd" (default: "snappy")
+row_group_size = 500000     # Rows per row group (default: 500000)
+ipv4_bucket_size = 16       # Bucket prefix length for IPv4 (default: 16)
+ipv6_bucket_size = 16       # Bucket prefix length for IPv6 (default: 16)
+ipv6_bucket_type = "string" # IPv6 bucket value type: "string" or "int" (default: "string")
 ```

+| Option             | Description                                                                 | Default  |
+| ------------------ | --------------------------------------------------------------------------- | -------- |
+| `compression`      | Compression codec: "none", "snappy", "gzip", "lz4", "zstd"                   | "snappy" |
+| `row_group_size`   | Number of rows per row group                                                 | 500000   |
+| `ipv4_bucket_size` | Prefix length for IPv4 buckets (1-32, when `network_bucket` column used)     | 16       |
+| `ipv6_bucket_size` | Prefix length for IPv6 buckets (1-60, when `network_bucket` column used)     | 16       |
+| `ipv6_bucket_type` | IPv6 bucket value type: "string" (hex) or "int" (first 60 bits as integer)   | "string" |
+
 #### MMDB Options

 When `format = "mmdb"`, you can specify MMDB-specific options:
@@ -121,6 +144,33 @@ ipv6_file = "merged_ipv6.parquet"

 When splitting output, both `ipv4_file` and `ipv6_file` must be configured.

+#### IPv6 Bucket Type Options
+
+IPv6 buckets can be stored as either hex strings (default) or int64 values:
+
+**String type (default):**
+
+- Format: 32-character hex string (e.g., "20010db8000000000000000000000000")
+- Storage: 32 bytes per value
+
+**Int type (`ipv6_bucket_type = "int"`):**
+
+- Format: first 60 bits of the bucket address as an int64
+- Storage: 8 bytes per value (4x smaller than the string form)
+
+We use 60 bits (not 64) because 60-bit values always fit in a positive int64,
+which simplifies queries by avoiding two's complement handling.
+
+**When to use each type:**
+
+- Use **string** (default) for databases where hex string representations are
+  simpler to work with.
+- Use **int** for reduced storage cost at the price of more complicated queries.
+
+We do not provide a `bytes` type for the IPv6 bucket, primarily because so far
+there has not been a need. For example, BigQuery cannot cluster on `bytes`, so
+it would not be helpful there.
+
 ### Network Columns

 Network columns define how IP network information is output. These columns
@@ -134,11 +184,14 @@ type = "cidr" # Output type

 **Available types:**

-- `cidr` - CIDR notation (e.g., "203.0.113.0/24")
-- `start_ip` - Starting IP address (e.g., "203.0.113.0")
-- `end_ip` - Ending IP address (e.g., "203.0.113.255")
-- `start_int` - Starting IP as integer
-- `end_int` - Ending IP as integer
+| Type             | Description                                                                                                                                                        |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `cidr`           | CIDR notation (e.g., "203.0.113.0/24")                                                                                                                               |
+| `start_ip`       | Starting IP address (e.g., "203.0.113.0")                                                                                                                            |
+| `end_ip`         | Ending IP address (e.g., "203.0.113.255")                                                                                                                            |
+| `start_int`      | Starting IP as integer                                                                                                                                               |
+| `end_int`        | Ending IP as integer                                                                                                                                                 |
+| `network_bucket` | Bucket for efficient lookups. IPv4: integer. IPv6: hex string (default) or integer (with `ipv6_bucket_type = "int"`). Requires split files (CSV and Parquet only).   |

 **Default behavior:** If no `[[network.columns]]` sections are defined:
````
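
As a quick illustration of the two's-complement point in the IPv6 bucket type
section above (our example, using a made-up bucket value, not output from
mmdbconvert):

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// A full 64-bit bucket with the high bit set turns negative when
	// reinterpreted as int64 (two's complement):
	full := uint64(0xffff0000_00000000) // hypothetical bucket for ffff::/16
	fmt.Println(int64(full))            // -281474976710656

	// Capping buckets at 60 bits keeps every value positive:
	fmt.Println(uint64(1)<<60 - 1)    // 1152921504606846975, the largest bucket
	fmt.Println(int64(math.MaxInt64)) // 9223372036854775807
}
```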
