Skip to content

Commit b1eaf8e

Browse files
committed
feat: added export all collections.
1 parent a9b56bc commit b1eaf8e

File tree

4 files changed

+82
-24
lines changed

4 files changed

+82
-24
lines changed

charts/pgstac-geoparquet-exporter/README.md

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,46 +25,62 @@ helm install exporter ./charts/pgstac-geoparquet-exporter \
2525
--create-namespace \
2626
--set database.existingSecret=pgstac-db \
2727
--set storage.existingSecret=s3-creds \
28-
--set storage.outputPath=s3://my-bucket/exports
28+
--set storage.outputPath=s3://my-bucket/exports \
29+
--set stacApiUrl=https://example.com/stac/v1
2930
```
3031

3132
## Configuration
3233

3334
Edit `values.yaml` or create a custom values file:
3435

3536
```yaml
37+
# STAC API URL (required for link injection)
38+
stacApiUrl: "https://example.com/stac/v1"
39+
3640
# Collections to export
3741
exportConfig:
42+
# Export all collections from database (ignores collections list below)
43+
exportAll: false
44+
45+
# Specify individual collections (or leave empty and set exportAll: true)
3846
collections:
3947
- name: sentinel-2
4048
partition_by: year
4149
start_year: 2015
4250
- name: landsat-8
4351
partition_by: null # Single file
4452

45-
# Schedules
46-
completeExport:
47-
schedule: "0 2 1 * *" # Monthly
48-
incrementalExport:
49-
schedule: "0 3 * * *" # Daily
50-
51-
# Resources
52-
incrementalExport:
53-
resources:
54-
requests:
55-
memory: "1Gi"
56-
cpu: "250m"
53+
# Job schedules and resources
54+
jobs:
55+
complete:
56+
schedule: "0 2 1 * *" # Monthly
57+
resources:
58+
requests:
59+
memory: "2Gi"
60+
cpu: "500m"
61+
62+
incremental:
63+
schedule: "0 3 * * *" # Daily
64+
resources:
65+
requests:
66+
memory: "1Gi"
67+
cpu: "250m"
5768
```
5869
5970
## Key Parameters
6071
6172
| Parameter | Description | Default |
6273
|-----------|-------------|---------|
63-
| `database.existingSecret` | DB credentials secret | `default-pguser-eoapi` |
64-
| `storage.outputPath` | Output path (s3:// or local) | `s3://eoapi-geoparquet/geoparquet` |
65-
| `storage.existingSecret` | S3 credentials secret | `data-access` |
66-
| `completeExport.schedule` | Complete export cron | `"0 2 1 * *"` |
67-
| `incrementalExport.schedule` | Incremental export cron | `"0 3 * * *"` |
74+
| `stacApiUrl` | **Required**. STAC API URL for link injection | `""` |
75+
| `database.existingSecret` | DB credentials secret name | `default-pguser-eoapi` |
76+
| `storage.outputPath` | Output path (s3:// or local) | `s3://some-bucket/geoparquet` |
77+
| `storage.existingSecret` | S3 credentials secret name | `""` |
78+
| `storage.endpoint` | Custom S3 endpoint URL | `""` |
79+
| `storage.region` | AWS region | `""` |
80+
| `jobs.complete.schedule` | Complete export cron schedule | `"0 2 1 * *"` |
81+
| `jobs.incremental.schedule` | Incremental export cron schedule | `"0 3 * * *"` |
82+
| `exportConfig.exportAll` | Export all collections from database | `false` |
83+
| `exportConfig.collections[].partition_by` | Partitioning: `null`, `year`, `month` | `year` |
6884

6985
## Usage
7086

@@ -83,3 +99,16 @@ helm upgrade exporter ./charts/pgstac-geoparquet-exporter -n data-access -f valu
8399

84100
- **Complete**: Full export with optional yearly/monthly partitioning
85101
- **Incremental**: Only changed items since last run (state in `{OUTPUT_PATH}/.last_sync`)
102+
103+
### Export All Collections
104+
105+
Set `exportConfig.exportAll: true` to export every collection found in the pgSTAC database instead of listing them manually; the `exportConfig.collections` list is ignored while this option is enabled. This is useful when collections are added dynamically.
106+
107+
## Advanced Options
108+
109+
Additional configuration available in `values.yaml`:
110+
- `exportConfig.settings` - chunk_size, statement_timeout
111+
- `jobs` - successfulJobsHistoryLimit, failedJobsHistoryLimit, concurrencyPolicy
112+
- `extraEnv`, `extraVolumes`, `extraVolumeMounts` - Additional resources
113+
- `nodeSelector`, `tolerations`, `affinity` - Pod scheduling
114+
- `podSecurityContext`, `securityContext` - Security settings

charts/pgstac-geoparquet-exporter/templates/configmap.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ metadata:
1111
{{- end }}
1212
data:
1313
export-config.yaml: |
14+
exportAll: {{ .Values.exportConfig.exportAll }}
15+
1416
collections:
1517
{{- range .Values.exportConfig.collections }}
1618
- name: {{ .name }}

charts/pgstac-geoparquet-exporter/values.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,11 @@ storage:
6767

6868
# Export configuration
6969
exportConfig:
70-
collections:
71-
- name: ""
72-
partition_by: year # Options: null, year, month
73-
start_year: 2015
70+
# Set to true to export all collections from the database
71+
# When true, collections list below is ignored
72+
exportAll: false
73+
74+
collections: []
7475

7576
settings:
7677
chunk_size: 10000

src/pgstac_geoparquet_exporter/__main__.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from typing import Any, Callable
88

9+
import psycopg
910
import pyarrow.fs as pafs # type: ignore
1011
import yaml
1112
from stac_geoparquet.pgstac_reader import pgstac_to_parquet, sync_pgstac_to_parquet
@@ -31,6 +32,18 @@ def add_links(item: dict[str, Any]) -> dict[str, Any]:
3132
return add_links
3233

3334

35+
def get_all_collections(conninfo: str) -> list[dict[str, Any]]:
    """Return one ``{"name": <collection id>}`` entry per pgSTAC collection.

    Connects with ``conninfo``, reads every ``id`` from
    ``pgstac.collections`` and closes the connection before returning,
    whether or not the query succeeded.
    """
    connection = psycopg.connect(conninfo)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT id FROM pgstac.collections")
            rows = cursor.fetchall()
    finally:
        # Always release the connection, even if the query raised.
        connection.close()
    # Each row is a one-column tuple holding the collection id.
    return [{"name": row[0]} for row in rows]
45+
46+
3447
def main() -> int:
3548
mode = os.environ.get("EXPORT_MODE", "complete")
3649
config_path = os.environ.get("CONFIG_PATH", "/config/export-config.yaml")
@@ -56,6 +69,19 @@ def main() -> int:
5669
with open(config_path) as f:
5770
config = yaml.safe_load(f)
5871

72+
# Determine which collections to export
73+
export_all = config.get("exportAll", False)
74+
if export_all:
75+
print("Fetching all collections from database...")
76+
collections = get_all_collections(conninfo)
77+
print(f"Found {len(collections)} collections")
78+
# Apply default settings from config
79+
default_settings = config.get("exportConfig", {}).get("settings", {})
80+
for coll in collections:
81+
coll.update(default_settings)
82+
else:
83+
collections = config.get("collections", [])
84+
5985
# Configure S3 filesystem if using S3/MinIO
6086
filesystem = None
6187
if output_base.startswith("s3://"):
@@ -77,7 +103,7 @@ def main() -> int:
77103

78104
if mode == "complete":
79105
# Complete export
80-
for coll in config.get("collections", []):
106+
for coll in collections:
81107
collection_id = coll["name"]
82108
partition_frequency = coll.get("partition_frequency")
83109

@@ -126,7 +152,7 @@ def main() -> int:
126152
# Incremental mode - export only updated items
127153
print("Using incremental mode with sync_pgstac_to_parquet")
128154

129-
for coll in config.get("collections", []):
155+
for coll in collections:
130156
collection_id = coll["name"]
131157
output_path = f"{output_base}/{collection_id}"
132158

0 commit comments

Comments
 (0)