Skip to content

Commit 260a99a

Browse files
feat/use new multiscales (#75)
* add option for using new multiscales convention in optimized conversion * remove OOP converter and use plain functions * use pydantic models for fingerprinting sentinel2 product * fix multiscales to use da.coarsen and propagate encoding * ensure that dtype is preserved after resampling * add new multiscales JSON example * add mypy pydantic plugin * lint * add s1 and s2 demo data to tests, and don't test against remote urls * fix e2e tests * remove network test workflow from CI * remove extra type definition and update tests * remove explicit zarr groups in favor of dynamic test fixtures * docstrings * Enhance CRS initialization and update S2 optimization commands - Added `initialize_crs_from_dataset` function to extract CRS from dataset metadata. - Updated S2 optimization commands to include new CRS handling. - Removed unused arguments related to geometry and meteorology groups. - Added comprehensive tests for CRS initialization from various sources. * Refactor code formatting for clarity in S2 optimization functions * fix failing / warning tests * add strict JSON schema equality check to e2e tests * support both flavors of multiscale metadata * don't manage return codes in cli functions * add s2 optimized test * add optimized geozarr example hierarchies * format JSON documents * mid-debug of e2e tests * WIP e2e fixes * make cf standard name validator become a pass-through when no internet connection * update example schemas * narrow type to just tuples in types.py * refactor consolidation * use consolidated=False in conversion * update tests * lint * add both multiscales types to output * update comments in tests --------- Co-authored-by: Emmanuel Mathot <[email protected]>
1 parent 41c3490 commit 260a99a

37 files changed

+96938
-2195
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -70,44 +70,6 @@ jobs:
7070
.venv
7171
key: uv-main-${{ hashFiles('uv.lock') }}
7272

73-
test-network:
74-
runs-on: ubuntu-latest
75-
# Temporarily enabled for PR to verify test fix
76-
if: github.event_name == 'push' || github.event_name == 'pull_request'
77-
steps:
78-
- uses: actions/checkout@v5
79-
80-
- name: Set up Python
81-
uses: actions/setup-python@v6
82-
with:
83-
python-version: '3.11'
84-
85-
- name: Restore global uv cache
86-
id: cache-restore
87-
uses: actions/cache/restore@v4
88-
with:
89-
path: |
90-
~/.cache/uv
91-
~/.local/share/uv
92-
.venv
93-
key: uv-main-${{ hashFiles('uv.lock') }}
94-
restore-keys: |
95-
uv-main-
96-
97-
- name: Install uv
98-
uses: astral-sh/setup-uv@v7
99-
with:
100-
version: "0.8.4"
101-
python-version: "3.13"
102-
enable-cache: false
103-
104-
- name: Install dependencies
105-
run: uv sync --group dev --group test
106-
107-
- name: Run network tests
108-
run: |
109-
uv run pytest tests/ -v --tb=short -m "network"
110-
11173
security:
11274
runs-on: ubuntu-latest
11375
steps:

.vscode/launch.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,12 @@
168168
"module": "eopf_geozarr",
169169
"args": [
170170
"convert-s2-optimized",
171-
"https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/08/products/cpm_v256/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
171+
// "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/08/products/cpm_v256/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
172+
"https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202511-s02msil2a-eu/15/products/cpm_v262/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr",
172173
// "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-opt/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
173-
"./tests-output/eopf_geozarr/s2l2_optimized.zarr",
174-
"--spatial-chunk", "1024",
174+
"s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a-pr75/S2B_MSIL2A_20251115T091139_N0511_R050_T35SLU_20251115T111807.zarr",
175+
// "./tests-output/eopf_geozarr/s2l2_optimized.zarr",
176+
"--spatial-chunk", "512",
175177
"--compression-level", "5",
176178
"--enable-sharding",
177179
"--dask-cluster",

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,13 @@ warn_unused_ignores = true
129129
warn_no_return = true
130130
warn_unreachable = true
131131
strict_equality = true
132+
plugins = ["pydantic.mypy"]
133+
134+
[tool.pydantic-mypy]
135+
init_forbid_extra = true
136+
init_typed = true
137+
warn_required_dynamic_aliases = true
138+
warn_untyped_fields = true
132139

133140
[[tool.mypy.overrides]]
134141
module = ["zarr.*", "xarray.*", "rioxarray.*", "cf_xarray.*", "dask.*"]

src/eopf_geozarr/cli.py

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,14 +1182,6 @@ def add_s2_optimization_commands(subparsers: Any) -> None:
11821182
choices=range(1, 10),
11831183
help="Compression level 1-9 (default: 3)",
11841184
)
1185-
s2_parser.add_argument(
1186-
"--skip-geometry", action="store_true", help="Skip creating geometry group"
1187-
)
1188-
s2_parser.add_argument(
1189-
"--skip-meteorology",
1190-
action="store_true",
1191-
help="Skip creating meteorology group",
1192-
)
11931185
s2_parser.add_argument(
11941186
"--skip-validation", action="store_true", help="Skip output validation"
11951187
)
@@ -1204,7 +1196,7 @@ def add_s2_optimization_commands(subparsers: Any) -> None:
12041196
s2_parser.set_defaults(func=convert_s2_optimized_command)
12051197

12061198

1207-
def convert_s2_optimized_command(args: Any) -> int:
1199+
def convert_s2_optimized_command(args: Any) -> None:
12081200
"""Execute S2 optimized conversion command."""
12091201
# Set up dask cluster if requested
12101202
dask_client = setup_dask_cluster(
@@ -1229,22 +1221,10 @@ def convert_s2_optimized_command(args: Any) -> int:
12291221
enable_sharding=args.enable_sharding,
12301222
spatial_chunk=args.spatial_chunk,
12311223
compression_level=args.compression_level,
1232-
create_geometry_group=not args.skip_geometry,
1233-
create_meteorology_group=not args.skip_meteorology,
12341224
validate_output=not args.skip_validation,
1235-
verbose=args.verbose,
12361225
)
12371226

12381227
log.info("✅ S2 optimization completed", output_path=args.output_path)
1239-
return 0
1240-
1241-
except Exception as e:
1242-
log.info("❌ Error during S2 optimization", error=str(e))
1243-
if args.verbose:
1244-
import traceback
1245-
1246-
traceback.print_exc()
1247-
return 1
12481228
finally:
12491229
# Clean up dask client if it was created
12501230
if dask_client is not None:

src/eopf_geozarr/conversion/geozarr.py

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ def create_geozarr_dataset(
128128
dt, groups, gcp_group
129129
)
130130

131+
log.info("GeoZarr groups prepared", groups_prepared=list(geozarr_groups.keys()))
132+
131133
# Create the GeoZarr compliant store through iterative processing
132134
dt_geozarr = iterative_copy(
133135
dt,
@@ -180,11 +182,19 @@ def setup_datatree_metadata_geozarr_spec_compliant(
180182
epsg_CPM_260 = epsg_CPM_260.split(":")[-1]
181183

182184
for key in groups:
183-
if not dt[key].data_vars:
185+
# Check if key exists in DataTree by attempting to access it
186+
try:
187+
node = dt[key]
188+
except KeyError:
189+
log.info(f"Skipping group {key} - not found in DataTree")
190+
continue
191+
192+
if not node.data_vars:
193+
log.info(f"Skipping group {key} - no data variables")
184194
continue
185195

186196
log.info(f"Processing group {key} for GeoZarr compliance")
187-
ds = dt[key].to_dataset().copy()
197+
ds = node.to_dataset().copy()
188198

189199
if gcp_group is not None:
190200
ds_gcp = dt[gcp_group].to_dataset()
@@ -233,7 +243,11 @@ def setup_datatree_metadata_geozarr_spec_compliant(
233243
_setup_grid_mapping(ds, grid_mapping_var_name)
234244

235245
geozarr_groups[key] = ds
246+
log.info(f"Added {key} to geozarr_groups")
236247

248+
log.info(
249+
f"Returning geozarr_groups with {len(geozarr_groups)} groups: {list(geozarr_groups.keys())}"
250+
)
237251
return geozarr_groups
238252

239253

@@ -287,7 +301,7 @@ def iterative_copy(
287301
dt_result.to_zarr(
288302
output_path,
289303
mode="a",
290-
consolidated=True,
304+
consolidated=False,
291305
compute=True,
292306
storage_options=storage_options,
293307
)
@@ -638,7 +652,7 @@ def create_geozarr_compliant_multiscales(
638652
level=ol["level"],
639653
width=ol["width"],
640654
height=ol["height"],
641-
scale_factor=ol["scale_factor"],
655+
scale_factor=ol["scale_relative"],
642656
)
643657

644658
# Create native CRS tile matrix set
@@ -679,7 +693,7 @@ def create_geozarr_compliant_multiscales(
679693

680694
width = overview["width"]
681695
height = overview["height"]
682-
scale_factor = overview["scale_factor"]
696+
scale_factor = overview["scale_relative"]
683697

684698
log.info(
685699
f"Creating overview level (scale) {level} with scale factor {scale_factor}"
@@ -733,7 +747,7 @@ def create_geozarr_compliant_multiscales(
733747
output_path,
734748
group=overview_group,
735749
mode="w",
736-
consolidated=True,
750+
consolidated=False,
737751
zarr_format=3,
738752
encoding=encoding,
739753
align_chunks=align_chunks_flag,
@@ -824,7 +838,9 @@ def calculate_overview_levels(
824838
"zoom": zoom,
825839
"width": current_width,
826840
"height": current_height,
827-
"scale_factor": 2**level,
841+
"translation_relative": 0.0,
842+
"scale_absolute": 1.0,
843+
"scale_relative": 2**level,
828844
}
829845
overview_levels.append(overview_level) # type: ignore[arg-type]
830846

@@ -877,8 +893,8 @@ def create_native_crs_tile_matrix_set(
877893
scale_denominator = cell_size * 3779.5275
878894

879895
# Calculate matrix dimensions
880-
tile_width = overview["chunks"][1][0] if "chunks" in overview else 256
881-
tile_height = overview["chunks"][0][0] if "chunks" in overview else 256
896+
tile_width = overview["chunks"][1][0] if "chunks" in overview else 256 # type: ignore[index]
897+
tile_height = overview["chunks"][0][0] if "chunks" in overview else 256 # type: ignore[index]
882898
matrix_width = int(np.ceil(width / tile_width))
883899
matrix_height = int(np.ceil(height / tile_height))
884900

@@ -889,7 +905,7 @@ def create_native_crs_tile_matrix_set(
889905
"id": matrix_id,
890906
"scaleDenominator": scale_denominator,
891907
"cellSize": cell_size,
892-
"pointOfOrigin": [left, top],
908+
"pointOfOrigin": (left, top),
893909
"tileWidth": tile_width,
894910
"tileHeight": tile_height,
895911
"matrixWidth": matrix_width,
@@ -910,8 +926,8 @@ def create_native_crs_tile_matrix_set(
910926
"title": f"Native CRS Tile Matrix Set ({native_crs})",
911927
"crs": crs_uri,
912928
"supportedCRS": crs_uri,
913-
"orderedAxes": ["X", "Y"],
914-
"tileMatrices": tile_matrices,
929+
"orderedAxes": ("X", "Y"),
930+
"tileMatrices": tuple(tile_matrices),
915931
}
916932

917933

@@ -1802,3 +1818,7 @@ def _is_sentinel1(dt: xr.DataTree) -> bool:
18021818
return True
18031819
else:
18041820
return False
1821+
1822+
1823+
def get_zarr_group(data: xr.DataTree) -> zarr.Group:
1824+
return data._close.__self__.zarr_group

src/eopf_geozarr/data_api/geozarr/common.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import urllib
55
import urllib.request
66
from dataclasses import dataclass
7-
from typing import Annotated, Any, Mapping, Self, TypeVar
7+
from typing import Annotated, Any, Mapping, Self, TypeGuard, TypeVar
8+
from urllib.error import URLError
89

910
from cf_xarray.utils import parse_cf_standard_name_table
1011
from pydantic import AfterValidator, BaseModel, Field, model_validator
@@ -98,12 +99,9 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]:
9899

99100
req = urllib.request.Request(url, headers=headers)
100101

101-
try:
102-
with urllib.request.urlopen(req) as response:
103-
content = response.read() # Read the entire response body into memory
104-
content_fobj = io.BytesIO(content)
105-
except urllib.error.URLError as e:
106-
raise e
102+
with urllib.request.urlopen(req) as response:
103+
content = response.read() # Read the entire response body into memory
104+
content_fobj = io.BytesIO(content)
107105

108106
_info, table, _aliases = parse_cf_standard_name_table(source=content_fobj)
109107
return tuple(table.keys())
@@ -117,7 +115,13 @@ def get_cf_standard_names(url: str) -> tuple[str, ...]:
117115

118116
# this does IO against github. consider locally storing this data instead if fetching every time
119117
# is problematic.
120-
CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL)
118+
119+
try:
120+
CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL)
121+
DO_CF_NAME_VALIDATION = True
122+
except URLError:
123+
CF_STANDARD_NAMES = ()
124+
DO_CF_NAME_VALIDATION = False
121125

122126

123127
def check_standard_name(name: str) -> str:
@@ -139,12 +143,13 @@ def check_standard_name(name: str) -> str:
139143
ValueError
140144
If the standard name is not valid.
141145
"""
142-
143-
if name in CF_STANDARD_NAMES:
144-
return name
145-
raise ValueError(
146-
f"Invalid standard name: {name}. This name was not found in the list of CF standard names."
147-
)
146+
if DO_CF_NAME_VALIDATION:
147+
if name in CF_STANDARD_NAMES:
148+
return name
149+
raise ValueError(
150+
f"Invalid standard name: {name}. This name was not found in the list of CF standard names."
151+
)
152+
return name
148153

149154

150155
CFStandardName = Annotated[str, AfterValidator(check_standard_name)]
@@ -245,9 +250,9 @@ class TileMatrixSet(BaseModel):
245250
tileMatrices: tuple[TileMatrix, ...]
246251

247252

248-
class Multiscales(BaseModel, extra="allow"):
253+
class TMSMultiscales(BaseModel, extra="allow"):
249254
"""
250-
Multiscale metadata for a GeoZarr dataset.
255+
Multiscale metadata for a GeoZarr dataset based on the OGC TileMatrixSet standard
251256
252257
Attributes
253258
----------
@@ -307,4 +312,8 @@ class MultiscaleGroupAttrs(BaseModel, extra="allow"):
307312
multiscales: MultiscaleAttrs
308313
"""
309314

310-
multiscales: Multiscales
315+
multiscales: TMSMultiscales
316+
317+
318+
def is_none(data: object) -> TypeGuard[None]:
319+
return data is None

src/eopf_geozarr/data_api/geozarr/geoproj.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,14 @@
44

55
from __future__ import annotations
66

7-
from typing import Literal, Self, TypeGuard
7+
from typing import Literal, Self
88

99
from pydantic import BaseModel, Field, model_validator
1010
from typing_extensions import TypedDict
1111

12+
from eopf_geozarr.data_api.geozarr.common import is_none
1213
from eopf_geozarr.data_api.geozarr.projjson import ProjJSON
1314

14-
15-
def is_none(data: object) -> TypeGuard[None]:
16-
return data is None
17-
18-
1915
GEO_PROJ_UUID: Literal["f17cb550-5864-4468-aeb7-f3180cfb622f"] = (
2016
"f17cb550-5864-4468-aeb7-f3180cfb622f"
2117
)

0 commit comments

Comments
 (0)