Skip to content

Commit 9d37b40

Browse files
committed
feat(collection): add bands,bounds,__repr__, fix metadata roundtrip, clean API surface
- Add `bands` property (band codes from `_metadata` columns) - Add `bounds` property (spatial extent via pyarrow.compute min/max) - Add `__repr__` showing name, source, bands, records, date range - Add return type annotations: get_xarray -> xr.Dataset, get_gdf -> gpd.GeoDataFrame, to_torchgeo_dataset -> RasteretGeoDataset - Restore metadata (name, data_source, description, date_range) from parquet schema on load -- fixes empty data_source after roundtrip - Make from_parquet() Hive-aware (tries Hive first, falls back) - Rename from_local() -> _load_cached() to clarify internal vs public API - Add xr.Dataset return type to get_collection_xarray() in execution.py Signed-off-by: print-sid8 sidsub94@gmail.com
1 parent f6098cc commit 9d37b40

File tree

8 files changed

+150
-31
lines changed

8 files changed

+150
-31
lines changed

src/rasteret/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def build_from_stac(
115115
collection_path = workspace_dir_path / f"{collection_name}_stac"
116116

117117
if collection_path.exists() and not force:
118-
return Collection.from_local(collection_path)
118+
return Collection._load_cached(collection_path)
119119

120120
from rasteret.cloud import CloudConfig, backend_config_from_cloud_config
121121
from rasteret.ingest.stac_indexer import StacCollectionBuilder
@@ -666,7 +666,7 @@ def build_from_table(
666666
if resolved_workspace is not None:
667667
rw = Path(resolved_workspace)
668668
if rw.exists() and not force:
669-
return Collection.from_local(rw)
669+
return Collection._load_cached(rw)
670670

671671
# Arrow-native path: accept an in-memory Arrow table / dataset.
672672
import pyarrow as pa

src/rasteret/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _handle_cache_list(args: argparse.Namespace) -> int:
181181
def _handle_cache_info(args: argparse.Namespace) -> int:
182182
workspace_dir = _workspace_dir(args.workspace_dir)
183183
collection_path = _resolve_collection_path(args.name, workspace_dir)
184-
collection = Collection.from_local(collection_path)
184+
collection = Collection._load_cached(collection_path)
185185
summary = _collection_summary(collection, collection_path)
186186

187187
if args.json:
@@ -220,7 +220,7 @@ def _handle_cache_import(args: argparse.Namespace) -> int:
220220
collection_path = workspace_dir / f"{args.name}_records"
221221
if collection_path.exists():
222222
if not args.force:
223-
collection = Collection.from_local(collection_path)
223+
collection = Collection._load_cached(collection_path)
224224
summary = _collection_summary(collection, collection_path)
225225
if args.json:
226226
print(json.dumps(summary, indent=2))

src/rasteret/core/collection.py

Lines changed: 133 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from collections.abc import Sequence
1212
from datetime import datetime
1313
from pathlib import Path
14-
from typing import Any, AsyncIterator
14+
from typing import TYPE_CHECKING, Any, AsyncIterator
1515

1616
import pandas as pd
1717
import pyarrow as pa
@@ -23,6 +23,12 @@
2323
from rasteret.core.raster_accessor import RasterAccessor
2424
from rasteret.types import RasterInfo
2525

26+
if TYPE_CHECKING:
27+
import geopandas as gpd
28+
import xarray as xr
29+
30+
from rasteret.integrations.torchgeo import RasteretGeoDataset
31+
2632
logger = logging.getLogger(__name__)
2733

2834
# WKB geometry type id → GeoParquet type name (OGC Simple Features).
@@ -83,7 +89,7 @@ class Collection:
8389
Examples
8490
--------
8591
# From partitioned dataset
86-
>>> collection = Collection.from_local("path/to/dataset")
92+
>>> collection = Collection.from_parquet("path/to/dataset")
8793
8894
# Filter and process
8995
>>> filtered = collection.subset(cloud_cover_lt=20)
@@ -136,12 +142,31 @@ def _view(self, dataset: ds.Dataset) -> Collection:
136142
end_date=self.end_date,
137143
)
138144

145+
@staticmethod
146+
def _metadata_from_schema(dataset: ds.Dataset) -> dict[str, str]:
147+
"""Extract Rasteret metadata stored by ``export()``."""
148+
raw = dataset.schema.metadata or {}
149+
out: dict[str, str] = {}
150+
for key in (b"name", b"data_source", b"description", b"date_range"):
151+
val = raw.get(key)
152+
if val:
153+
try:
154+
out[key.decode()] = val.decode("utf-8")
155+
except (UnicodeDecodeError, AttributeError):
156+
pass
157+
return out
158+
139159
@classmethod
140-
def from_local(cls, path: str | Path) -> Collection:
141-
"""Create collection from a local Parquet dataset.
160+
def _load_cached(cls, path: str | Path) -> Collection:
161+
"""Load a Collection from a workspace cache directory.
142162
143-
Tries Hive-style partitioning first (year/month), falls back to
144-
plain Parquet if the directory isn't Hive-partitioned.
163+
Internal fast-path for ``build()`` / ``build_from_table()`` cache
164+
hits. Trusts the data (no schema validation), strips workspace
165+
suffixes (``_stac``, ``_records``) from the name, and detects
166+
Hive partitioning.
167+
168+
For user-facing loading, use :meth:`from_parquet` or
169+
:func:`rasteret.load` instead.
145170
"""
146171
path = Path(path)
147172
if not path.exists():
@@ -161,22 +186,53 @@ def from_local(cls, path: str | Path) -> Collection:
161186
exclude_invalid_files=True,
162187
)
163188

164-
name = path.stem.removesuffix("_stac").removesuffix("_records")
165-
return cls(dataset=dataset, name=name)
189+
meta = cls._metadata_from_schema(dataset)
190+
name = meta.get("name") or path.stem.removesuffix("_stac").removesuffix(
191+
"_records"
192+
)
193+
194+
start_date = None
195+
end_date = None
196+
dr = meta.get("date_range", "")
197+
if "," in dr:
198+
start_date, end_date = dr.split(",", 1)
199+
200+
return cls(
201+
dataset=dataset,
202+
name=name,
203+
data_source=meta.get("data_source", ""),
204+
description=meta.get("description", ""),
205+
start_date=start_date,
206+
end_date=end_date,
207+
)
166208

167209
@classmethod
168210
def from_parquet(cls, path: str | Path, name: str = "") -> Collection:
169211
"""Load a Collection from any Parquet file or directory.
170212
171-
The Parquet must contain the core columns:
172-
``id``, ``datetime``, ``geometry``, ``assets``, ``scene_bbox``.
213+
Tries Hive-style partitioning first (year/month), falls back to
214+
plain Parquet. Validates that the core contract columns are present.
215+
173216
See the `Schema Contract <../explanation/schema-contract/>`_ docs page.
174217
"""
175218
path = Path(path)
176219
if not path.exists():
177220
raise FileNotFoundError(f"Parquet not found at {path}")
178221

179-
dataset = ds.dataset(str(path), format="parquet")
222+
try:
223+
dataset = ds.dataset(
224+
str(path),
225+
format="parquet",
226+
partitioning="hive",
227+
exclude_invalid_files=True,
228+
)
229+
except pa.ArrowInvalid:
230+
dataset = ds.dataset(
231+
str(path),
232+
format="parquet",
233+
exclude_invalid_files=True,
234+
)
235+
180236
required = {"id", "datetime", "geometry", "assets", "scene_bbox"}
181237
missing = required - set(dataset.schema.names)
182238
if missing:
@@ -185,8 +241,23 @@ def from_parquet(cls, path: str | Path, name: str = "") -> Collection:
185241
"See the Schema Contract page in docs for the expected schema."
186242
)
187243

188-
name = name or path.stem
189-
return cls(dataset=dataset, name=name)
244+
meta = cls._metadata_from_schema(dataset)
245+
resolved_name = name or meta.get("name") or path.stem
246+
247+
start_date = None
248+
end_date = None
249+
dr = meta.get("date_range", "")
250+
if "," in dr:
251+
start_date, end_date = dr.split(",", 1)
252+
253+
return cls(
254+
dataset=dataset,
255+
name=resolved_name,
256+
data_source=meta.get("data_source", ""),
257+
description=meta.get("description", ""),
258+
start_date=start_date,
259+
end_date=end_date,
260+
)
190261

191262
def subset(
192263
self,
@@ -589,6 +660,52 @@ async def get_first_raster(self) -> RasterAccessor:
589660
return raster
590661
raise ValueError("No raster records found in collection")
591662

663+
@property
664+
def bands(self) -> list[str]:
665+
"""Available band codes in this collection."""
666+
if self.dataset is None:
667+
return []
668+
return [
669+
c.removesuffix("_metadata")
670+
for c in self.dataset.schema.names
671+
if c.endswith("_metadata")
672+
]
673+
674+
@property
675+
def bounds(self) -> tuple[float, float, float, float] | None:
676+
"""Spatial extent as ``(minx, miny, maxx, maxy)`` or ``None``."""
677+
if self.dataset is None:
678+
return None
679+
names = set(self.dataset.schema.names)
680+
cols = ("bbox_minx", "bbox_miny", "bbox_maxx", "bbox_maxy")
681+
if not all(c in names for c in cols):
682+
return None
683+
t = self.dataset.to_table(columns=list(cols))
684+
return (
685+
pc.min(t["bbox_minx"]).as_py(),
686+
pc.min(t["bbox_miny"]).as_py(),
687+
pc.max(t["bbox_maxx"]).as_py(),
688+
pc.max(t["bbox_maxy"]).as_py(),
689+
)
690+
691+
def __repr__(self) -> str:
692+
n_bands = len(self.bands)
693+
try:
694+
n_rows = self.dataset.count_rows() if self.dataset is not None else 0
695+
except Exception:
696+
n_rows = "?"
697+
698+
parts = [f"Collection({self.name!r}"]
699+
if self.data_source:
700+
parts.append(f"source={self.data_source!r}")
701+
parts.append(f"bands={n_bands}")
702+
parts.append(f"records={n_rows}")
703+
if self.start_date and self.end_date:
704+
s = str(self.start_date)[:10]
705+
e = str(self.end_date)[:10]
706+
parts.append(f"{s}..{e}")
707+
return ", ".join(parts) + ")"
708+
592709
def _validate_parquet_dataset(self) -> None:
593710
"""Basic dataset validation."""
594711
if not isinstance(self.dataset, ds.Dataset):
@@ -724,7 +841,7 @@ def to_torchgeo_dataset(
724841
backend: Any = None,
725842
time_series: bool = False,
726843
target_crs: int | None = None,
727-
) -> Any:
844+
) -> RasteretGeoDataset:
728845
"""Create a TorchGeo GeoDataset backed by this Collection.
729846
730847
This integration is optional and requires ``torchgeo`` and its
@@ -830,7 +947,7 @@ def get_xarray(
830947
backend: Any = None,
831948
target_crs: int | None = None,
832949
**filters: Any,
833-
) -> Any:
950+
) -> xr.Dataset:
834951
"""Load selected bands into an xarray Dataset.
835952
836953
Parameters
@@ -886,7 +1003,7 @@ def get_gdf(
8861003
backend: Any = None,
8871004
target_crs: int | None = None,
8881005
**filters: Any,
889-
) -> Any:
1006+
) -> gpd.GeoDataFrame:
8901007
"""Load selected bands into a GeoDataFrame.
8911008
8921009
Parameters

src/rasteret/core/execution.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
from rasteret.core.utils import infer_data_source, run_sync
2828

2929
if TYPE_CHECKING: # pragma: no cover
30+
import xarray as xr
31+
3032
from rasteret.core.collection import Collection
3133

3234
logger = logging.getLogger(__name__)
@@ -223,7 +225,7 @@ def get_collection_xarray(
223225
backend: object | None = None,
224226
target_crs: int | None = None,
225227
**filters: Any,
226-
):
228+
) -> xr.Dataset:
227229
"""Load selected bands as an ``xarray.Dataset``.
228230
229231
Parameters

src/rasteret/tests/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_cache_build_passes_args_and_returns_summary(
9191
def fake_build_from_stac(**kwargs):
9292
captured.update(kwargs)
9393
_write_cached_collection(cache_dir)
94-
return Collection.from_local(cache_dir)
94+
return Collection._load_cached(cache_dir)
9595

9696
monkeypatch.setattr("rasteret.cli.build_from_stac", fake_build_from_stac)
9797

src/rasteret/tests/test_execution.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def _make_collection_and_infer(
6969
pq.write_to_dataset(
7070
table, root_path=str(path), partition_cols=["year", "month"]
7171
)
72-
c = Collection.from_local(path)
72+
c = Collection._load_cached(path)
7373
c.data_source = data_source
7474
return infer_data_source(c)
7575

@@ -161,7 +161,7 @@ def test_single_crs_returns_none(self):
161161
path = Path(tmp) / "single_crs"
162162
path.mkdir()
163163
pq.write_table(table, str(path / "data.parquet"))
164-
c = Collection.from_local(path)
164+
c = Collection._load_cached(path)
165165
assert _detect_target_crs(c, {}) is None
166166

167167
def test_multi_crs_returns_most_common(self):
@@ -171,7 +171,7 @@ def test_multi_crs_returns_most_common(self):
171171
path = Path(tmp) / "multi_crs"
172172
path.mkdir()
173173
pq.write_table(table, str(path / "data.parquet"))
174-
c = Collection.from_local(path)
174+
c = Collection._load_cached(path)
175175
result = _detect_target_crs(c, {})
176176
assert result == 32632
177177

@@ -182,6 +182,6 @@ def test_multi_crs_equal_counts_picks_one(self):
182182
path = Path(tmp) / "equal_crs"
183183
path.mkdir()
184184
pq.write_table(table, str(path / "data.parquet"))
185-
c = Collection.from_local(path)
185+
c = Collection._load_cached(path)
186186
result = _detect_target_crs(c, {})
187187
assert result in (32632, 32633)

src/rasteret/tests/test_ingest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ def test_build_from_table_workspace_dir_persists(self, tmp_path):
385385
# workspace_dir gets _records suffix for discoverability
386386
expected = out_dir / "demo_records"
387387
assert expected.exists()
388-
reloaded = Collection.from_local(expected)
388+
reloaded = Collection._load_cached(expected)
389389
assert reloaded.dataset is not None
390390
assert reloaded.dataset.count_rows() == collection.dataset.count_rows()
391391

src/rasteret/tests/test_public_api_surface.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_collection_analysis_methods_delegate_to_execution_layer() -> None:
5959
with TemporaryDirectory() as tmp_dir:
6060
dataset_path = Path(tmp_dir) / "example_stac"
6161
_write_minimal_partitioned_collection(dataset_path)
62-
collection = Collection.from_local(dataset_path)
62+
collection = Collection._load_cached(dataset_path)
6363

6464
with (
6565
patch(
@@ -134,10 +134,10 @@ def test_load_rejects_missing_file() -> None:
134134
rasteret.load("/nonexistent/path.parquet")
135135

136136

137-
def test_from_local_fallback_to_non_hive() -> None:
138-
"""from_local should work on non-Hive partitioned parquet."""
137+
def test_from_parquet_fallback_to_non_hive() -> None:
138+
"""from_parquet should work on non-Hive partitioned parquet."""
139139
with TemporaryDirectory() as tmp_dir:
140140
path = Path(tmp_dir) / "flat.parquet"
141141
_write_minimal_flat_collection(path)
142-
collection = Collection.from_local(path)
142+
collection = Collection.from_parquet(path)
143143
assert isinstance(collection, Collection)

0 commit comments

Comments
 (0)