Commit b96c5b2

Import with npix directories (#613)
1 parent 9c0417f commit b96c5b2

File tree: 4 files changed (+58 −2 lines)

- src/hats_import/catalog/arguments.py
- src/hats_import/catalog/map_reduce.py
- src/hats_import/catalog/run_import.py
- tests/hats_import/catalog/test_run_import.py

src/hats_import/catalog/arguments.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -52,7 +52,10 @@ class ImportArguments(RuntimeArguments):
     add_healpix_29: bool = True
     """add the healpix-based hats spatial index field alongside the data"""
     npix_suffix: str = ".parquet"
-    """Suffix for Npix files."""
+    """Suffix for pixel data. When specified as "/" each pixel will have a directory in its name."""
+    npix_parquet_name: str | None = None
+    """Name of the pixel parquet file to be used when npix_suffix=/. By default, it will be named
+    after the pixel with a .parquet extension (e.g. 'Npix=10.parquet')"""
     write_table_kwargs: dict | None = None
     """additional keyword arguments to use when writing files to parquet (e.g. compression schemes)."""
     row_group_kwargs: dict | None = None
```
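
For orientation, a minimal usage sketch of the two arguments together (values mirror the new test below; the input/output paths are hypothetical and the Dask client is assumed, neither is part of this diff):

```python
from dask.distributed import Client

import hats_import.catalog.run_import as runner
from hats_import.catalog.arguments import ImportArguments

# A sketch, not the project's documented example: paths are placeholders.
args = ImportArguments(
    output_artifact_name="my_catalog",
    input_path="data/csv_parts",    # hypothetical directory of CSV shards
    file_reader="csv",
    output_path="catalogs",
    highest_healpix_order=0,
    npix_suffix="/",                # write each pixel as a directory ...
    npix_parquet_name="0.parquet",  # ... containing this file (defaults to 'Npix=<pixel>.parquet')
)

with Client() as client:
    runner.run(args, client)
```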

src/hats_import/catalog/map_reduce.py

Lines changed: 13 additions & 1 deletion

```diff
@@ -11,6 +11,7 @@
 import pyarrow.parquet as pq
 from hats import pixel_math
 from hats.io import file_io, paths
+from hats.io.paths import PARTITION_PIXEL
 from hats.pixel_math.healpix_pixel import HealpixPixel
 from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram
 from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN, spatial_index_to_healpix
@@ -199,7 +200,7 @@ def split_pixels(
         raise exception
 
 
-# pylint: disable=too-many-positional-arguments
+# pylint: disable=too-many-positional-arguments,too-many-statements
 def reduce_pixel_shards(
     cache_shard_path,
     resume_path,
@@ -218,6 +219,7 @@ def reduce_pixel_shards(
     write_table_kwargs=None,
     row_group_kwargs=None,
     npix_suffix=".parquet",
+    npix_parquet_name=None,
 ):
     """Reduce sharded source pixels into destination pixels.
 
@@ -250,6 +252,9 @@
         row_group_kwargs (dict): additional keyword arguments to use in
             creation of rowgroups when writing files to parquet.
         npix_suffix (str): suffix for Npix files. Defaults to ".parquet".
+        npix_parquet_name (str): name of the pixel parquet file to be used
+            when npix_suffix=/. By default, it will be named after the pixel
+            with a .parquet extension (e.g. 'Npix=10.parquet').
 
     Raises:
         ValueError: if the number of rows written doesn't equal provided
@@ -263,6 +268,13 @@
 
     healpix_pixel = HealpixPixel(destination_pixel_order, destination_pixel_number)
     destination_file = paths.pixel_catalog_file(output_path, healpix_pixel, npix_suffix=npix_suffix)
+
+    if npix_parquet_name is None:
+        npix_parquet_name = f"{PARTITION_PIXEL}={healpix_pixel.pixel}.parquet"
+    if npix_suffix == "/":
+        destination_file.mkdir(exist_ok=True)
+        destination_file = destination_file / npix_parquet_name
+
     if destination_file.exists():
         rows_written = file_io.read_parquet_metadata(destination_file).num_rows
         if rows_written != destination_pixel_size:
```
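
To make the new destination-path logic concrete, a small sketch using only the calls visible in this diff (the root path string and commented output paths are illustrative, and it is assumed here that `pixel_catalog_file` accepts a plain path string):

```python
from hats.io import paths
from hats.io.paths import PARTITION_PIXEL
from hats.pixel_math.healpix_pixel import HealpixPixel

pixel = HealpixPixel(0, 11)

# Default behavior: the pixel is a single file, e.g. .../Dir=0/Npix=11.parquet
file_path = paths.pixel_catalog_file("catalog_root", pixel, npix_suffix=".parquet")

# New behavior: with npix_suffix="/", the same call yields a directory,
# e.g. .../Dir=0/Npix=11, and the parquet file is written inside it.
dir_path = paths.pixel_catalog_file("catalog_root", pixel, npix_suffix="/")
default_name = f"{PARTITION_PIXEL}={pixel.pixel}.parquet"  # "Npix=11.parquet"
parquet_path = dir_path / default_name                     # .../Npix=11/Npix=11.parquet
```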

src/hats_import/catalog/run_import.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -121,6 +121,7 @@ def run(args, client):
                 write_table_kwargs=args.write_table_kwargs,
                 row_group_kwargs=args.row_group_kwargs,
                 npix_suffix=args.npix_suffix,
+                npix_parquet_name=args.npix_parquet_name,
             )
         )
 
```

tests/hats_import/catalog/test_run_import.py

Lines changed: 40 additions & 0 deletions

```diff
@@ -384,3 +384,43 @@ def test_import_mismatch_expectation(
 
     with pytest.raises(ValueError, match="does not match expectation"):
         runner.run(args, dask_client)
+
+
+@pytest.mark.dask
+def test_import_with_npix_dir(dask_client, small_sky_parts_dir, tmp_path, assert_parquet_file_ids):
+    """Test that we can create a catalog where the partition data
+    is stored inside a directory: npix_suffix=/"""
+    args = ImportArguments(
+        output_artifact_name="small_sky_object_catalog",
+        input_path=small_sky_parts_dir,
+        file_reader="csv",
+        output_path=tmp_path,
+        dask_tmp=tmp_path,
+        tmp_dir=tmp_path,
+        highest_healpix_order=0,
+        pixel_threshold=1000,
+        npix_suffix="/",
+        progress_bar=False,
+    )
+    runner.run(args, dask_client)
+
+    catalog = read_hats(args.catalog_path)
+    assert catalog.catalog_info.total_rows == 131
+    assert len(catalog.get_healpix_pixels()) == 1
+    assert catalog.catalog_info.npix_suffix == "/"
+
+    pix_dir = Path(args.catalog_path) / "dataset" / "Norder=0" / "Dir=0" / "Npix=11"
+    expected_ids = [*range(700, 831)]
+
+    # The file exists and contains the expected object IDs
+    output_file = pix_dir / "Npix=11.parquet"
+    assert_parquet_file_ids(output_file, "id", expected_ids)
+
+    # Try with a custom npix_parquet_name
+    shutil.rmtree(tmp_path / "small_sky_object_catalog")
+    args.npix_parquet_name = "0.parquet"
+    runner.run(args, dask_client)
+
+    # The file exists and contains the expected object IDs
+    output_file = pix_dir / "0.parquet"
+    assert_parquet_file_ids(output_file, "id", expected_ids)
```
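
Based on the paths asserted above, the resulting on-disk layout with npix_suffix="/" looks roughly like this (a sketch, not part of the diff):

```
small_sky_object_catalog/
└── dataset/
    └── Norder=0/
        └── Dir=0/
            └── Npix=11/              <- a directory, because npix_suffix="/"
                └── Npix=11.parquet   <- default name; "0.parquet" when npix_parquet_name="0.parquet"
```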
