Add support for legacy v0.1.0 h5ad files; implement tests for legacy categorical and dataframe formats

Claptar · Claptar · commit 1cebbbf315ab · 2026-01-23T17:12:39.000Z
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -172,3 +172,53 @@ def sample_categorical_h5ad(temp_dir):
         f.create_dataset("X", data=X)
 
     return file_path
+
+
+@pytest.fixture
+def sample_legacy_v010_h5ad(temp_dir):
+    """Create a sample h5ad file with legacy v0.1.0 categorical columns.
+
+    In v0.1.0, categorical columns are stored as:
+    - Integer code datasets with a 'categories' attribute (HDF5 object reference)
+    - Category labels stored as a dataset at __categories/<colname>
+    """
+    file_path = temp_dir / "test_legacy_v010.h5ad"
+
+    with h5py.File(file_path, "w") as f:
+        # Create obs with legacy categorical column
+        obs = f.create_group("obs")
+        obs.attrs["_index"] = "obs_names"
+        obs.attrs["encoding-type"] = "dataframe"
+        obs.attrs["encoding-version"] = "0.1.0"
+        obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"]
+        obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S"))
+
+        # Create __categories subgroup (v0.1.0 convention)
+        categories_group = obs.create_group("__categories")
+        cell_type_cats = np.array(["TypeA", "TypeB", "TypeC"], dtype="S")
+        cats_ds = categories_group.create_dataset("cell_type", data=cell_type_cats)
+
+        # Create categorical column as integer codes with reference to categories
+        codes = np.array([0, 1, 0, 2], dtype=np.int8)
+        cell_type_ds = obs.create_dataset("cell_type", data=codes)
+        # Store HDF5 object reference to categories
+        cell_type_ds.attrs["categories"] = cats_ds.ref
+
+        # Add a regular non-categorical column
+        obs.create_dataset(
+            "n_counts", data=np.array([100, 200, 150, 300], dtype=np.int32)
+        )
+
+        # Create var
+        var = f.create_group("var")
+        var.attrs["_index"] = "var_names"
+        var.attrs["encoding-type"] = "dataframe"
+        var.attrs["encoding-version"] = "0.1.0"
+        var_names = ["gene_1", "gene_2"]
+        var.create_dataset("var_names", data=np.array(var_names, dtype="S"))
+
+        # Create X as a plain dataset (legacy X has no encoding-type attribute)
+        X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], dtype=np.float32)
+        f.create_dataset("X", data=X)
+
+    return file_path
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -66,36 +66,32 @@ def test_info_depth_short_flag(self, sample_h5ad_file):
         output = result.stdout + (result.stderr or "")
         assert "<" in output
 
-    def test_info_entry_flag(self, sample_h5ad_file):
-        """Test info command with --entry flag."""
-        result = runner.invoke(app, ["info", "--entry", "X", str(sample_h5ad_file)])
+    def test_info_entry_positional(self, sample_h5ad_file):
+        """Test info command with entry as positional argument."""
+        result = runner.invoke(app, ["info", str(sample_h5ad_file), "X"])
         assert result.exit_code == 0
         output = result.stdout + (result.stderr or "")
         assert "Path:" in output
         assert "Type:" in output
 
-    def test_info_entry_short_flag(self, sample_h5ad_file):
-        """Test info command with -e short flag."""
-        result = runner.invoke(app, ["info", "-e", "obs", str(sample_h5ad_file)])
+    def test_info_entry_obs(self, sample_h5ad_file):
+        """Test info command with obs entry."""
+        result = runner.invoke(app, ["info", str(sample_h5ad_file), "obs"])
         assert result.exit_code == 0
         output = result.stdout + (result.stderr or "")
         assert "Path:" in output
         assert "dataframe" in output
 
     def test_info_entry_nested_path(self, sample_h5ad_file):
         """Test info command with nested object path."""
-        result = runner.invoke(
-            app, ["info", "-e", "uns/description", str(sample_h5ad_file)]
-        )
+        result = runner.invoke(app, ["info", str(sample_h5ad_file), "uns/description"])
         assert result.exit_code == 0
         output = result.stdout + (result.stderr or "")
         assert "Path:" in output
 
     def test_info_entry_not_found(self, sample_h5ad_file):
         """Test info command with non-existent object path."""
-        result = runner.invoke(
-            app, ["info", "-e", "nonexistent", str(sample_h5ad_file)]
-        )
+        result = runner.invoke(app, ["info", str(sample_h5ad_file), "nonexistent"])
         assert result.exit_code == 0  # Doesn't exit with error, just shows message
         output = result.stdout + (result.stderr or "")
         assert "not found" in output
@@ -114,6 +110,7 @@ def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir):
                 "dataframe",
                 str(sample_h5ad_file),
                 "obs",
+                "--output",
                 str(output),
             ],
         )
@@ -137,6 +134,7 @@ def test_export_dataframe_var(self, sample_h5ad_file, temp_dir):
                 "dataframe",
                 str(sample_h5ad_file),
                 "var",
+                "--output",
                 str(output),
             ],
         )
@@ -158,6 +156,7 @@ def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir):
                 "dataframe",
                 str(sample_h5ad_file),
                 "obs",
+                "--output",
                 str(output),
                 "--columns",
                 "obs_names,cell_type",
@@ -183,6 +182,7 @@ def test_export_dataframe_head(self, sample_h5ad_file, temp_dir):
                 "dataframe",
                 str(sample_h5ad_file),
                 "obs",
+                "--output",
                 str(output),
                 "--head",
                 "2",
@@ -205,6 +205,7 @@ def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir):
                 "dataframe",
                 str(sample_h5ad_file),
                 "invalid",
+                "--output",
                 str(output),
             ],
         )
diff --git a/tests/test_export.py b/tests/test_export.py
@@ -112,7 +112,8 @@ class TestExportDataframe:
     def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir):
         out = temp_dir / "obs.csv"
         result = runner.invoke(
-            app, ["export", "dataframe", str(sample_h5ad_file), "obs", str(out)]
+            app,
+            ["export", "dataframe", str(sample_h5ad_file), "obs", "--output", str(out)],
         )
         assert result.exit_code == 0
         assert out.exists()
@@ -125,7 +126,8 @@ def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir):
         """Test that wrong object type is rejected for dataframe export."""
         out = temp_dir / "X.csv"
         result = runner.invoke(
-            app, ["export", "dataframe", str(sample_h5ad_file), "X", str(out)]
+            app,
+            ["export", "dataframe", str(sample_h5ad_file), "X", "--output", str(out)],
         )
         assert result.exit_code == 1
         assert "obs" in result.output or "var" in result.output
diff --git a/tests/test_info_read.py b/tests/test_info_read.py
@@ -245,9 +245,59 @@ def test_col_chunk_categorical(self, sample_categorical_h5ad):
             result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache)
             assert result == ["TypeA", "TypeB", "TypeA", "TypeC"]
 
-    def test_col_chunk_unsupported(self, sample_h5ad_file):
-        """Test reading unsupported column."""
+    def test_col_chunk_not_found(self, sample_h5ad_file):
+        """Test reading non-existent column."""
         with h5py.File(sample_h5ad_file, "r") as f:
             cache = {}
-            with pytest.raises(RuntimeError, match="Unsupported column"):
+            with pytest.raises(RuntimeError, match="not found in group"):
                 col_chunk_as_strings(f["obs"], "nonexistent", 0, 5, cache)
+
+
+class TestLegacyV010Support:
+    """Tests for legacy v0.1.0 format support."""
+
+    def test_get_entry_type_legacy_categorical(self, sample_legacy_v010_h5ad):
+        """Test type detection for legacy categorical column (v0.1.0)."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            info = get_entry_type(f["obs"]["cell_type"])
+            assert info["type"] == "categorical"
+            assert info["version"] == "0.1.0"
+            assert "Legacy" in info["details"]
+
+    def test_get_entry_type_legacy_dataframe(self, sample_legacy_v010_h5ad):
+        """Test type detection for legacy dataframe (v0.1.0)."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            info = get_entry_type(f["obs"])
+            assert info["type"] == "dataframe"
+            assert info["version"] == "0.1.0"
+            assert "legacy" in info["details"].lower()
+
+    def test_read_legacy_categorical_column(self, sample_legacy_v010_h5ad):
+        """Test reading legacy categorical column."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            cache = {}
+            result = read_categorical_column(
+                f["obs"]["cell_type"], 0, 4, cache, f["obs"]
+            )
+            assert result == ["TypeA", "TypeB", "TypeA", "TypeC"]
+
+    def test_col_chunk_legacy_categorical(self, sample_legacy_v010_h5ad):
+        """Test col_chunk_as_strings with legacy categorical column."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            cache = {}
+            result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache)
+            assert result == ["TypeA", "TypeB", "TypeA", "TypeC"]
+
+    def test_col_chunk_legacy_numeric(self, sample_legacy_v010_h5ad):
+        """Test col_chunk_as_strings with legacy numeric column."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            cache = {}
+            result = col_chunk_as_strings(f["obs"], "n_counts", 0, 4, cache)
+            assert result == ["100", "200", "150", "300"]
+
+    def test_legacy_categorical_slice(self, sample_legacy_v010_h5ad):
+        """Test reading slice of legacy categorical column."""
+        with h5py.File(sample_legacy_v010_h5ad, "r") as f:
+            cache = {}
+            result = col_chunk_as_strings(f["obs"], "cell_type", 1, 3, cache)
+            assert result == ["TypeB", "TypeA"]