Skip to content

Commit 872f8c2

Browse files
timtreis, pre-commit-ci[bot], Copilot, LucaMarconato, melonora
authored
Helper function to sanitize tables (#935)
* added helper func * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * mypy * Update src/spatialdata/_utils.py Co-authored-by: Copilot <[email protected]> * fix sanitize edge case; add to docs * Apply suggestions from code review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move sanite_tables into test directly * remove unnecessary used_names --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Copilot <[email protected]> Co-authored-by: Luca Marconato <[email protected]> Co-authored-by: Wouter-Michiel Vierdag <[email protected]> Co-authored-by: Wouter-Michiel Vierdag <[email protected]>
1 parent 60be9ce commit 872f8c2

File tree

5 files changed

+356
-1
lines changed

5 files changed

+356
-1
lines changed

docs/api/operations.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,6 @@ Operations on `SpatialData` objects.
2929
.. autofunction:: are_extents_equal
3030
.. autofunction:: deepcopy
3131
.. autofunction:: get_pyramid_levels
32+
.. autofunction:: sanitize_name
33+
.. autofunction:: sanitize_table
3234
```

src/spatialdata/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,13 @@
5353
"relabel_sequential",
5454
"map_raster",
5555
"deepcopy",
56+
"sanitize_table",
57+
"sanitize_name",
5658
]
5759

5860
from spatialdata import dataloader, datasets, models, transformations
5961
from spatialdata._core._deepcopy import deepcopy
62+
from spatialdata._core._utils import sanitize_name, sanitize_table
6063
from spatialdata._core.centroids import get_centroids
6164
from spatialdata._core.concatenate import concatenate
6265
from spatialdata._core.data_extent import are_extents_equal, get_extent

src/spatialdata/_core/_utils.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
from __future__ import annotations
2+
13
from collections.abc import Iterable
24

5+
from anndata import AnnData
6+
37
from spatialdata._core.spatialdata import SpatialData
48

59

@@ -25,3 +29,138 @@ def _find_common_table_keys(sdatas: Iterable[SpatialData]) -> set[str]:
2529
common_keys.intersection_update(sdata.tables.keys())
2630

2731
return common_keys
32+
33+
34+
def sanitize_name(name: str, is_dataframe_column: bool = False) -> str:
    """
    Sanitize a name to comply with SpatialData naming rules.

    This function converts invalid names into valid ones by:
    1. Converting to string if not already
    2. Replacing every character that is not alphanumeric, "_", "-" or "." with "_"
    3. Collapsing a leading run of two or more underscores into a single underscore
    4. Returning "unnamed" for empty names and for the names "." and ".."
    5. Returning "index" instead of "_index" when the name is a dataframe column

    See a discussion on the naming rules, and how to avoid naming collisions, here:
    https://github.com/scverse/spatialdata/discussions/707

    Parameters
    ----------
    name
        The name to sanitize
    is_dataframe_column
        Whether this name is for a dataframe column (additional restrictions apply)

    Returns
    -------
    A sanitized version of the name that complies with SpatialData naming rules. If a
    sanitized name cannot be generated, it returns "unnamed".

    Examples
    --------
    >>> sanitize_name("my@invalid#name")
    'my_invalid_name'
    >>> sanitize_name("__private")
    '_private'
    >>> sanitize_name("_index", is_dataframe_column=True)
    'index'
    """
    # Convert to string if not already
    name = str(name)

    # Empty names and the path-like names "." / ".." have no usable sanitized form
    if name in {"", ".", ".."}:
        return "unnamed"

    sanitized = "".join(char if char.isalnum() or char in "_-." else "_" for char in name)

    # A "__" prefix is not allowed: keep exactly one leading underscore when the
    # name starts with two or more of them (e.g. "___x" -> "_x", "____" -> "_").
    without_prefix = sanitized.lstrip("_")
    if len(sanitized) - len(without_prefix) >= 2:
        sanitized = "_" + without_prefix

    if is_dataframe_column and sanitized == "_index":
        return "index"

    # Defensive fallback: never return an empty string
    return sanitized or "unnamed"
91+
92+
93+
def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None:
    """
    Sanitize all keys in an AnnData table to comply with SpatialData naming rules.

    This function sanitizes the column names of obs and var and the keys of obsm, obsp,
    varm, varp, uns, and layers while maintaining case-insensitive uniqueness within each
    of these attributes. It can either modify the table in-place or return a new
    sanitized copy.

    Names are processed in iteration order: when two names of the same attribute map to
    the same sanitized name (ignoring case), the first one keeps it and later ones get a
    numeric suffix ("_1", "_2", ...).

    See a discussion on the naming rules here:
    https://github.com/scverse/spatialdata/discussions/707

    Parameters
    ----------
    data
        The AnnData table to sanitize
    inplace
        Whether to modify the table in-place or return a new copy

    Returns
    -------
    If inplace is False, returns a new AnnData object with sanitized keys.
    If inplace is True, returns None as the original object is modified.

    Examples
    --------
    >>> import anndata as ad
    >>> import pandas as pd
    >>> adata = ad.AnnData(obs=pd.DataFrame({"@invalid#": [1, 2]}))
    >>> # Create a new sanitized copy
    >>> sanitized = sanitize_table(adata, inplace=False)
    >>> print(sanitized.obs.columns)
    Index(['_invalid_'], dtype='object')
    >>> # Or modify in-place
    >>> sanitize_table(adata, inplace=True)
    >>> print(adata.obs.columns)
    Index(['_invalid_'], dtype='object')
    """
    import copy
    from collections import defaultdict

    # Create a deep copy if not modifying in-place
    sanitized = data if inplace else copy.deepcopy(data)

    # Lower-cased names already handed out, tracked separately per attribute
    used_names_lower: dict[str, set[str]] = defaultdict(set)

    def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str:
        """Sanitize ``name`` and append "_<n>" until it is case-insensitively unique in ``attr``."""
        base_name = sanitize_name(name, is_dataframe_column)
        taken = used_names_lower[attr]

        candidate = base_name
        counter = 0
        while candidate.lower() in taken:
            counter += 1
            candidate = f"{base_name}_{counter}"

        taken.add(candidate.lower())
        return candidate

    # Handle obs and var (dataframe columns)
    for attr in ("obs", "var"):
        df = getattr(sanitized, attr)
        new_columns = {old: get_unique_name(old, attr, is_dataframe_column=True) for old in df.columns}
        df.rename(columns=new_columns, inplace=True)

    # Handle the mapping-like attributes: rebuild each one under sanitized keys
    for attr in ("obsm", "obsp", "varm", "varp", "uns", "layers"):
        d = getattr(sanitized, attr)
        new_dict = {get_unique_name(old, attr): value for old, value in d.items()}
        setattr(sanitized, attr, new_dict)

    return None if inplace else sanitized

src/spatialdata/_core/validation.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,5 +379,8 @@ def __exit__(
379379
return False
380380
# Exceptions were collected that we want to raise as a combined validation error.
381381
if self._collector.errors:
382-
raise ValidationError(title=self._message, errors=self._collector.errors)
382+
raise ValidationError(
383+
title=self._message + "\nTo fix, run `spatialdata.utils.sanitize_table(adata)`.",
384+
errors=self._collector.errors,
385+
)
383386
return True

tests/utils/test_sanitize.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
from __future__ import annotations
2+
3+
import numpy as np
4+
import pandas as pd
5+
import pytest
6+
from anndata import AnnData
7+
8+
from spatialdata import SpatialData
9+
from spatialdata._core._utils import sanitize_name, sanitize_table
10+
11+
12+
@pytest.fixture
def invalid_table() -> AnnData:
    """Provide an AnnData whose obs columns mix invalid and valid names."""
    obs_columns = {
        "@invalid#": [1, 2],
        "valid_name": [3, 4],
        "__private": [5, 6],
    }
    obs_frame = pd.DataFrame(obs_columns)
    return AnnData(obs=obs_frame)
24+
25+
26+
@pytest.fixture
def invalid_table_with_index() -> AnnData:
    """Provide an AnnData with a whitespace-containing obs column and an "_index" column."""
    obs_columns = {
        "invalid name": [1, 2],
        "_index": [3, 4],
    }
    obs_frame = pd.DataFrame(obs_columns)
    return AnnData(obs=obs_frame)
37+
38+
39+
# -----------------------------------------------------------------------------
40+
# sanitize_name tests
41+
# -----------------------------------------------------------------------------
42+
43+
44+
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("valid_name", "valid_name"),
        ("valid-name", "valid-name"),
        ("valid.name", "valid.name"),
        ("invalid@name", "invalid_name"),
        ("invalid#name", "invalid_name"),
        ("invalid name", "invalid_name"),
        ("", "unnamed"),
        (".", "unnamed"),
        ("..", "unnamed"),
        ("__", "_"),
        ("___", "_"),
        ("____#@$@", "_"),
        ("__private", "_private"),
    ],
)
def test_sanitize_name_strips_special_chars(raw, expected):
    """Each raw name must be sanitized to the expected name."""
    result = sanitize_name(raw)
    assert result == expected
64+
65+
66+
@pytest.mark.parametrize(
    ("raw", "is_df_col", "expected"),
    [
        ("_index", True, "index"),
        ("_index", False, "_index"),
        ("valid@column", True, "valid_column"),
        ("__private", True, "_private"),
    ],
)
def test_sanitize_name_dataframe_column(raw, is_df_col, expected):
    """The dataframe-column flag only changes the outcome for "_index"."""
    result = sanitize_name(raw, is_dataframe_column=is_df_col)
    assert result == expected
77+
78+
79+
# -----------------------------------------------------------------------------
80+
# sanitize_table basic behaviors
81+
# -----------------------------------------------------------------------------
82+
83+
84+
def test_sanitize_table_basic_columns(invalid_table, invalid_table_with_index):
    """Sanitizing a copy fixes the obs column names and leaves the input untouched."""
    first_copy = sanitize_table(invalid_table, inplace=False)
    assert isinstance(first_copy, AnnData)
    first_columns = list(first_copy.obs.columns)
    assert first_columns == ["_invalid_", "valid_name", "_private"]

    second_copy = sanitize_table(invalid_table_with_index, inplace=False)
    second_columns = list(second_copy.obs.columns)
    assert second_columns == ["invalid_name", "index"]

    # the table handed in by the fixture must still carry its original names
    original_columns = list(invalid_table.obs.columns)
    assert original_columns == ["@invalid#", "valid_name", "__private"]
94+
95+
96+
def test_sanitize_table_inplace_copy(invalid_table):
    """Calling sanitize_table without arguments modifies the given table in place."""
    table = invalid_table.copy()
    sanitize_table(table)  # relies on the default inplace=True
    renamed = list(table.obs.columns)
    assert renamed == ["_invalid_", "valid_name", "_private"]
100+
101+
102+
def test_sanitize_table_case_insensitive_collisions():
    """Columns that differ only by case receive numeric suffixes."""
    obs_frame = pd.DataFrame(
        {
            "Column1": [1, 2],
            "column1": [3, 4],
            "COLUMN1": [5, 6],
        }
    )
    result = sanitize_table(AnnData(obs=obs_frame), inplace=False)
    expected = ["Column1", "column1_1", "COLUMN1_2"]
    assert sorted(list(result.obs.columns)) == sorted(expected)
114+
115+
116+
def test_sanitize_table_whitespace_collision():
    """Ensure 'a b' → 'a_b' doesn't collide silently with existing 'a_b'."""
    obs_frame = pd.DataFrame({"a b": [1], "a_b": [2]})
    table = AnnData(obs=obs_frame)
    result = sanitize_table(table, inplace=False)
    column_names = list(result.obs.columns)
    assert "a_b" in column_names
    assert "a_b_1" in column_names
124+
125+
126+
# -----------------------------------------------------------------------------
127+
# sanitize_table attribute‐specific tests
128+
# -----------------------------------------------------------------------------
129+
130+
131+
def test_sanitize_table_obs_and_obs_columns():
    """An invalid obs column name is replaced by its sanitized form."""
    obs_frame = pd.DataFrame({"@col": [1, 2]})
    result = sanitize_table(AnnData(obs=obs_frame), inplace=False)
    assert list(result.obs.columns) == ["_col"]
135+
136+
137+
def test_sanitize_table_obsm_and_obsp():
    """Keys of obsm and obsp are sanitized."""
    table = AnnData(obs=pd.DataFrame({"@col": [1, 2]}))
    table.obsm["@col"] = np.array([[1, 2], [3, 4]])
    table.obsp["bad name"] = np.array([[1, 2], [3, 4]])

    result = sanitize_table(table, inplace=False)

    obsm_keys = list(result.obsm.keys())
    obsp_keys = list(result.obsp.keys())
    assert obsm_keys == ["_col"]
    assert obsp_keys == ["bad_name"]
144+
145+
146+
def test_sanitize_table_varm_and_varp():
    """Keys of varm and varp are sanitized; "_index" is kept outside dataframe columns."""
    obs_frame = pd.DataFrame({"x": [1, 2]})
    var_frame = pd.DataFrame(index=["v1", "v2"])
    table = AnnData(obs=obs_frame, var=var_frame)
    table.varm["__priv"] = np.array([[1, 2], [3, 4]])
    table.varp["_index"] = np.array([[1, 2], [3, 4]])

    result = sanitize_table(table, inplace=False)

    varm_keys = list(result.varm.keys())
    varp_keys = list(result.varp.keys())
    assert varm_keys == ["_priv"]
    assert varp_keys == ["_index"]
153+
154+
155+
def test_sanitize_table_uns_and_layers():
    """Keys of uns and layers are sanitized."""
    obs_frame = pd.DataFrame({"x": [1, 2]})
    var_frame = pd.DataFrame(index=["v1", "v2"])
    table = AnnData(obs=obs_frame, var=var_frame)
    table.uns["bad@key"] = "val"
    table.layers["bad#layer"] = np.array([[0, 1], [1, 0]])

    result = sanitize_table(table, inplace=False)

    uns_keys = list(result.uns.keys())
    layer_keys = list(result.layers.keys())
    assert uns_keys == ["bad_key"]
    assert layer_keys == ["bad_layer"]
162+
163+
164+
def test_sanitize_table_empty_returns_empty():
    """Sanitizing an empty AnnData yields an AnnData with empty obs and var."""
    result = sanitize_table(AnnData(), inplace=False)
    assert isinstance(result, AnnData)
    assert result.obs.empty
    assert result.var.empty
170+
171+
172+
def test_sanitize_table_preserves_underlying_data():
    """Renaming keys must not alter the values stored under them."""
    obs_frame = pd.DataFrame({"@invalid#": [1, 2], "valid": [3, 4]})
    table = AnnData(obs=obs_frame)
    table.obsm["@invalid#"] = np.array([[1, 2], [3, 4]])
    table.uns["invalid@key"] = "value"

    result = sanitize_table(table, inplace=False)

    assert result.obs["_invalid_"].tolist() == [1, 2]
    assert result.obs["valid"].tolist() == [3, 4]
    expected_matrix = np.array([[1, 2], [3, 4]])
    assert np.array_equal(result.obsm["_invalid_"], expected_matrix)
    assert result.uns["invalid_key"] == "value"
181+
182+
183+
# -----------------------------------------------------------------------------
184+
# SpatialData integration
185+
# -----------------------------------------------------------------------------
186+
187+
188+
def test_sanitize_table_in_spatialdata_sanitized_fixture(invalid_table, invalid_table_with_index):
    """Tables sanitized in place can be stored in a SpatialData object and keep their new names."""
    first = invalid_table.copy()
    second = invalid_table_with_index.copy()
    sanitize_table(first)
    sanitize_table(second)
    sdata_sanitized_tables = SpatialData(tables={"table1": first, "table2": second})

    stored_first = sdata_sanitized_tables.tables["table1"]
    stored_second = sdata_sanitized_tables.tables["table2"]
    assert list(stored_first.obs.columns) == ["_invalid_", "valid_name", "_private"]
    assert list(stored_second.obs.columns) == ["invalid_name", "index"]
199+
200+
201+
def test_spatialdata_retains_other_elements(full_sdata):
    """A table sanitized in place can be added to an existing SpatialData object."""
    # Build a table with one invalid column name and sanitize it in place
    obs_frame = pd.DataFrame({"@foo#": [1, 2], "bar": [3, 4]})
    new_table = AnnData(obs=obs_frame)
    sanitize_table(new_table)
    full_sdata.tables["new_table"] = new_table

    # The stored table must expose the sanitized column names
    stored_columns = list(full_sdata.tables["new_table"].obs.columns)
    assert stored_columns == ["_foo_", "bar"]

0 commit comments

Comments
 (0)