push/pull: replace dict holding column information with custom class

ilia-kats · ilia-kats · commit 2b26ebf6fc69 · 2025-10-13T11:26:19.000+02:00
diff --git a/src/mudata/_core/mudata.py b/src/mudata/_core/mudata.py
@@ -27,8 +27,8 @@
 from .file_backing import MuDataFileManager
 from .repr import MUDATA_CSS, block_matrix, details_block_table
 from .utils import (
+    MetadataColumn,
     _classify_attr_columns,
-    _classify_prefixed_columns,
     _make_index_unique,
     _maybe_coerce_to_bool,
     _maybe_coerce_to_boolean,
@@ -1940,9 +1940,7 @@ def _pull_attr(
             # - column -> [modname1:column, modname2:column, ...]
             cols = {
                 prefix: [
-                    col
-                    for col in modcols
-                    if col["name"] in columns or col["derived_name"] in columns
+                    col for col in modcols if col.name in columns or col.derived_name in columns
                 ]
                 for prefix, modcols in cols.items()
             }
@@ -1960,15 +1958,15 @@ def _pull_attr(
 
             selector = {"common": common, "nonunique": nonunique, "unique": unique}
             cols = {
-                prefix: [col for col in modcols if selector[col["class"]]]
+                prefix: [col for col in modcols if selector[col.klass]]
                 for prefix, modcols in cols.items()
             }
 
         if mods is not None:
             cols = {prefix: cols[prefix] for prefix in mods}
 
         derived_name_count = Counter(
-            [col["derived_name"] for modcols in cols.values() for col in modcols]
+            [col.derived_name for modcols in cols.values() for col in modcols]
         )
 
         # - axis == self.axis
@@ -2006,24 +2004,24 @@ def _pull_attr(
             mod_map = attrmap[m].ravel()
             mask = mod_map > 0
 
-            mod_df = getattr(mod, attr)[[col["derived_name"] for col in modcols]]
+            mod_df = getattr(mod, attr)[[col.derived_name for col in modcols]]
             if drop:
                 getattr(mod, attr).drop(columns=mod_df.columns, inplace=True)
 
             mod_df.rename(
                 columns={
-                    col["derived_name"]: col["name"]
+                    col.derived_name: col.name
                     for col in modcols
                     if not (
                         (
                             join_common
-                            and col["class"] == "common"
+                            and col.klass == "common"
                             or join_nonunique
-                            and col["class"] == "nonunique"
+                            and col.klass == "nonunique"
                             or not prefix_unique
-                            and col["class"] == "unique"
+                            and col.klass == "unique"
                         )
-                        and derived_name_count[col["derived_name"]] == col["count"]
+                        and derived_name_count[col.derived_name] == col.count
                     )
                 },
                 inplace=True,
@@ -2244,7 +2242,10 @@ def _push_attr(
         if only_drop:
             drop = True
 
-        cols = _classify_prefixed_columns(getattr(self, attr).columns.values, self.mod.keys())
+        cols = [
+            MetadataColumn(allowed_prefixes=self.mod.keys(), name=name)
+            for name in getattr(self, attr).columns
+        ]
 
         if columns is not None:
             for k, v in {"common": common, "prefixed": prefixed}.items():
@@ -2261,23 +2262,23 @@ def _push_attr(
             cols = [
                 col
                 for col in cols
-                if (col["name"] in columns or col["derived_name"] in columns)
-                and (col["prefix"] == "" or mods is not None and col["prefix"] in mods)
+                if (col.name in columns or col.derived_name in columns)
+                and (col.prefix is None or mods is not None and col.prefix in mods)
             ]
         else:
             if common is None:
                 common = True
             if prefixed is None:
                 prefixed = True
 
-            selector = {"common": common, "prefixed": prefixed}
+            selector = {"common": common, "unknown": prefixed}
 
-            cols = [col for col in cols if selector[col["class"]]]
+            cols = [col for col in cols if selector[col.klass]]
 
         if len(cols) == 0:
             return
 
-        derived_name_count = Counter([col["derived_name"] for col in cols])
+        derived_name_count = Counter([col.derived_name for col in cols])
         for c, count in derived_name_count.items():
             # if count > 1, there are both colname and modname:colname present
             if count > 1 and c in getattr(self, attr).columns:
@@ -2299,9 +2300,9 @@ def _push_attr(
             mask = mod_map != 0
             mod_n_attr = mod.n_vars if attr == "var" else mod.n_obs
 
-            mod_cols = [col for col in cols if col["prefix"] == m or col["class"] == "common"]
-            df = getattr(self, attr)[mask].loc[:, [col["name"] for col in mod_cols]]
-            df.columns = [col["derived_name"] for col in mod_cols]
+            mod_cols = [col for col in cols if col.prefix == m or col.klass == "common"]
+            df = getattr(self, attr)[mask].loc[:, [col.name for col in mod_cols]]
+            df.columns = [col.derived_name for col in mod_cols]
 
             df = df.iloc[np.argsort(mod_map[mask])].set_index(np.arange(mod_n_attr))
 
@@ -2316,7 +2317,7 @@ def _push_attr(
 
         if drop:
             for col in cols:
-                getattr(self, attr).drop(col["name"], axis=1, inplace=True)
+                getattr(self, attr).drop(col.name, axis=1, inplace=True)
 
     def push_obs(
         self,
diff --git a/src/mudata/_core/utils.py b/src/mudata/_core/utils.py
@@ -1,6 +1,6 @@
 from collections import Counter
 from collections.abc import Mapping, Sequence
-from typing import TypeVar
+from typing import Literal, TypeVar
 
 import numpy as np
 import pandas as pd
@@ -38,7 +38,56 @@ def _maybe_coerce_to_boolean(df: T) -> T:
     return df
 
 
-def _classify_attr_columns(names: Mapping[str, Sequence[str]]) -> dict[str, list[dict[str, str]]]:
+class MetadataColumn:  # one column name, split into an optional prefix and an unprefixed name
+    __slots__ = ("prefix", "derived_name", "count", "_allowed_prefixes")  # `name` is a property
+
+    def __init__(
+        self,
+        *,
+        allowed_prefixes: Sequence[str],  # only these are recognised as prefixes when parsing
+        prefix: str | None = None,  # if given, `name` is stored as-is as the unprefixed name
+        name: str | None = None,  # if `prefix` is None, parsed as "prefix:rest" by the setter
+        count: int = 0,  # compared against 1 and len(allowed_prefixes) in `klass`
+    ):
+        self._allowed_prefixes = allowed_prefixes
+        if prefix is None:
+            self.name = name  # property setter: assigns both `prefix` and `derived_name`
+        else:
+            self.prefix = prefix
+            self.derived_name = name
+        self.count = count
+
+    @property
+    def name(self) -> str:  # "prefix:derived_name", or just derived_name when prefix is None
+        if self.prefix is not None:
+            return f"{self.prefix}:{self.derived_name}"
+        else:
+            return self.derived_name
+
+    @name.setter
+    def name(self, new_name):  # splits at the first ":" only; a None name fails on .split
+        if (
+            len(name_split := new_name.split(":", 1)) < 2  # no ":" in the name at all
+            or name_split[0] not in self._allowed_prefixes  # text before ":" is not a prefix
+        ):
+            self.prefix = None  # the whole string, including any ":", is the unprefixed name
+            self.derived_name = new_name
+        else:
+            self.prefix, self.derived_name = name_split
+
+    @property
+    def klass(self) -> Literal["common", "unique", "nonunique", "unknown"]:
+        if self.prefix is None or self.count == len(self._allowed_prefixes):
+            return "common"  # unprefixed, or count equals the number of allowed prefixes
+        elif self.count == 1:
+            return "unique"
+        elif self.count > 0:
+            return "nonunique"  # count is above 1 here and differs from len(allowed_prefixes)
+        else:
+            return "unknown"  # prefixed with count <= 0; 0 is the constructor default
+
+
+def _classify_attr_columns(names: Mapping[str, Sequence[str]]) -> dict[str, list[MetadataColumn]]:
     """
     Classify names into common, non-unique, and unique
     w.r.t. to the list of prefixes.
@@ -50,72 +99,21 @@ def _classify_attr_columns(names: Mapping[str, Sequence[str]]) -> dict[str, list
     - Unique columns are prefixed by modality names,
       and there is only one modality prefix
       for a column with a certain name.
-
-    E.g. {"mod1": ["annotation", "unique"], "mod2": ["annotation"]} will be classified
-    into {"mod1": [{"name": "mod1:annotation", "derived_name": "annotation", "count": 2, "class": "nonunique"},
-                   {"name": "mod1:unique", "derived_name": "unique", "count": 1, "class": "unique"}}],
-          "mod2": [{"name": "mod2:annotation", "derived_name": "annotation", "count": 2, "class": "nonunique"}],
-         }
     """
-    n_mod = len(names)
-    res: dict[str, list[dict[str, str]]] = {}
+    res: dict[str, list[MetadataColumn]] = {}
 
     derived_name_counts = Counter()
-    for prefix, names in names.items():
+    for prefix, pnames in names.items():
         cres = []
-        for name in names:
-            cres.append(
-                {
-                    "name": f"{prefix}:{name}",
-                    "derived_name": name,
-                }
-            )
+        for name in pnames:
+            cres.append(MetadataColumn(allowed_prefixes=names.keys(), prefix=prefix, name=name))
             derived_name_counts[name] += 1
         res[prefix] = cres
 
     for prefix, names in res.items():
         for name_res in names:
-            count = derived_name_counts[name_res["derived_name"]]
-            name_res["count"] = count
-            name_res["class"] = (
-                "common" if count == n_mod else "unique" if count == 1 else "nonunique"
-            )
-
-    return res
-
-
-def _classify_prefixed_columns(
-    names: Sequence[str], prefixes: Sequence[str]
-) -> Sequence[dict[str, str]]:
-    """
-    Classify names into common and prefixed
-    w.r.t. to the list of prefixes.
-
-    - Common columns do not have modality prefixes.
-    - Prefixed columns are prefixed by modality names.
-
-    E.g. ["global", "mod1:annotation", "mod2:annotation", "mod1:unique"] will be classified
-    into [
-        {"name": "global", "prefix": "", "derived_name": "global", "class": "common"},
-        {"name": "mod1:annotation", "prefix": "mod1", "derived_name": "annotation", "class": "prefixed"},
-        {"name": "mod2:annotation", "prefix": "mod2", "derived_name": "annotation", "class": "prefixed"},
-        {"name": "mod1:unique", "prefix": "mod1", "derived_name": "annotation", "class": "prefixed"},
-    ]
-    """
-    res: list[dict[str, str]] = []
-
-    for name in names:
-        if len(name_split := name.split(":", 1)) < 2 or name_split[0] not in prefixes:
-            res.append({"name": name, "prefix": "", "derived_name": name, "class": "common"})
-        else:
-            res.append(
-                {
-                    "name": name,
-                    "prefix": name_split[0],
-                    "derived_name": name_split[1],
-                    "class": "prefixed",
-                }
-            )
+            count = derived_name_counts[name_res.derived_name]
+            name_res.count = count
 
     return res