push_attr fixes

ilia-kats · ilia-kats · commit ca406d75b24c · 2025-10-10T09:07:45.000+02:00
- don't raise an exception if mods is used together with common, nonunique,
  or unique: nothing in the logic prevents this combination
- don't raise if columns is used together with common, nonunique, or
  unique; emit a warning instead (columns takes precedence)
- fix the row ordering of pushed columns
- minor code cleanup
diff --git a/src/mudata/_core/mudata.py b/src/mudata/_core/mudata.py
@@ -1920,32 +1920,33 @@ def _pull_attr(
                 raise ValueError("All mods should be present in mdata.mod")
             elif len(mods) == self.n_mod:
                 mods = None
-            for k, v in {"common": common, "nonunique": nonunique, "unique": unique}.items():
-                assert v is None, f"Cannot use mods with {k}."
 
         if only_drop:
             drop = True
 
         cols = _classify_attr_columns(
-            np.concatenate(
-                [
-                    [f"{m}:{val}" for val in getattr(mod, attr).columns.values]
-                    for m, mod in self.mod.items()
-                ]
-            ),
-            self.mod.keys(),
+            {modname: getattr(mod, attr).columns for modname, mod in self.mod.items()}
         )
 
         if columns is not None:
             for k, v in {"common": common, "nonunique": nonunique, "unique": unique}.items():
-                assert v is None, f"Cannot use {k} with columns."
+                if v is not None:
+                    warnings.warn(
+                        f"Both columns and {k} given. Columns take precedence, {k} will be ignored",
+                        RuntimeWarning,
+                        stacklevel=2,
+                    )
 
             # - modname1:column -> [modname1:column]
             # - column -> [modname1:column, modname2:column, ...]
-            cols = [col for col in cols if col["name"] in columns or col["derived_name"] in columns]
-
-            if mods is not None:
-                cols = [col for col in cols if col["prefix"] in mods]
+            cols = {
+                prefix: [
+                    col
+                    for col in modcols
+                    if col["name"] in columns or col["derived_name"] in columns
+                ]
+                for prefix, modcols in cols.items()
+            }
 
             # TODO: Counter for columns in order to track their usage
             # and error out if some columns were not used
@@ -1959,10 +1960,17 @@ def _pull_attr(
                 unique = True
 
             selector = {"common": common, "nonunique": nonunique, "unique": unique}
+            cols = {
+                prefix: [col for col in modcols if selector[col["class"]]]
+                for prefix, modcols in cols.items()
+            }
 
-            cols = [col for col in cols if selector[col["class"]]]
+        if mods is not None:
+            cols = {prefix: cols[prefix] for prefix in mods}
 
-        derived_name_count = Counter([col["derived_name"] for col in cols])
+        derived_name_count = Counter(
+            [col["derived_name"] for modcols in cols.values() for col in modcols]
+        )
 
         # - axis == self.axis
         #   e.g. combine var from multiple modalities (with unique vars)
@@ -1995,44 +2003,36 @@ def _pull_attr(
         n_attr = self.n_vars if attr == "var" else self.n_obs
 
         dfs: list[pd.DataFrame] = []
-        for m, mod in self.mod.items():
-            if mods is not None and m not in mods:
-                continue
+        for m, modcols in cols.items():
+            mod = self.mod[m]
             mod_map = attrmap[m].ravel()
-            mod_n_attr = mod.n_vars if attr == "var" else mod.n_obs
-            mask = mod_map != 0
-
-            mod_df = getattr(mod, attr)
-            mod_columns = [
-                col["derived_name"] for col in cols if col["prefix"] == "" or col["prefix"] == m
-            ]
-            mod_df = mod_df[mod_df.columns.intersection(mod_columns)]
+            mask = mod_map > 0
 
+            mod_df = getattr(mod, attr)[[col["derived_name"] for col in modcols]]
             if drop:
                 getattr(mod, attr).drop(columns=mod_df.columns, inplace=True)
 
-            # Don't use modname: prefix if columns need to be joined
-            if join_common or join_nonunique or (not prefix_unique):
-                cols_special = [
-                    col["derived_name"]
-                    for col in cols
-                    if (
-                        (col["class"] == "common") & join_common
-                        or (col["class"] == "nonunique") & join_nonunique
-                        or (col["class"] == "unique") & (not prefix_unique)
+            mod_df.rename(
+                columns={
+                    col["derived_name"]: col["name"]
+                    for col in modcols
+                    if not (
+                        (
+                            join_common
+                            and col["class"] == "common"
+                            or join_nonunique
+                            and col["class"] == "nonunique"
+                            or not prefix_unique
+                            and col["class"] == "unique"
+                        )
+                        and derived_name_count[col["derived_name"]] == col["count"]
                     )
-                    and col["prefix"] == m
-                    and derived_name_count[col["derived_name"]] == col["count"]
-                ]
-                mod_df.columns = [
-                    col if col in cols_special else f"{m}:{col}" for col in mod_df.columns
-                ]
-            else:
-                mod_df.columns = [f"{m}:{col}" for col in mod_df.columns]
+                },
+                inplace=True,
+            )
 
             mod_df = (
                 _maybe_coerce_to_boolean(mod_df)
-                .set_index(np.arange(mod_n_attr))
                 .iloc[mod_map[mask] - 1]
                 .set_index(np.arange(n_attr)[mask])
                 .reindex(np.arange(n_attr))
@@ -2297,19 +2297,15 @@ def _push_attr(
             if mods is not None and m not in mods:
                 continue
 
-            mod_map = attrmap[m]
+            mod_map = attrmap[m].ravel()
             mask = mod_map != 0
             mod_n_attr = mod.n_vars if attr == "var" else mod.n_obs
 
             mod_cols = [col for col in cols if col["prefix"] == m or col["class"] == "common"]
             df = getattr(self, attr)[mask].loc[:, [col["name"] for col in mod_cols]]
             df.columns = [col["derived_name"] for col in mod_cols]
 
-            df = (
-                df.set_index(np.arange(mod_n_attr))
-                .iloc[mod_map[mask] - 1]
-                .set_index(np.arange(mod_n_attr))
-            )
+            df = df.iloc[np.argsort(mod_map[mask])].set_index(np.arange(mod_n_attr))
 
             if not only_drop:
                 # TODO: _maybe_coerce_to_bool
diff --git a/src/mudata/_core/utils.py b/src/mudata/_core/utils.py
@@ -1,5 +1,5 @@
 from collections import Counter
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from typing import TypeVar
 
 import numpy as np
@@ -38,9 +38,7 @@ def _maybe_coerce_to_boolean(df: T) -> T:
     return df
 
 
-def _classify_attr_columns(
-    names: Sequence[str], prefixes: Sequence[str]
-) -> Sequence[dict[str, str]]:
+def _classify_attr_columns(names: Mapping[str, Sequence[str]]) -> dict[str, list[dict[str, str]]]:
     """
     Classify names into common, non-unique, and unique
     w.r.t. to the list of prefixes.
@@ -53,50 +51,35 @@ def _classify_attr_columns(
       and there is only one modality prefix
       for a column with a certain name.
 
-    E.g. ["global", "mod1:annotation", "mod2:annotation", "mod1:unique"] will be classified
-    into [
-        {"name": "global", "prefix": "", "derived_name": "global", "count": 1, "class": "common"},
-        {"name": "mod1:annotation", "prefix": "mod1", "derived_name": "annotation", "count": 2, "class": "nonunique"},
-        {"name": "mod2:annotation", "prefix": "mod2", "derived_name": "annotation", "count": 2, "class": "nonunique"},
-        {"name": "mod1:unique", "prefix": "mod1", "derived_name": "annotation", "count": 2, "class": "unique"},
-    ]
+    E.g. {"mod1": ["annotation", "unique"], "mod2": ["annotation"], "mod3": []} will be classified
+    into {"mod1": [{"name": "mod1:annotation", "derived_name": "annotation", "count": 2, "class": "nonunique"},
+                   {"name": "mod1:unique", "derived_name": "unique", "count": 1, "class": "unique"}],
+          "mod2": [{"name": "mod2:annotation", "derived_name": "annotation", "count": 2, "class": "nonunique"}],
+          "mod3": []}
     """
-    n_mod = len(prefixes)
-    res: list[dict[str, str]] = []
-
-    for name in names:
-        name_common = {
-            "name": name,
-            "prefix": "",
-            "derived_name": name,
-        }
-        name_split = name.split(":", 1)
-
-        if len(name_split) < 2:
-            res.append(name_common)
-        else:
-            maybe_modname, derived_name = name_split
-
-            if maybe_modname in prefixes:
-                name_prefixed = {
-                    "name": name,
-                    "prefix": maybe_modname,
-                    "derived_name": derived_name,
+    n_mod = len(names)
+    res: dict[str, list[dict[str, str]]] = {}
+
+    derived_name_counts = Counter()
+    for prefix, names in names.items():
+        cres = []
+        for name in names:
+            cres.append(
+                {
+                    "name": f"{prefix}:{name}",
+                    "derived_name": name,
                 }
-                res.append(name_prefixed)
-            else:
-                res.append(name_common)
-
-    derived_name_counts = Counter(name_res["derived_name"] for name_res in res)
-    for name_res in res:
-        name_res["count"] = derived_name_counts[name_res["derived_name"]]
-
-    for name_res in res:
-        name_res["class"] = (
-            "common"
-            if name_res["count"] == n_mod
-            else "unique" if name_res["count"] == 1 else "nonunique"
-        )
+            )
+            derived_name_counts[name] += 1
+        res[prefix] = cres
+
+    for prefix, names in res.items():
+        for name_res in names:
+            count = derived_name_counts[name_res["derived_name"]]
+            name_res["count"] = count
+            name_res["class"] = (
+                "common" if count == n_mod else "unique" if count == 1 else "nonunique"
+            )
 
     return res
 
@@ -138,7 +121,7 @@ def _classify_prefixed_columns(
 
 
 def _update_and_concat(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
-    df = df1.copy()
+    df = df1.copy(deep=False)
     # This converts boolean to object dtype, unfortunately
     # df.update(df2)
     common_cols = df1.columns.intersection(df2.columns)
diff --git a/tests/test_pull_push.py b/tests/test_pull_push.py
@@ -5,7 +5,7 @@
 import pytest
 from anndata import AnnData
 
-from mudata import MuData
+from mudata import MuData, set_options
 
 
 @pytest.fixture()
@@ -21,7 +21,8 @@ def modalities(request, obs_n, var_unique):
         mods[m].var["mod"] = m
 
         # common column
-        mods[m].var["highly_variable"] = np.tile([False, True], mods[m].n_vars // 2)
+        mods[m].var["highly_variable"] = np.random.choice([False, True], size=mods[m].n_vars)
+        mods[m].obs["common_obs_col"] = np.random.randint(0, int(1e6), size=mods[m].n_obs)
 
         if var_unique:
             mods[m].var_names = [f"mod{m}_var{j}" for j in range(mods[m].n_vars)]
@@ -88,7 +89,6 @@ def test_pull_var(self, modalities):
         """
         mdata = MuData(modalities)
         mdata.update()
-
         mdata.pull_var()
 
         assert "mod" in mdata.var.columns
@@ -165,6 +165,15 @@ def test_pull_obs_simple(self, modalities):
         for m in mdata.mod.keys():
             assert f"{m}:mod" in mdata.obs.columns
 
+            assert f"{m}:common_obs_col" in mdata.obs.columns
+
+            modmap = mdata.obsmap[m].ravel()
+            mask = modmap > 0
+            assert (
+                mdata.obs[f"{m}:common_obs_col"][mask].to_numpy()
+                == mdata.mod[m].obs["common_obs_col"].to_numpy()[modmap[mask] - 1]
+            ).all()
+
         # join_common shouldn't work
         with pytest.raises(ValueError, match="shared obs_names"):
             mdata.pull_obs(join_common=True)
@@ -182,14 +191,24 @@ def test_push_var_simple(self, modalities):
         mdata = MuData(modalities)
         mdata.update()
 
-        mdata.var["pushed"] = True
-        mdata.var["mod2:mod2_pushed"] = True
+        mdata.var["pushed"] = np.random.randint(0, int(1e6), size=mdata.n_var)
+        mdata.var["mod2:mod2_pushed"] = np.random.randint(0, int(1e6), size=mdata.n_var)
         mdata.push_var()
 
         # pushing should work
-        for mod in mdata.mod.values():
+        for modname, mod in mdata.mod.items():
             assert "pushed" in mod.var.columns
+
+            map = mdata.varmap[modname].ravel()
+            mask = map > 0
+            assert (mdata.var["pushed"][mask] == mod.var["pushed"][map[mask] - 1]).all()
+
         assert "mod2_pushed" in mdata["mod2"].var.columns
+        map = mdata.varmap["mod2"].ravel()
+        mask = map > 0
+        assert (
+            mdata.var["mod2:mod2_pushed"][mask] == mdata["mod2"].var["mod2_pushed"][map[mask] - 1]
+        ).all()
 
     @pytest.mark.parametrize("var_unique", [True, False])
     @pytest.mark.parametrize("obs_n", ["joint", "disjoint"])
@@ -200,14 +219,24 @@ def test_push_obs_simple(self, modalities):
         mdata = MuData(modalities)
         mdata.update()
 
-        mdata.obs["pushed"] = True
-        mdata.obs["mod2:mod2_pushed"] = True
+        mdata.obs["pushed"] = np.random.randint(0, int(1e6), size=mdata.n_obs)
+        mdata.obs["mod2:mod2_pushed"] = np.random.randint(0, int(1e6), size=mdata.n_obs)
         mdata.push_obs()
 
         # pushing should work
-        for mod in mdata.mod.values():
+        for modname, mod in mdata.mod.items():
             assert "pushed" in mod.obs.columns
+
+            map = mdata.obsmap[modname].ravel()
+            mask = map > 0
+            assert (mdata.obs["pushed"][mask] == mod.obs["pushed"][map[mask] - 1]).all()
+
         assert "mod2_pushed" in mdata["mod2"].obs.columns
+        map = mdata.obsmap["mod2"].ravel()
+        mask = map > 0
+        assert (
+            mdata.obs["mod2:mod2_pushed"][mask] == mdata["mod2"].obs["mod2_pushed"][map[mask] - 1]
+        ).all()
 
 
 @pytest.mark.usefixtures("filepath_h5mu")