fix: support multiindexed and arbitrarily-named dimensions for grouping (#373)

FabianHofmann · web-flow · commit 62cdebb1bdf5 · 2024-11-06T16:01:31.000+01:00
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
@@ -6,6 +6,8 @@ Upcoming Version
 
 * When writing out an LP file, large variables and constraints are now chunked to avoid memory issues. This is especially useful for large models with constraints with many terms. The chunk size can be set with the `slice_size` argument in the `solve` function.
 * Constraints which of the form `<= infinity` and `>= -infinity` are now automatically filtered out when solving. The `solve` function now has a new argument `sanitize_infinities` to control this feature. Default is set to `True`.
+* Grouping expressions is now supported on dimensions called "group" and dimensions that have the same name as the grouping object.
+* Grouping dimensions which have multiindexed coordinates is now supported.
 
 Version 0.3.15
 --------------
diff --git a/linopy/constants.py b/linopy/constants.py
@@ -36,6 +36,7 @@
 TERM_DIM = "_term"
 STACKED_TERM_DIM = "_stacked_term"
 GROUPED_TERM_DIM = "_grouped_term"
+GROUP_DIM = "_group"
 FACTOR_DIM = "_factor"
 CONCAT_DIM = "_concat"
 HELPER_DIMS = [TERM_DIM, STACKED_TERM_DIM, GROUPED_TERM_DIM, FACTOR_DIM, CONCAT_DIM]
diff --git a/linopy/expressions.py b/linopy/expressions.py
@@ -62,6 +62,7 @@
     EQUAL,
     FACTOR_DIM,
     GREATER_EQUAL,
+    GROUP_DIM,
     GROUPED_TERM_DIM,
     HELPER_DIMS,
     LESS_EQUAL,
@@ -218,42 +219,43 @@ def sum(self, use_fallback: bool = False, **kwargs) -> LinearExpression:
             group: pd.Series | pd.DataFrame | xr.DataArray = self.group
             if isinstance(group, pd.DataFrame):
                 # dataframes do not have a name, so we need to set it
-                group_name = "group"
+                final_group_name = "group"
             else:
-                group_name = getattr(group, "name", "group") or "group"
+                final_group_name = getattr(group, "name", "group") or "group"
 
             if isinstance(group, DataArray):
                 group = group.to_pandas()
 
             int_map = None
             if isinstance(group, pd.DataFrame):
+                index_name = group.index.name
                 group = group.reindex(self.data.indexes[group.index.name])
+                group.index.name = index_name  # ensure name for multiindex
                 int_map = get_index_map(*group.values.T)
                 orig_group = group
                 group = group.apply(tuple, axis=1).map(int_map)
 
             group_dim = group.index.name
-            if group_name == group_dim:
-                raise ValueError(
-                    "Group name cannot be the same as group dimension in non-fallback mode."
-                )
 
             arrays = [group, group.groupby(group).cumcount()]
-            idx = pd.MultiIndex.from_arrays(
-                arrays, names=[group_name, GROUPED_TERM_DIM]
-            )
-            coords = Coordinates.from_pandas_multiindex(idx, group_dim)
-            ds = self.data.assign_coords(coords)
+            idx = pd.MultiIndex.from_arrays(arrays, names=[GROUP_DIM, GROUPED_TERM_DIM])
+            new_coords = Coordinates.from_pandas_multiindex(idx, group_dim)
+            coords = self.data.indexes[group_dim]
+            names_to_drop = [coords.name]
+            if isinstance(coords, pd.MultiIndex):
+                names_to_drop += list(coords.names)
+            ds = self.data.drop_vars(names_to_drop).assign_coords(new_coords)
             ds = ds.unstack(group_dim, fill_value=LinearExpression._fill_value)
             ds = LinearExpression._sum(ds, dim=GROUPED_TERM_DIM)
 
             if int_map is not None:
-                index = ds.indexes["group"].map({v: k for k, v in int_map.items()})
+                index = ds.indexes[GROUP_DIM].map({v: k for k, v in int_map.items()})
                 index.names = [str(col) for col in orig_group.columns]
-                index.name = group_name
-                coords = Coordinates.from_pandas_multiindex(index, group_name)
-                ds = xr.Dataset(ds.assign_coords(coords))
+                index.name = GROUP_DIM
+                new_coords = Coordinates.from_pandas_multiindex(index, GROUP_DIM)
+                ds = xr.Dataset(ds.assign_coords(new_coords))
 
+            ds = ds.rename({GROUP_DIM: final_group_name})
             return LinearExpression(ds, self.model)
 
         def func(ds):
@@ -1428,6 +1430,8 @@ def to_polars(self) -> pl.DataFrame:
 
     drop = exprwrap(Dataset.drop)
 
+    drop_vars = exprwrap(Dataset.drop_vars)
+
     drop_sel = exprwrap(Dataset.drop_sel)
 
     drop_isel = exprwrap(Dataset.drop_isel)
@@ -1452,6 +1456,8 @@ def to_polars(self) -> pl.DataFrame:
 
     rename = exprwrap(Dataset.rename)
 
+    reset_index = exprwrap(Dataset.reset_index)
+
     rename_dims = exprwrap(Dataset.rename_dims)
 
     roll = exprwrap(Dataset.roll)
diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py
@@ -668,6 +668,17 @@ def test_linear_expression_diff(v):
 
 @pytest.mark.parametrize("use_fallback", [True, False])
 def test_linear_expression_groupby(v, use_fallback):
+    expr = 1 * v
+    dim = v.dims[0]
+    groups = xr.DataArray([1] * 10 + [2] * 10, coords=v.coords, name=dim)
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    assert dim in grouped.dims
+    assert (grouped.data[dim] == [1, 2]).all()
+    assert grouped.nterm == 10
+
+
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_on_same_name_as_target_dim(v, use_fallback):
     expr = 1 * v
     groups = xr.DataArray([1] * 10 + [2] * 10, coords=v.coords)
     grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
@@ -719,20 +730,31 @@ def test_linear_expression_groupby_series_with_name(v, use_fallback):
 
 
 @pytest.mark.parametrize("use_fallback", [True, False])
-def test_linear_expression_groupby_with_series_false(v, use_fallback):
+def test_linear_expression_groupby_with_series_with_same_group_name(v, use_fallback):
+    """
+    Test that the group by works with a series whose name is the same as
+    the dimension to group.
+    """
     expr = 1 * v
     groups = pd.Series([1] * 10 + [2] * 10, index=v.indexes["dim_2"])
     groups.name = "dim_2"
-    if not use_fallback:
-        with pytest.raises(ValueError):
-            expr.groupby(groups).sum(use_fallback=use_fallback)
-        return
     grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
     assert "dim_2" in grouped.dims
     assert (grouped.data.dim_2 == [1, 2]).all()
     assert grouped.nterm == 10
 
 
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_series_on_multiindex(u, use_fallback):
+    expr = 1 * u
+    len_grouped_dim = len(u.data["dim_3"])
+    groups = pd.Series([1] * len_grouped_dim, index=u.indexes["dim_3"])
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    assert "group" in grouped.dims
+    assert (grouped.data.group == [1]).all()
+    assert grouped.nterm == len_grouped_dim
+
+
 @pytest.mark.parametrize("use_fallback", [True, False])
 def test_linear_expression_groupby_with_dataframe(v, use_fallback):
     expr = 1 * v
@@ -751,6 +773,45 @@ def test_linear_expression_groupby_with_dataframe(v, use_fallback):
     assert grouped.nterm == 3
 
 
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_dataframe_with_same_group_name(v, use_fallback):
+    """
+    Test that the group by works with a dataframe whose column name is the same as
+    the dimension to group.
+    """
+    expr = 1 * v
+    groups = pd.DataFrame(
+        {"dim_2": [1] * 10 + [2] * 10, "b": list(range(4)) * 5},
+        index=v.indexes["dim_2"],
+    )
+    if use_fallback:
+        with pytest.raises(ValueError):
+            expr.groupby(groups).sum(use_fallback=use_fallback)
+        return
+
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    index = pd.MultiIndex.from_frame(groups)
+    assert "group" in grouped.dims
+    assert set(grouped.data.group.values) == set(index.values)
+    assert grouped.nterm == 3
+
+
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_dataframe_on_multiindex(u, use_fallback):
+    expr = 1 * u
+    len_grouped_dim = len(u.data["dim_3"])
+    groups = pd.DataFrame({"a": [1] * len_grouped_dim}, index=u.indexes["dim_3"])
+
+    if use_fallback:
+        with pytest.raises(ValueError):
+            expr.groupby(groups).sum(use_fallback=use_fallback)
+        return
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    assert "group" in grouped.dims
+    assert isinstance(grouped.indexes["group"], pd.MultiIndex)
+    assert grouped.nterm == len_grouped_dim
+
+
 @pytest.mark.parametrize("use_fallback", [True, False])
 def test_linear_expression_groupby_with_dataarray(v, use_fallback):
     expr = 1 * v