Issue #1768 pandas3 support (#1771)

JoerivanEngelen · web-flow · commit c3e653b54182 · 2026-02-12T11:59:34.000+01:00
Fixes #1768 # Description Fixes broken pieces of the code when updating to pandas 3, which [has the following major changes](https://pandas.pydata.org/community/blog/pandas-3.0.html) that also affect iMOD Python somewhat: - _Dedicated string data type by default: string columns are now inferred as the new str dtype instead of object, providing better performance and type safety_ - _Consistent copy/view behaviour with Copy-on-Write (CoW) (a.k.a. getting rid of the SettingWithCopyWarning): more predictable and consistent behavior for all operations, with improved performance through avoiding unnecessary copies_ - _New default resolution for datetime-like data: no longer defaulting to nanoseconds, but generally microseconds (or the resolution of the input), when constructing datetime or timedelta data (avoiding out-of-bounds errors for dates with a year before 1678 or after 2262)_ Because of these changes, some parts of the code had to be modified slightly to get the tests to work. I think with the last change we could simplify the code base a lot, as all logic to deal with datetimes going beyond the year 2262 and prior to 1678 wouldn't be necessary anymore. This was quite a headache in the past, and I think the choice for microseconds by default will make our lives significantly easier. [I created an issue for this](#1773). In detail this PR alters the following: - Pin pandas version to 3.* in pixi.toml - Update setup and asserts in quite a few tests to work with pandas 3.0. Most of these tests will not work with pandas 2.0 anymore, as we now assert for pandas 3.0 behavior - Update examples to work with pandas 3.0 - Fix statements where a view was altered (e.g. altering ``.values`` attribute), this is now forbidden by pandas - Update to new string handling, sometimes I had to internally enforce object dtype to get the old behavior. In other cases we could no longer check for strings by checking ``dtype=="object"``. 
Luckily there is ``pd.api.types.is_string_dtype``, which works for both pandas 2 and pandas 3. - Add unittest for ``to_pandas_datetime_series`` to check whether pandas behavior is similar in this regard. # Checklist  - [x] Links to correct issue - [x] Update changelog, if changes affect users - [x] PR title starts with ``Issue #nr``, e.g. ``Issue #737`` - [x] Unit tests were added - [ ] **If feature added**: Added/extended example - [ ] **If feature added**: Added feature to API documentation - [ ] **If pixi.lock was changed**: Ran `pixi run generate-sbom` and committed changes
diff --git a/docs/api/changelog.rst b/docs/api/changelog.rst
@@ -44,6 +44,8 @@ Fixed
 - Fixed bug where :func:`imod.evaluate.convert_pointwaterhead_freshwaterhead`
   produced incorrect results when point water heads were below elevation levels
   for unstructured grids.
+- Support pandas 3.0.
+
 
 Changed
 ~~~~~~~
diff --git a/examples/imod-wq/Henry-wq.py b/examples/imod-wq/Henry-wq.py
@@ -102,7 +102,7 @@
 )
 m["oc"] = imod.wq.OutputControl(save_head_idf=True, save_concentration_idf=True)
 m.create_time_discretization(
-    additional_times=pd.date_range("2000-01-01", "2001-01-01", freq="M")
+    additional_times=pd.date_range("2000-01-01", "2001-01-01", freq="ME")
 )
 
 # %%
diff --git a/examples/mf6/circle_transport.py b/examples/mf6/circle_transport.py
@@ -194,7 +194,7 @@
 # stress period is a year long. We can then use the "last" keyword in the output
 # control to save the output.
 
-simtimes = pd.date_range(start="2000-01-01", end="2030-01-01", freq="As")
+simtimes = pd.date_range(start="2000-01-01", end="2030-01-01", freq="YS")
 simulation.create_time_discretization(additional_times=simtimes)
 
 # %%
diff --git a/examples/mf6/hondsrug.py b/examples/mf6/hondsrug.py
@@ -336,7 +336,7 @@ def outer_edge(da):
 # resampled to a yearly step by using the xarray function
 # `Dataset.resample <http://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html#xarray.Dataset.resample>`_.
 
-rch_trans_yr = rch_trans.resample(time="A", label="left").mean()
+rch_trans_yr = rch_trans.resample(time="YS", label="left").mean()
 rch_trans_yr
 
 # %%
diff --git a/imod/evaluate/constraints.py b/imod/evaluate/constraints.py
@@ -126,7 +126,8 @@ def stability_constraint_advection(front, lower, right, top_bot, porosity=0.3, R
     dt = 1.0 / (1.0 / dt_x + 1.0 / dt_y + 1.0 / dt_z)
 
     dt_xyz = xr.concat(
-        (dt_x, dt_y, dt_z), dim=pd.Index(["x", "y", "z"], name="direction")
+        (dt_x, dt_y, dt_z),
+        dim=pd.Index(["x", "y", "z"], name="direction", dtype="object"),
     )
     return dt, dt_xyz
 
@@ -254,9 +255,11 @@ def _get_stage_name(sid):
                 if not drop_allnan or not dt.isnull().all():
                     results.append(dt)
                     resultids.append(comb)
-    dt_all = xr.concat(
-        results, pd.Index(resultids, name="combination"), coords="minimal"
-    )
+    # Set index to object dtype to work around xarray concat issue where
+    # StringDtype could not be interpreted as a data type with pandas 3.0 (as
+    # np.dtype is called.)
+    id_index = pd.Index(resultids, name="combination", dtype="object")
+    dt_all = xr.concat(results, id_index, coords="minimal")
 
     # overall dt
     dt_min = dt_all.min(dim="combination")
diff --git a/imod/formats/gen/gen.py b/imod/formats/gen/gen.py
@@ -267,7 +267,7 @@ def read_binary(path: Union[str, Path]) -> "geopandas.GeoDataFrame":  # type: ig
     else:
         df = pd.DataFrame()
     df["feature_type"] = feature_type
-    df["feature_type"] = df["feature_type"].replace(GENTYPE_TO_NAME)
+    df["feature_type"] = df["feature_type"].replace(GENTYPE_TO_NAME).astype(str)
 
     geometry = []
     for ftype, geom in zip(feature_type, xy):
diff --git a/imod/formats/ipf.py b/imod/formats/ipf.py
@@ -458,7 +458,8 @@ def write_assoc(path, df, itype=1, nodata=1.0e20, assoc_columns=None):
     # The reason is that datetime columns are converted to string as well
     # and then quoted. This causes trouble with some iMOD(batch) functions.
     for column in df.columns:
-        if df.loc[:, column].dtype == np.dtype("O"):
+        # Test for strings compatible with pandas 2 and 3
+        if pd.api.types.is_string_dtype(df[column].dtype):
             df.loc[:, column] = df.loc[:, column].astype(str)
             df.loc[:, column] = '"' + df.loc[:, column] + '"'
 
@@ -509,7 +510,8 @@ def write(path, df, indexcolumn=0, assoc_ext="txt", nodata=1.0e20):
     # The reason is that datetime columns are converted to string as well
     # and then quoted. This causes trouble with some iMOD(batch) functions.
     for column in df.columns:
-        if df.loc[:, column].dtype == np.dtype("O"):
+        # Test for strings compatible with pandas 2 and 3
+        if pd.api.types.is_string_dtype(df[column].dtype):
             df.loc[:, column] = df.loc[:, column].astype(str)
             df.loc[:, column] = '"' + df.loc[:, column] + '"'
 
diff --git a/imod/mf6/multimodel/exchange_creator.py b/imod/mf6/multimodel/exchange_creator.py
@@ -290,10 +290,12 @@ def rearrange_connected_cells(self):
 
         label_decreasing = df["cell_label1"] > df["cell_label2"]
 
-        colnames = ["cell_idx1", "cell_idx2", "cell_label1", "cell_label2"]
-        colnames_reversed = ["cell_idx2", "cell_idx1", "cell_label2", "cell_label1"]
+        if label_decreasing.any():
+            colnames = ["cell_idx1", "cell_idx2", "cell_label1", "cell_label2"]
+            colnames_reversed = ["cell_idx2", "cell_idx1", "cell_label2", "cell_label1"]
 
-        decreasing_connections = df.loc[label_decreasing, colnames].values
-        df.loc[label_decreasing, colnames_reversed] = decreasing_connections
+            df_decreasing = df.loc[label_decreasing, colnames]
+            df_decreasing.columns = colnames_reversed
+            df.loc[label_decreasing, colnames_reversed] = df_decreasing
 
         self._connected_cells = df
diff --git a/imod/testing.py b/imod/testing.py
@@ -16,7 +16,7 @@ def assert_frame_equal(left: pd.DataFrame, right: pd.DataFrame, **kwargs):
     def always_int64(df):
         df = df.copy()
         for column, dtype in df.dtypes.items():
-            if np.issubdtype(dtype, np.integer):
+            if pd.api.types.is_integer_dtype(dtype):
                 df[column] = df[column].astype(np.int64)
         return df
 
diff --git a/imod/tests/test_evaluate/test_constraints.py b/imod/tests/test_evaluate/test_constraints.py
@@ -94,13 +94,13 @@ def test_intra_cell_boundary_conditions(test_da1):
     riv1drn = test_da1 * (0.3 * 1.0) / min(100.0, 150) * (1.0 - 0.0)
     dt_min_ref = np.minimum(ghbdrn, riv1drn)
 
-    assert dt_min_ref.equals(dt_min)
-    assert dt_all.equals(
-        xr.concat(
-            (ghbdrn, riv1drn), pd.Index(["ghb-drn", "riv_0-drn"], name="combination")
-        )
+    expected_index = pd.Index(
+        ["ghb-drn", "riv_0-drn"], name="combination", dtype="object"
     )
 
+    assert dt_min_ref.equals(dt_min)
+    assert dt_all.equals(xr.concat((ghbdrn, riv1drn), expected_index))
+
 
 def test_intra_cell_boundary_conditions_thickness_zero(test_da1):
     top_bot = xr.Dataset({"top": test_da1 * -1.0, "bot": test_da1 * -1.0})
diff --git a/imod/tests/test_formats/test_gen.py b/imod/tests/test_formats/test_gen.py
@@ -160,16 +160,17 @@ def test_gen_single_feature(tmp_path, ftype):
     imod.gen.write(path, gdf, feature_type="feature_type")
     back = imod.gen.read(path)
     assert (back["feature_type"] == ftype).all()
+    geom_actual = back["geometry"]
+    geom_expected = gdf["geometry"]
+    expected = gdf.drop(columns="geometry").sort_index(axis=1)
+    actual = back.drop(columns="geometry").sort_index(axis=1)
+    # TODO: account for the fact that geopandas string type is ArrowStringArray, whereas iMOD GEN reader returns object dtype.
+    assert expected.equals(actual)
     if ftype in ("circle", "rectangle"):
         # Gotta do a different check, geometries won't be exactly the same
-        geom_actual = back["geometry"].iloc[0]
-        geom_expected = gdf["geometry"].iloc[0]
-        expected = gdf.drop(columns="geometry").sort_index(axis=1)
-        actual = back.drop(columns="geometry").sort_index(axis=1)
-        assert expected.equals(actual)
-        assert approximately_equal(geom_actual, geom_expected)
+        assert approximately_equal(geom_actual.iloc[0], geom_expected.iloc[0])
     else:
-        assert gdf.sort_index(axis=1).equals(back.sort_index(axis=1))
+        assert geom_actual.geom_equals(geom_expected).all()
 
 
 def test_gen_multi_feature(tmp_path):
diff --git a/imod/tests/test_formats/test_prj_wel.py b/imod/tests/test_formats/test_prj_wel.py
@@ -798,8 +798,8 @@ def test_open_projectfile_data_out_of_bounds_wells(
             assert actual[field] == wel_expected[field]
         if actual["has_associated"]:
             timeseries = data["wel-associated"]["dataframe"][0]["time"]
-            # Test if last element NaT
-            assert timeseries.iloc[-1] is pd.NaT
+            # Test if last element not NaT (since pandas 3, before it was NaT)
+            assert timeseries.iloc[-1] == pd.Timestamp("2999-11-12 00:00:00")
 
 
 @pytest.mark.unittest_jit
diff --git a/imod/tests/test_mf6/test_ex01_twri.py b/imod/tests/test_mf6/test_ex01_twri.py
@@ -432,7 +432,7 @@ def test_simulation_write_and_run(twri_model, tmp_path):
     assert head.shape == (1, 3, 15, 15)
     assert np.all(
         head["time"].values
-        == np.array("1999-01-02T00:00:00.000000000", dtype="datetime64[ns]")
+        == np.array("1999-01-02T00:00:00.000000", dtype="datetime64[ns]")
     )
     meanhead_layer = head.groupby("layer").mean(dim=xr.ALL_DIMS)
     mean_answer = np.array([59.79181509, 30.44132373, 24.88576811])
diff --git a/imod/tests/test_mf6/test_mf6_drn.py b/imod/tests/test_mf6/test_mf6_drn.py
@@ -418,10 +418,10 @@ def test_clip_box_transient(transient_drainage):
     selection = drn.clip_box(time_min="2001-01-01", time_max="2004-01-01")
     expected = np.array(
         [
-            "2001-01-01T00:00:00.000000000",
-            "2002-01-01T00:00:00.000000000",
-            "2003-01-01T00:00:00.000000000",
-            "2004-01-01T00:00:00.000000000",
+            "2001-01-01T00:00:00.000000",
+            "2002-01-01T00:00:00.000000",
+            "2003-01-01T00:00:00.000000",
+            "2004-01-01T00:00:00.000000",
         ],
         dtype="datetime64[ns]",
     )
@@ -434,9 +434,9 @@ def test_clip_box_transient(transient_drainage):
     selection = drn.clip_box(time_min="2000-06-01", time_max="2002-06-01")
     expected = np.array(
         [
-            "2000-06-01T00:00:00.000000000",
-            "2001-01-01T00:00:00.000000000",
-            "2002-01-01T00:00:00.000000000",
+            "2000-06-01T00:00:00.000000",
+            "2001-01-01T00:00:00.000000",
+            "2002-01-01T00:00:00.000000",
         ],
         dtype="datetime64[ns]",
     )
@@ -447,10 +447,10 @@ def test_clip_box_transient(transient_drainage):
     selection = drn.clip_box(time_min="1990-06-01", time_max="2002-06-01")
     expected = np.array(
         [
-            "1990-06-01T00:00:00.000000000",
-            "2000-01-01T00:00:00.000000000",
-            "2001-01-01T00:00:00.000000000",
-            "2002-01-01T00:00:00.000000000",
+            "1990-06-01T00:00:00.000000",
+            "2000-01-01T00:00:00.000000",
+            "2001-01-01T00:00:00.000000",
+            "2002-01-01T00:00:00.000000",
         ],
         dtype="datetime64[ns]",
     )
diff --git a/imod/tests/test_mf6/test_mf6_npf.py b/imod/tests/test_mf6/test_mf6_npf.py
@@ -303,10 +303,8 @@ def test_npf_from_imod5_settings(imod5_dataset, tmp_path):
     # move the coordinates a bit so that it doesn't match the grid of k (and the regridding settings will matter)
     target_grid = data["khv"]["kh"]
     x = target_grid["x"].values
-    x += 50
     y = target_grid["y"].values
-    y += 50
-    target_grid = target_grid.assign_coords({"x": x, "y": y})
+    target_grid = target_grid.assign_coords({"x": x + 50, "y": y + 50})
 
     settings = imod.mf6.NodePropertyFlow.get_regrid_methods()
     settings_1 = deepcopy(settings)
diff --git a/imod/tests/test_mf6/test_mf6_out.py b/imod/tests/test_mf6/test_mf6_out.py
@@ -434,7 +434,7 @@ def test_open_cbc__dis_datetime(transient_twri_result):
         )
 
     for array in cbc.values():
-        assert array.coords["time"].dtype == np.dtype("datetime64[ns]")
+        assert array.coords["time"].dtype == np.dtype("datetime64[us]")
 
 
 def test_open_cbc__dis_transient_unconfined(transient_unconfined_twri_result):
@@ -535,7 +535,7 @@ def test_open_cbc__disv_datetime(circle_result):
         )
 
     for array in cbc.values():
-        assert array.coords["time"].dtype == np.dtype("datetime64[ns]")
+        assert array.coords["time"].dtype == np.dtype("datetime64[us]")
 
 
 def test_open_cbc__disv_sto(circle_result_sto):
diff --git a/imod/tests/test_mf6/test_mf6_rch.py b/imod/tests/test_mf6/test_mf6_rch.py
@@ -409,7 +409,8 @@ def test_planar_rch_from_imod5_constant(imod5_dataset, tmp_path):
     target_discretization = StructuredDiscretization.from_imod5_data(data)
 
     # create a planar grid with time-independent recharge
-    data["rch"]["rate"]["layer"].values[0] = -1
+    data["rch"]["rate"] = data["rch"]["rate"].assign_coords(layer=[-1])
+
     assert not is_transient_data_grid(data["rch"]["rate"])
     assert is_planar_grid(data["rch"]["rate"])
 
@@ -432,7 +433,7 @@ def test_planar_rch_from_imod5_constant(imod5_dataset, tmp_path):
     assert "maxbound 33856" in rendered_rch
     assert rendered_rch.count("begin period") == 1
     # teardown
-    data["rch"]["rate"]["layer"].values[0] = 1
+    data["rch"]["rate"] = data["rch"]["rate"].assign_coords(layer=[1])
 
 
 @pytest.mark.unittest_jit
diff --git a/imod/tests/test_mf6/test_mf6_simulation.py b/imod/tests/test_mf6/test_mf6_simulation.py
@@ -211,7 +211,7 @@ def test_simulation_open_head(circle_model, tmp_path):
     )
     assert head.dims == ("time", "layer", "mesh2d_nFaces")
     assert head.shape == (52, 2, 216)
-    assert str(head.coords["time"].values[()][0]) == "2013-04-29T22:00:00.000000000"
+    assert str(head.coords["time"].values[()][0]) == "2013-04-29T22:00:00.000000"
 
 
 class PathCases:
diff --git a/imod/tests/test_mf6/test_mf6_timedis.py b/imod/tests/test_mf6/test_mf6_timedis.py
@@ -22,7 +22,7 @@ def test_render():
         """\
         begin options
           time_units days
-          start_date_time 2000-01-01T00:00:00.000000000
+          start_date_time 2000-01-01T00:00:00.000000
         end options
 
         begin dimensions
diff --git a/imod/tests/test_mf6/test_mf6_transport_model.py b/imod/tests/test_mf6/test_mf6_transport_model.py
@@ -153,7 +153,7 @@ def test_transport_concentration_loading(tmp_path, flow_transport_simulation):
         simulation_start_time="2000-01-31",
         time_unit="s",
     )
-    assert conc_time.coords["time"].dtype == np.dtype("datetime64[ns]")
+    assert conc_time.coords["time"].dtype == np.dtype("datetime64[us]")
 
 
 def test_transport_balance_loading(tmp_path, flow_transport_simulation):
@@ -170,7 +170,7 @@ def test_transport_balance_loading(tmp_path, flow_transport_simulation):
         simulation_start_time="2000-01-31",
         time_unit="s",
     )
-    assert balance_time.coords["time"].dtype == np.dtype("datetime64[ns]")
+    assert balance_time.coords["time"].dtype == np.dtype("datetime64[us]")
 
     np.testing.assert_allclose(
         balance_notime.sel(species="a")["source-sink mix_ssm"].values,
diff --git a/imod/tests/test_mf6/test_mf6_wel.py b/imod/tests/test_mf6/test_mf6_wel.py
@@ -883,7 +883,9 @@ def test_import_and_convert_to_mf6(imod5_dataset, tmp_path, wel_class):
     wel = wel_class.from_imod5_data("wel-WELLS_L3", data, times, minimum_thickness=1.0)
     assert wel.dataset["x"].values[0] == 197910.0
     assert wel.dataset["y"].values[0] == 362860.0
-    assert np.mean(wel.dataset["rate"].values) == -317.2059091946156
+    np.testing.assert_almost_equal(
+        np.mean(wel.dataset["rate"].values), -317.2059091946156
+    )
     # convert to a gridded well
     top = target_dis.dataset["top"]
     bottom = target_dis.dataset["bottom"]
@@ -895,7 +897,9 @@ def test_import_and_convert_to_mf6(imod5_dataset, tmp_path, wel_class):
     assert len(mf6_well.dataset["x"].values) == 1
     assert mf6_well.dataset["x"].values[0] == 197910.0
     assert mf6_well.dataset["y"].values[0] == 362860.0
-    assert np.mean(mf6_well.dataset["rate"].values) == -317.2059091946156
+    np.testing.assert_almost_equal(
+        np.mean(mf6_well.dataset["rate"].values), -317.2059091946156
+    )
 
     # write the package for validation
     write_context = WriteContext(simulation_directory=tmp_path)
diff --git a/imod/tests/test_typing/test_typing_grid.py b/imod/tests/test_typing/test_typing_grid.py
@@ -95,11 +95,11 @@ def test_is_planar_grid(basic_dis, basic_unstructured_dis):
         assert not is_planar_grid(bottom_layer)
 
         # set layer coordinates as present and 0
-        bottom_layer.coords["layer"].values[0] = 0
+        bottom_layer = bottom_layer.assign_coords(layer=[0])
         assert is_planar_grid(bottom_layer)
 
         # set layer coordinates as present and -1
-        bottom_layer.coords["layer"].values[0] = -1
+        bottom_layer = bottom_layer.assign_coords(layer=[-1])
         assert is_planar_grid(bottom_layer)
 
 
@@ -120,7 +120,7 @@ def test_has_negative_layer(basic_dis, basic_unstructured_dis):
         assert not has_negative_layer(bottom_layer)
 
         # set layer coordinates as present and -1
-        bottom_layer.coords["layer"].values[0] = -1
+        bottom_layer = bottom_layer.assign_coords(layer=[-1])
         assert has_negative_layer(bottom_layer)
 
 
diff --git a/imod/tests/test_util/test_util_time.py b/imod/tests/test_util/test_util_time.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pytest_cases import parametrize_with_cases
 
 import imod
 from imod.util.time import _check_year, forcing_starts_ends, to_datetime_internal
@@ -138,3 +139,35 @@ def test_forcing__irregular_day():
         package_times=package_times, globaltimes=globaltimes
     )
     assert starts_ends == ["1", "2:4", "5", "6"]
+
+
+class DateTimeCases:
+    def case_Y(self):
+        return ["2000", "2001", "9999"]
+
+    def case_Ymd(self):
+        return ["20000101", "20000102", "99990101"]
+
+    def case_YmdH(self):
+        return ["2000010100", "2000010200", "9999010100"]
+
+    def case_YmdHM(self):
+        return ["200001010000", "200001020000", "999901010000"]
+
+    def case_YmdHMS(self):
+        return ["20000101000000", "20000102000000", "99990101000000"]
+
+
+@parametrize_with_cases("datestr", cases=DateTimeCases)
+def test_to_pandas_datetime_series(datestr):
+    """
+    Test whether behavior of pandas stays similar to the past, since pandas 2
+    and 3 have different base time units (ns vs us). The function should be able to handle
+    both cases, but the test is to check if the behavior of pandas has changed
+    in a way that affects the function.
+    """
+    series = pd.Series(datestr)
+    datetime_series = imod.util.time.to_pandas_datetime_series(series)
+    assert datetime_series.dtype == np.dtype("datetime64[us]")
+    assert datetime_series.iloc[0] == np.datetime64("2000-01-01", "us")
+    assert datetime_series.iloc[-1] == np.datetime64("9999-01-01", "us")
diff --git a/imod/tests/test_wq/test_wq_drn.py b/imod/tests/test_wq/test_wq_drn.py
diff --git a/imod/util/expand_repetitions.py b/imod/util/expand_repetitions.py
diff --git a/imod/util/time.py b/imod/util/time.py
diff --git a/pixi.lock b/pixi.lock
diff --git a/pixi.toml b/pixi.toml

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@`
`102`	`102`	`)`
`103`	`103`	`m["oc"] = imod.wq.OutputControl(save_head_idf=True, save_concentration_idf=True)`
`104`	`104`	`m.create_time_discretization(`
`105`		`- additional_times=pd.date_range("2000-01-01", "2001-01-01", freq="M")`
	`105`	`+ additional_times=pd.date_range("2000-01-01", "2001-01-01", freq="ME")`
`106`	`106`	`)`
`107`	`107`
`108`	`108`	`# %%`