feat!: improve performance by not sorting in _post_process_ts_df

martibosch · martibosch · commit 24e265ab2639 · 2026-02-01T13:08:01.000+01:00
diff --git a/meteora/clients/base.py b/meteora/clients/base.py
@@ -146,7 +146,7 @@ def _ts_params(self, variable_ids, *args, **kwargs) -> dict:
         return {"variable_ids": variable_ids, **kwargs}
 
     def _post_process_ts_df(self, ts_df: pd.DataFrame) -> pd.DataFrame:
-        return ts_df.apply(pd.to_numeric, axis="columns").sort_index()
+        return ts_df.apply(pd.to_numeric, axis="columns")  # index left unsorted
 
     def _rename_variables_cols(
         self, ts_df: pd.DataFrame, variable_id_ser: pd.Series
diff --git a/meteora/clients/iem.py b/meteora/clients/iem.py
@@ -226,7 +226,6 @@ def _ts_df_from_content(self, response_content: io.StringIO) -> pd.DataFrame:
 
     def _post_process_ts_df(self, ts_df: pd.DataFrame) -> pd.DataFrame:
         # In this case:
-        # - avoid sorting on index as data is already sorted
         # - avoid to_numeric as data is already numeric
         return ts_df
 
diff --git a/meteora/clients/noaa.py b/meteora/clients/noaa.py
@@ -220,10 +220,6 @@ def _process_station_ts_df(year, station_id):
             )
             return pd.DataFrame(columns=variable_cols)
 
-    def _post_process_ts_df(self, ts_df: pd.DataFrame) -> pd.DataFrame:
-        # no need to sort the index given the way the data has been requested
-        return ts_df.apply(pd.to_numeric, axis="columns")
-
     def get_ts_df(
         self,
         variables: VariablesType,
diff --git a/tests/test_meteora.py b/tests/test_meteora.py
@@ -692,9 +692,11 @@ def test_time_series(self):
             )
             # TODO: use "time" as `level` arg?
             assert is_datetime64_any_dtype(ts_df.index.get_level_values(1))
-            # test that index is sorted (note that we need to test it as a multi-index
-            # because otherwise the time index alone is not unique in long data frames
-            assert ts_df.index.is_monotonic_increasing
+            # test that the time index is sorted within each station - we check each
+            # station separately because (i) we do not care if station ids are sorted
+            # and (ii) the time index alone is not unique in long data frames
+            for _, _ts_df in ts_df.groupby(level="station_id"):
+                assert _ts_df.droplevel("station_id").index.is_monotonic_increasing
             # test index labels
             assert ts_df.index.names == [settings.STATIONS_ID_COL, settings.TIME_COL]