Skip to content

Commit 49fd37b

Browse files
authored
handle month ends for polars in offset_times (#42)
1 parent 534283c commit 49fd37b

File tree

2 files changed

+64
-15
lines changed

2 files changed

+64
-15
lines changed

nbs/processing.ipynb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,11 +749,51 @@
749749
" elif isinstance(times, pl_Series) and isinstance(freq, str):\n",
750750
" total_offset = _multiply_pl_freq(freq, n)\n",
751751
" out = times.dt.offset_by(total_offset)\n",
752+
" if 'mo' in freq:\n",
753+
" next_days = times.dt.offset_by('1d')\n",
754+
" month_ends = (next_days.dt.month() != times.dt.month()).all()\n",
755+
" if month_ends:\n",
756+
" out = out.dt.month_end()\n",
752757
" else:\n",
753758
" raise ValueError(f\"Can't process the following combination {(type(times), type(freq))}\")\n",
754759
" return out"
755760
]
756761
},
762+
{
763+
"cell_type": "code",
764+
"execution_count": null,
765+
"id": "03a9f253-3753-4c11-8a7a-410ea924469a",
766+
"metadata": {},
767+
"outputs": [],
768+
"source": [
769+
"pd.testing.assert_index_equal(\n",
770+
" offset_times(pd.to_datetime(['2020-01-31', '2020-02-29', '2020-03-31']), 'M', 1),\n",
771+
" pd.Index(pd.to_datetime(['2020-02-29', '2020-03-31', '2020-04-30'])),\n",
772+
")\n",
773+
"pd.testing.assert_index_equal(\n",
774+
" offset_times(pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']), 'MS', 1),\n",
775+
" pd.Index(pd.to_datetime(['2020-02-01', '2020-03-01', '2020-04-01'])),\n",
776+
")"
777+
]
778+
},
779+
{
780+
"cell_type": "code",
781+
"execution_count": null,
782+
"id": "63ad919f-fea3-4f61-a766-6f952da8bf75",
783+
"metadata": {},
784+
"outputs": [],
785+
"source": [
786+
"#| polars\n",
787+
"pl.testing.assert_series_equal(\n",
788+
" offset_times(pl_Series([dt(2020, 1, 31), dt(2020, 2, 28), dt(2020, 3, 31)]), '1mo_saturating', 1),\n",
789+
" pl_Series([dt(2020, 2, 29), dt(2020, 3, 28), dt(2020, 4, 30)]),\n",
790+
")\n",
791+
"pl.testing.assert_series_equal(\n",
792+
" offset_times(pl_Series([dt(2020, 1, 31), dt(2020, 2, 29), dt(2020, 3, 31)]), '1mo_saturating', 1),\n",
793+
" pl_Series([dt(2020, 2, 29), dt(2020, 3, 31), dt(2020, 4, 30)]),\n",
794+
")"
795+
]
796+
},
757797
{
758798
"cell_type": "code",
759799
"execution_count": null,
@@ -1017,6 +1057,8 @@
10171057
" valid_idxs = np.repeat(cutoff_idxs + 1, h) + np.tile(np.arange(h), cutoff_idxs.size)\n",
10181058
" out_times.append(times[valid_idxs])\n",
10191059
" out_cutoffs.append(np.repeat(times[cutoff_idxs], h))\n",
1060+
" if isinstance(uids, pl_Series):\n",
1061+
" use_series = pl_Series(use_series)\n",
10201062
" out_ids.append(repeat(filter_with_mask(uids, use_series), h))\n",
10211063
" return df_constructor(\n",
10221064
" {\n",

utilsforecast/processing.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -328,13 +328,18 @@ def offset_times(
328328
elif isinstance(times, pl_Series) and isinstance(freq, str):
329329
total_offset = _multiply_pl_freq(freq, n)
330330
out = times.dt.offset_by(total_offset)
331+
if "mo" in freq:
332+
next_days = times.dt.offset_by("1d")
333+
month_ends = (next_days.dt.month() != times.dt.month()).all()
334+
if month_ends:
335+
out = out.dt.month_end()
331336
else:
332337
raise ValueError(
333338
f"Can't process the following combination {(type(times), type(freq))}"
334339
)
335340
return out
336341

337-
# %% ../nbs/processing.ipynb 38
342+
# %% ../nbs/processing.ipynb 40
338343
def offset_dates(
339344
dates: Union[Series, pd.Index],
340345
freq: Union[int, str, BaseOffset],
@@ -345,7 +350,7 @@ def offset_dates(
345350
)
346351
return offset_times(dates, freq, n)
347352

348-
# %% ../nbs/processing.ipynb 39
353+
# %% ../nbs/processing.ipynb 41
349354
def time_ranges(
350355
starts: Union[Series, pd.Index],
351356
freq: Union[int, str, BaseOffset],
@@ -384,7 +389,7 @@ def time_ranges(
384389
out = out.alias(starts.name)
385390
return out
386391

387-
# %% ../nbs/processing.ipynb 42
392+
# %% ../nbs/processing.ipynb 44
388393
def repeat(
389394
s: Union[Series, pd.Index, np.ndarray], n: Union[int, np.ndarray, Series]
390395
) -> Union[Series, pd.Index, np.ndarray]:
@@ -403,7 +408,7 @@ def repeat(
403408
out = out.reset_index(drop=True)
404409
return out
405410

406-
# %% ../nbs/processing.ipynb 45
411+
# %% ../nbs/processing.ipynb 47
407412
def cv_times(
408413
times: np.ndarray,
409414
uids: Union[Series, pd.Index],
@@ -435,6 +440,8 @@ def cv_times(
435440
)
436441
out_times.append(times[valid_idxs])
437442
out_cutoffs.append(np.repeat(times[cutoff_idxs], h))
443+
if isinstance(uids, pl_Series):
444+
use_series = pl_Series(use_series)
438445
out_ids.append(repeat(filter_with_mask(uids, use_series), h))
439446
return df_constructor(
440447
{
@@ -444,7 +451,7 @@ def cv_times(
444451
}
445452
)
446453

447-
# %% ../nbs/processing.ipynb 47
454+
# %% ../nbs/processing.ipynb 49
448455
def group_by(df: Union[Series, DataFrame], by, maintain_order=False):
449456
if isinstance(df, (pd.Series, pd.DataFrame)):
450457
out = df.groupby(by, observed=True, sort=not maintain_order)
@@ -457,7 +464,7 @@ def group_by(df: Union[Series, DataFrame], by, maintain_order=False):
457464
out = df.groupby(by, maintain_order=maintain_order)
458465
return out
459466

460-
# %% ../nbs/processing.ipynb 48
467+
# %% ../nbs/processing.ipynb 50
461468
def group_by_agg(df: DataFrame, by, aggs, maintain_order=False) -> DataFrame:
462469
if isinstance(df, pd.DataFrame):
463470
out = group_by(df, by, maintain_order).agg(aggs).reset_index()
@@ -467,39 +474,39 @@ def group_by_agg(df: DataFrame, by, aggs, maintain_order=False) -> DataFrame:
467474
)
468475
return out
469476

470-
# %% ../nbs/processing.ipynb 51
477+
# %% ../nbs/processing.ipynb 53
471478
def is_in(s: Series, collection) -> Series:
472479
if isinstance(s, pl_Series):
473480
out = s.is_in(collection)
474481
else:
475482
out = s.isin(collection)
476483
return out
477484

478-
# %% ../nbs/processing.ipynb 54
485+
# %% ../nbs/processing.ipynb 56
479486
def between(s: Series, lower: Series, upper: Series) -> Series:
480487
if isinstance(s, pd.Series):
481488
out = s.between(lower, upper)
482489
else:
483490
out = s.is_between(lower, upper)
484491
return out
485492

486-
# %% ../nbs/processing.ipynb 57
493+
# %% ../nbs/processing.ipynb 59
487494
def fill_null(df: DataFrame, mapping: Dict[str, Any]) -> DataFrame:
488495
if isinstance(df, pd.DataFrame):
489496
out = df.fillna(mapping)
490497
else:
491498
out = df.with_columns(*[pl.col(col).fill_null(v) for col, v in mapping.items()])
492499
return out
493500

494-
# %% ../nbs/processing.ipynb 60
501+
# %% ../nbs/processing.ipynb 62
495502
def cast(s: Series, dtype: type) -> Series:
496503
if isinstance(s, pd.Series):
497504
s = s.astype(dtype)
498505
else:
499506
s = s.cast(dtype)
500507
return s
501508

502-
# %% ../nbs/processing.ipynb 63
509+
# %% ../nbs/processing.ipynb 65
503510
def value_cols_to_numpy(
504511
df: DataFrame, id_col: str, time_col: str, target_col: str
505512
) -> np.ndarray:
@@ -510,7 +517,7 @@ def value_cols_to_numpy(
510517
data = data.astype(np.float32)
511518
return data
512519

513-
# %% ../nbs/processing.ipynb 64
520+
# %% ../nbs/processing.ipynb 66
514521
def process_df(
515522
df: DataFrame, id_col: str, time_col: str, target_col: str
516523
) -> Tuple[Series, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]]:
@@ -558,7 +565,7 @@ def process_df(
558565
times = df[time_col].to_numpy()[last_idxs]
559566
return uids, times, data, indptr, sort_idxs
560567

561-
# %% ../nbs/processing.ipynb 66
568+
# %% ../nbs/processing.ipynb 68
562569
class DataFrameProcessor:
563570
def __init__(
564571
self,
@@ -575,7 +582,7 @@ def process(
575582
) -> Tuple[Series, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]]:
576583
return process_df(df, self.id_col, self.time_col, self.target_col)
577584

578-
# %% ../nbs/processing.ipynb 70
585+
# %% ../nbs/processing.ipynb 72
579586
def _single_split(
580587
df: DataFrame,
581588
i_window: int,
@@ -635,7 +642,7 @@ def _single_split(
635642
)
636643
return cutoffs, train_mask, valid_mask
637644

638-
# %% ../nbs/processing.ipynb 71
645+
# %% ../nbs/processing.ipynb 73
639646
def backtest_splits(
640647
df: DataFrame,
641648
n_windows: int,

0 commit comments

Comments (0)