add more functions to processing (#27)

jmoralez · web-flow · commit 5ee76eb40834 · 2023-10-23T19:25:25.000Z
diff --git a/nbs/processing.ipynb b/nbs/processing.ipynb
@@ -31,7 +31,7 @@
    "source": [
     "#| export\n",
     "import re\n",
-    "from typing import Dict, List, Optional, Tuple, Union\n",
+    "from typing import Any, Dict, List, Optional, Tuple, Union\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
@@ -358,6 +358,44 @@
     "    )"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "757e7421-017d-49f9-bdd0-c59fc7556488",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def match_if_categorical(s1: Union[Series, pd.Index], s2: Series) -> Tuple[Series, Series]:\n",
+    "    if isinstance(s1.dtype, pd.CategoricalDtype):\n",
+    "        if isinstance(s1, pd.Index):\n",
+    "            cat1 = s1.categories\n",
+    "        else:\n",
+    "            cat1 = s1.cat.categories\n",
+    "        if isinstance(s2.dtype, pd.CategoricalDtype):\n",
+    "            cat2 = s2.cat.categories\n",
+    "        else:\n",
+    "            cat2 = s2.unique().astype(cat1.dtype)\n",
+    "        missing = set(cat2) - set(cat1)\n",
+    "        if missing:\n",
+    "            # we assume the original is s1, so we extend its categories\n",
+    "            new_dtype = pd.CategoricalDtype(categories=cat1.tolist() + sorted(missing))\n",
+    "            s1 = s1.astype(new_dtype)\n",
+    "            s2 = s2.astype(new_dtype)\n",
+    "    elif isinstance(s1, pl_Series) and s1.dtype == pl.Categorical:\n",
+    "        with pl.StringCache():\n",
+    "            cat1 = s1.cat.get_categories()\n",
+    "            if s2.dtype == pl.Categorical:\n",
+    "                cat2 = s2.cat.get_categories()\n",
+    "            else:\n",
+    "                cat2 = s2.unique().sort().cast(cat1.dtype)\n",
+    "            # populate cache, keep original categories first\n",
+    "            pl.concat([cat1, cat2]).cast(pl.Categorical)\n",
+    "            s1 = s1.cast(pl.Utf8).cast(pl.Categorical)\n",
+    "            s2 = s2.cast(pl.Utf8).cast(pl.Categorical)\n",
+    "    return s1, s2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -369,15 +407,73 @@
     "def vertical_concat(dfs: List[DataFrame]) -> DataFrame:\n",
     "    if not dfs:\n",
     "        raise ValueError(\"Can't concatenate empty list.\")\n",
+    "    if len(dfs) == 1:\n",
+    "        return dfs[0]  # single frame: return the frame itself (not the list) to honour -> DataFrame\n",
     "    if isinstance(dfs[0], pd.DataFrame):\n",
-    "        out = pd.concat(dfs)\n",
-    "    elif isinstance(dfs[0], pl_DataFrame):\n",
-    "        out = pl.concat(dfs)\n",
+    "        cat_cols = [c for c, dtype in dfs[0].dtypes.items() if isinstance(dtype, pd.CategoricalDtype)]\n",
+    "        if cat_cols:\n",
+    "            if len(dfs) > 2:\n",
+    "                raise NotImplementedError('Categorical replacement for more than two dataframes')\n",
+    "            assert len(dfs) == 2\n",
+    "            df1, df2 = dfs\n",
+    "            df1 = df1.copy(deep=False)  # shallow copies: the column reassignment below happens on these, not on the inputs\n",
+    "            df2 = df2.copy(deep=False)            \n",
+    "            for col in cat_cols:\n",
+    "                s1, s2 = match_if_categorical(df1[col], df2[col])\n",
+    "                df1[col] = s1\n",
+    "                df2[col] = s2\n",
+    "            dfs = [df1, df2]\n",
+    "        out = pd.concat(dfs).reset_index(drop=True)\n",
     "    else:\n",
-    "        raise ValueError(f'Got list of unexpected types: {type(dfs[0])}.')\n",
+    "        all_cols = dfs[0].columns\n",
+    "        cat_cols = [all_cols[i] for i, dtype in enumerate(dfs[0].dtypes) if dtype == pl.Categorical]\n",
+    "        if cat_cols:\n",
+    "            if len(dfs) > 2:\n",
+    "                raise NotImplementedError('Categorical replacement for more than two dataframes')\n",
+    "            assert len(dfs) == 2\n",
+    "            df1, df2 = dfs\n",
+    "            for col in cat_cols:\n",
+    "                s1, s2 = match_if_categorical(df1[col], df2[col])\n",
+    "                df1 = df1.with_columns(s1)\n",
+    "                df2 = df2.with_columns(s2)\n",
+    "            dfs = [df1, df2]\n",
+    "        out = pl.concat(dfs)\n",
     "    return out"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a21c0461-3964-4c82-a406-9fb7ea624f23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame({'x': ['a', 'b', 'c']}, dtype='category')\n",
+    "df2 = pd.DataFrame({'x': ['f', 'b', 'a']}, dtype='category')\n",
+    "pd.testing.assert_series_equal(\n",
+    "    vertical_concat([df1,df2])['x'],\n",
+    "    pd.Series(['a', 'b', 'c', 'f', 'b', 'a'], name='x', dtype=pd.CategoricalDtype(categories=['a', 'b', 'c', 'f']))\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "986ab374-90fc-4ba8-b442-797abc63d2de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| polars\n",
+    "df1 = pl.DataFrame({'x': ['a', 'b', 'c']}, schema={'x': pl.Categorical})\n",
+    "df2 = pl.DataFrame({'x': ['f', 'b', 'a']}, schema={'x': pl.Categorical})\n",
+    "out = vertical_concat([df1,df2])['x']\n",
+    "assert out.series_equal(pl.Series('x', ['a', 'b', 'c', 'f', 'b', 'a']))\n",
+    "assert out.to_physical().series_equal(pl.Series('x', [0, 1, 2, 3, 1, 0]))\n",
+    "assert out.cat.get_categories().series_equal(\n",
+    "    pl.Series('x', ['a', 'b', 'c', 'f'])\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -452,11 +548,15 @@
    "source": [
     "#| export\n",
     "def join(\n",
-    "    df1: DataFrame,\n",
-    "    df2: DataFrame,\n",
+    "    df1: Union[DataFrame, Series],\n",
+    "    df2: Union[DataFrame, Series],\n",
     "    on: Union[str, List[str]],\n",
     "    how: str = 'inner'\n",
     ") -> DataFrame:\n",
+    "    if isinstance(df1, (pd.Series, pl_Series)):\n",
+    "        df1 = df1.to_frame()\n",
+    "    if isinstance(df2, (pd.Series, pl_Series)):\n",
+    "        df2 = df2.to_frame()\n",
     "    if isinstance(df1, pd.DataFrame):\n",
     "        out = df1.merge(df2, on=on, how=how)\n",
     "    else:\n",
@@ -502,14 +602,68 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def sort(df: DataFrame, by: Union[str, List[str]]) -> DataFrame:\n",
+    "def sort(df: DataFrame, by: Optional[Union[str, List[str]]] = None) -> DataFrame:\n",
     "    if isinstance(df, pd.DataFrame):\n",
-    "        out = df.sort_values(by)\n",
-    "    else:\n",
+    "        out = df.sort_values(by).reset_index(drop=True)\n",
+    "    elif isinstance(df, (pd.Series, pd.Index)):\n",
+    "        out = df.sort_values()\n",
+    "        if isinstance(out, pd.Series):\n",
+    "            out = out.reset_index(drop=True)\n",
+    "    elif isinstance(df, pl_DataFrame):\n",
     "        out = df.sort(by)\n",
+    "    else:\n",
+    "        out = df.sort()\n",
     "    return out"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c14e0b1c-3770-4d8d-a8d0-63ed2bdf147c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.testing.assert_frame_equal(\n",
+    "    sort(pd.DataFrame({'x': [3, 1, 2]}), 'x'),\n",
+    "    pd.DataFrame({'x': [1, 2, 3]})\n",
+    ")\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    sort(pd.DataFrame({'x': [3, 1, 2]}), ['x']),\n",
+    "    pd.DataFrame({'x': [1, 2, 3]})\n",
+    ")\n",
+    "pd.testing.assert_series_equal(\n",
+    "    sort(pd.Series([3, 1, 2])),\n",
+    "    pd.Series([1, 2, 3])\n",
+    ")\n",
+    "pd.testing.assert_index_equal(\n",
+    "    sort(pd.Index([3, 1, 2])),\n",
+    "    pd.Index([1, 2, 3])\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43e1c151-1f81-442d-9f32-d88ca85a5e73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| polars\n",
+    "# TODO: replace with pl.testing.assert_frame_equal when it's released\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    sort(pl.DataFrame({'x': [3, 1, 2]}), 'x').to_pandas(),\n",
+    "    pd.DataFrame({'x': [1, 2, 3]}),\n",
+    ")\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    sort(pl.DataFrame({'x': [3, 1, 2]}), ['x']).to_pandas(),\n",
+    "    pd.DataFrame({'x': [1, 2, 3]}),\n",
+    ")\n",
+    "pd.testing.assert_series_equal(\n",
+    "    sort(pl.Series('x', [3, 1, 2])).to_pandas(),\n",
+    "    pd.Series([1, 2, 3], name='x')\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -557,6 +711,49 @@
     "    return out"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2e3ff2c-9e70-46b3-9bf0-bbfcd339d9ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def group_by_agg(df: DataFrame, by, aggs, maintain_order=False) -> DataFrame:\n",
+    "    if isinstance(df, pd.DataFrame):\n",
+    "        out = group_by(df, by, maintain_order).agg(aggs).reset_index()\n",
+    "    else:\n",
+    "        out = group_by(df, by, maintain_order).agg(*[getattr(pl.col(c), agg)() for c, agg in aggs.items()])\n",
+    "    return out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9f92cd4-d3f7-4de1-b438-c3c5891c3343",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.testing.assert_frame_equal(\n",
+    "    group_by_agg(pd.DataFrame({'x': [1, 1, 2], 'y': [1, 1, 1]}), 'x', {'y': 'sum'}),\n",
+    "    pd.DataFrame({'x': [1, 2], 'y': [2, 1]})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "329cfc66-a218-498e-b674-96491f47a3e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| polars\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    group_by_agg(pl.DataFrame({'x': [1, 1, 2], 'y': [1, 1, 1]}), 'x', {'y': 'sum'}, maintain_order=True).to_pandas(),\n",
+    "    pd.DataFrame({'x': [1, 2], 'y': [2, 1]})\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -594,6 +791,93 @@
     "np.testing.assert_equal(is_in(pl.Series([1, 2, 3]), [1]), np.array([True, False, False]))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9717022e-2c6f-47dc-8b19-da069341b094",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def between(s: Series, lower: Series, upper: Series) -> Series:\n",
+    "    if isinstance(s, pd.Series):\n",
+    "        out = s.between(lower, upper)\n",
+    "    else:\n",
+    "        out = s.is_between(lower, upper)\n",
+    "    return out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dca138b4-e771-4b8e-aa54-35dc37802d78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.testing.assert_equal(\n",
+    "    between(pd.Series([1, 2, 3]), pd.Series([0, 1, 4]), pd.Series([4, 1, 2])),\n",
+    "    np.array([True, False, False]),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6c773bf-fe23-4428-84f6-c5afaefdad06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| polars\n",
+    "np.testing.assert_equal(\n",
+    "    between(pl.Series([1, 2, 3]), pl.Series([0, 1, 4]), pl.Series([4, 1, 2])),\n",
+    "    np.array([True, False, False]),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "667302bf-3b54-4298-8fcc-82cd6b12fb73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def fill_null(df: DataFrame, mapping: Dict[str, Any]) -> DataFrame:\n",
+    "    if isinstance(df, pd.DataFrame):\n",
+    "        out = df.fillna(mapping)\n",
+    "    else:\n",
+    "        out = df.with_columns(*[pl.col(col).fill_null(v) for col, v in mapping.items()])\n",
+    "    return out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74993c58-0886-4290-ab90-8065651886c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.testing.assert_frame_equal(\n",
+    "    fill_null(pd.DataFrame({'x': [1, np.nan], 'y': [np.nan, 2]}), {'x': 2, 'y': 1}),\n",
+    "    pd.DataFrame({'x': [1, 2], 'y': [1, 2]}, dtype='float64')\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec1d835a-f1dc-4c1a-be2d-e7dd5b9895ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| polars\n",
+    "# TODO: replace with pl.testing.assert_frame_equal when it's released\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    fill_null(pl.DataFrame({'x': [1, None], 'y': [None, 2]}), {'x': 2, 'y': 1}).to_pandas(),\n",
+    "    pd.DataFrame({'x': [1, 2], 'y': [1, 2]})\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/utilsforecast/_modidx.py b/utilsforecast/_modidx.py
@@ -68,15 +68,20 @@
                                                                                                          'utilsforecast/processing.py'),
                                           'utilsforecast.processing.assign_columns': ( 'processing.html#assign_columns',
                                                                                        'utilsforecast/processing.py'),
+                                          'utilsforecast.processing.between': ('processing.html#between', 'utilsforecast/processing.py'),
                                           'utilsforecast.processing.copy_if_pandas': ( 'processing.html#copy_if_pandas',
                                                                                        'utilsforecast/processing.py'),
                                           'utilsforecast.processing.counts_by_id': ( 'processing.html#counts_by_id',
                                                                                      'utilsforecast/processing.py'),
                                           'utilsforecast.processing.drop_index_if_pandas': ( 'processing.html#drop_index_if_pandas',
                                                                                              'utilsforecast/processing.py'),
+                                          'utilsforecast.processing.fill_null': ( 'processing.html#fill_null',
+                                                                                  'utilsforecast/processing.py'),
                                           'utilsforecast.processing.filter_with_mask': ( 'processing.html#filter_with_mask',
                                                                                          'utilsforecast/processing.py'),
                                           'utilsforecast.processing.group_by': ('processing.html#group_by', 'utilsforecast/processing.py'),
+                                          'utilsforecast.processing.group_by_agg': ( 'processing.html#group_by_agg',
+                                                                                     'utilsforecast/processing.py'),
                                           'utilsforecast.processing.horizontal_concat': ( 'processing.html#horizontal_concat',
                                                                                           'utilsforecast/processing.py'),
                                           'utilsforecast.processing.is_in': ('processing.html#is_in', 'utilsforecast/processing.py'),
@@ -85,6 +90,8 @@
                                                                                        'utilsforecast/processing.py'),
                                           'utilsforecast.processing.is_none': ('processing.html#is_none', 'utilsforecast/processing.py'),
                                           'utilsforecast.processing.join': ('processing.html#join', 'utilsforecast/processing.py'),
+                                          'utilsforecast.processing.match_if_categorical': ( 'processing.html#match_if_categorical',
+                                                                                             'utilsforecast/processing.py'),
                                           'utilsforecast.processing.maybe_compute_sort_indices': ( 'processing.html#maybe_compute_sort_indices',
                                                                                                    'utilsforecast/processing.py'),
                                           'utilsforecast.processing.offset_dates': ( 'processing.html#offset_dates',
diff --git a/utilsforecast/processing.py b/utilsforecast/processing.py