Skip to content

Commit a8324dd

Browse files
authored
[ENH] select columns by level (#1111)
* update tests; update logic for slicing when columns are not unique * skeleton for level parameter * skeleton for level parameter * clean up select_columns.py * changelog * avoid copying the dataframe, by making changes on the columns only for level parameter * cleanup _select_column_names * add explanation for level parameter logic * simply if-else logic in select_columns * use set_axis to avoid modifying the original dataframe * reuse variable name * clean up a bit * add more comments * simplify search for strings * fix error message * update comments * return early * fail loudly if search not found * cleanup * cleanup * fail loudly * fix logic for string columns - check the underlying dtype * fix logic for string columns - check the underlying dtype * improve string/categorical column logic * add tests for categorical columns
1 parent d5bea6a commit a8324dd

File tree

5 files changed

+180
-137
lines changed

5 files changed

+180
-137
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
- [DOC] Updated developer guide docs.
66
- [ENH] Allow column selection/renaming within conditional_join. #1102 @samukweku.
77
- [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521
8-
- [ENH] Extend select_columns to support non-string columns. #1105 @samukweku
8+
- [ENH] Extend select_columns to support non-string columns. Also allow selection on MultiIndex columns via level parameter. #1105 @samukweku
99
- [ENH] Performance improvement for groupby_topk. #1093 @samukweku
1010
- [EHN] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
1111

janitor/functions/select_columns.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
"""Implementation of select_columns"""
2+
from typing import Optional, Union
23
import pandas_flavor as pf
34
import pandas as pd
4-
5-
from janitor.utils import deprecated_alias
5+
from pandas.api.types import is_list_like
6+
from janitor.utils import deprecated_alias, check
67

78
from janitor.functions.utils import _select_column_names
8-
from pandas.api.types import is_list_like
99

1010

1111
@pf.register_dataframe_method
1212
@deprecated_alias(search_cols="search_column_names")
1313
def select_columns(
1414
df: pd.DataFrame,
1515
*args,
16+
level: Optional[Union[int, str]] = None,
1617
invert: bool = False,
1718
) -> pd.DataFrame:
1819
"""
@@ -48,6 +49,8 @@ def select_columns(
4849
a callable which is applicable to each Series in the DataFrame,
4950
or variable arguments of all the aforementioned.
5051
A sequence of booleans is also acceptable.
52+
:param level: Determines which level in the columns should be used for the
53+
column selection.
5154
:param invert: Whether or not to invert the selection.
5255
This will result in the selection of the complement of the columns
5356
provided.
@@ -62,8 +65,26 @@ def select_columns(
6265
search_column_names.extend(arg)
6366
else:
6467
search_column_names.append(arg)
65-
full_column_list = _select_column_names(search_column_names, df)
66-
68+
if level is not None:
69+
# goal here is to capture the original columns
70+
# trim the df.columns to the specified level only,
71+
# and apply the selection (_select_column_names)
72+
# to get the relevant column labels.
73+
# note that no level is dropped; if there are three levels,
74+
# then three levels are returned, with the specified labels
75+
# selected/deselected.
76+
# A copy of the dataframe is made via set_axis,
77+
# to avoid mutating the original dataframe.
78+
df_columns = df.columns
79+
check("level", level, [int, str])
80+
full_column_list = df_columns.get_level_values(level)
81+
full_column_list = _select_column_names(
82+
search_column_names, df.set_axis(full_column_list, axis=1)
83+
)
84+
full_column_list = df_columns.isin(full_column_list, level=level)
85+
full_column_list = df_columns[full_column_list]
86+
else:
87+
full_column_list = _select_column_names(search_column_names, df)
6788
if invert:
6889
return df.drop(columns=full_column_list)
6990
return df.loc[:, full_column_list]

janitor/functions/utils.py

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
union_categoricals,
1313
is_scalar,
1414
is_list_like,
15+
is_datetime64_dtype,
16+
is_string_dtype,
17+
is_categorical_dtype,
1518
)
1619
import numpy as np
1720
from multipledispatch import dispatch
@@ -212,7 +215,16 @@ def _select_column_names(columns_to_select, df):
212215
"""
213216
if columns_to_select in df.columns:
214217
return [columns_to_select]
215-
raise KeyError(f"No match was returned for '{columns_to_select}'.")
218+
raise KeyError(f"No match was returned for {columns_to_select}.")
219+
220+
221+
def _is_str_or_cat(df_columns):
222+
"""Check if the column is a string or categorical with strings."""
223+
if is_string_dtype(df_columns):
224+
return True
225+
if is_categorical_dtype(df_columns):
226+
return is_string_dtype(df_columns.categories)
227+
return False
216228

217229

218230
@_select_column_names.register(str) # noqa: F811
@@ -221,32 +233,26 @@ def _column_sel_dispatch(columns_to_select, df): # noqa: F811
221233
Base function for column selection.
222234
Applies only to strings.
223235
It is also applicable to shell-like glob strings,
224-
specifically, the `*`.
236+
which are supported by `fnmatch`.
225237
A list/pandas Index of matching column names is returned.
226238
"""
227239
df_columns = df.columns
228-
if pd.api.types.is_string_dtype(df_columns):
229-
if (
230-
"*" in columns_to_select
231-
): # shell-style glob string (e.g., `*_thing_*`)
232-
return fnmatch.filter(df_columns, columns_to_select)
240+
241+
if _is_str_or_cat(df_columns):
233242
if columns_to_select in df_columns:
234243
return [columns_to_select]
235-
raise KeyError(f"No match was returned for '{columns_to_select}'.")
236-
if pd.api.types.is_datetime64_any_dtype(df_columns):
237-
if not df_columns.is_monotonic_increasing:
238-
raise ValueError(
239-
"The column is a DatetimeIndex and should be "
240-
"monotonic increasing."
241-
)
244+
outcome = fnmatch.filter(df_columns, columns_to_select)
245+
if not outcome:
246+
raise KeyError(f"No match was returned for '{columns_to_select}'.")
247+
return outcome
248+
249+
if is_datetime64_dtype(df_columns):
242250
timestamp = df_columns.get_loc(columns_to_select)
243-
if isinstance(timestamp, slice):
251+
if not isinstance(timestamp, int):
244252
return df_columns[timestamp]
245253
return [df_columns[timestamp]]
246-
raise KeyError(
247-
f"String('{columns_to_select}') can be applied "
248-
"only to string/datetime columns."
249-
)
254+
255+
raise KeyError(f"No match was returned for '{columns_to_select}'.")
250256

251257

252258
@_select_column_names.register(re.Pattern) # noqa: F811
@@ -257,15 +263,16 @@ def _column_sel_dispatch(columns_to_select, df): # noqa: F811
257263
`re.compile` is required for the regular expression.
258264
A pandas Index of matching column names is returned.
259265
"""
260-
if pd.api.types.is_string_dtype(df.columns):
261-
bools = df.columns.str.contains(
266+
df_columns = df.columns
267+
268+
if _is_str_or_cat(df_columns):
269+
bools = df_columns.str.contains(
262270
columns_to_select, na=False, regex=True
263271
)
264-
return df.columns[bools]
265-
raise KeyError(
266-
f"Regular expressions('{columns_to_select}') "
267-
"can be applied only to string columns."
268-
)
272+
if not bools.any():
273+
raise KeyError(f"No match was returned for {columns_to_select}.")
274+
return df_columns[bools]
275+
raise KeyError(f"No match was returned for {columns_to_select}.")
269276

270277

271278
@_select_column_names.register(slice) # noqa: F811
@@ -291,13 +298,12 @@ def _column_sel_dispatch(columns_to_select, df): # noqa: F811
291298
step_check = None
292299
method = None
293300

294-
if not df_columns.is_unique:
301+
if not df_columns.is_unique and not df_columns.is_monotonic_increasing:
295302
raise ValueError(
296-
"The column labels are not unique. "
297-
"Kindly ensure the labels are unique "
298-
"to ensure the correct output."
303+
"Non-unique column labels should be monotonic increasing."
299304
)
300-
is_date_column = pd.api.types.is_datetime64_any_dtype(df_columns)
305+
306+
is_date_column = is_datetime64_dtype(df_columns)
301307
if is_date_column:
302308
if not df_columns.is_monotonic_increasing:
303309
raise ValueError(
@@ -377,6 +383,8 @@ def _column_sel_dispatch(columns_to_select, df): # noqa: F811
377383
raise TypeError(
378384
"The output of the applied callable should be a boolean array."
379385
)
386+
if not filtered_columns.any():
387+
raise KeyError(f"No match was returned for {columns_to_select}.")
380388

381389
return df.columns[filtered_columns]
382390

tests/functions/test_select_columns.py

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pandas as pd
22
import pytest
3+
import re
34
from pandas.testing import assert_frame_equal
45

56

@@ -51,36 +52,6 @@ def test_select_column_names_missing_columns(dataframe, columns):
5152
dataframe.select_columns(columns)
5253

5354

54-
@pytest.mark.functions
55-
@pytest.mark.parametrize(
56-
"columns",
57-
[
58-
pytest.param(
59-
"a",
60-
marks=pytest.mark.xfail(
61-
reason="`select_columns` now accepts strings"
62-
),
63-
),
64-
pytest.param(
65-
("a", "Bell__Chart"),
66-
marks=pytest.mark.xfail(
67-
reason="`select_columns` converts list-like into lists"
68-
),
69-
),
70-
pytest.param(
71-
{"a", "Bell__Chart"},
72-
marks=pytest.mark.xfail(
73-
reason="`select_columns` converts list-like into lists"
74-
),
75-
),
76-
],
77-
)
78-
def test_select_column_names_input(dataframe, columns):
79-
"""Check that passing an iterable that is not a list raises TypeError."""
80-
with pytest.raises(TypeError):
81-
dataframe.select_columns(columns)
82-
83-
8455
@pytest.mark.functions
8556
@pytest.mark.parametrize(
8657
"invert,expected",
@@ -116,20 +87,60 @@ def columns(x):
11687
assert_frame_equal(df, dataframe[expected])
11788

11889

119-
@pytest.mark.xfail(reason="Allow tuples which are acceptable in MultiIndex.")
120-
def test_MultiIndex():
121-
"""
122-
Raise ValueError if columns is a MultiIndex.
123-
"""
124-
df = pd.DataFrame(
90+
@pytest.fixture
91+
def df_tuple():
92+
"pytest fixture."
93+
frame = pd.DataFrame(
12594
{
12695
"A": {0: "a", 1: "b", 2: "c"},
12796
"B": {0: 1, 1: 3, 2: 5},
12897
"C": {0: 2, 1: 4, 2: 6},
12998
}
13099
)
100+
frame.columns = [list("ABC"), list("DEF")]
101+
return frame
102+
103+
104+
def test_multiindex(df_tuple):
105+
"""
106+
Test output for a MultiIndex and tuple passed.
107+
"""
108+
assert_frame_equal(
109+
df_tuple.select_columns(("A", "D")), df_tuple.loc[:, [("A", "D")]]
110+
)
131111

132-
df.columns = [list("ABC"), list("DEF")]
133112

134-
with pytest.raises(ValueError):
135-
df.select_columns("A")
113+
def test_level_callable(df_tuple):
114+
"""
115+
Test output if level is supplied for a callable.
116+
"""
117+
expected = df_tuple.select_columns(
118+
lambda df: df.name.startswith("A"), level=0
119+
)
120+
actual = df_tuple.xs("A", axis=1, drop_level=False, level=0)
121+
assert_frame_equal(actual, expected)
122+
123+
124+
def test_level_regex(df_tuple):
125+
"""
126+
Test output if level is supplied for a regex.
127+
"""
128+
expected = df_tuple.select_columns(re.compile("D"), level=1)
129+
actual = df_tuple.xs("D", axis=1, drop_level=False, level=1)
130+
assert_frame_equal(actual, expected)
131+
132+
133+
def test_level_slice(df_tuple):
134+
"""
135+
Test output if level is supplied for a slice.
136+
"""
137+
expected = df_tuple.select_columns(slice("F", "D"), level=1)
138+
assert_frame_equal(df_tuple, expected)
139+
140+
141+
def test_level_str(df_tuple):
142+
"""
143+
Test output if level is supplied for a string.
144+
"""
145+
expected = df_tuple.select_columns("A", level=0, invert=True)
146+
assert_frame_equal(df_tuple.drop(columns="A", axis=1, level=0), expected)

0 commit comments

Comments
 (0)