WIP: select_dtypes impl

cpcloud · cpcloud · commit 4fc5ae79ed74 · 2014-07-07T12:13:32.000-04:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -524,6 +524,7 @@ Attributes and underlying data
    DataFrame.ftypes
    DataFrame.get_dtype_counts
    DataFrame.get_ftype_counts
+   DataFrame.select_dtypes
    DataFrame.values
    DataFrame.axes
    DataFrame.ndim
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1552,3 +1552,84 @@ While float dtypes are unchanged.
    casted = dfa[df2>0]
    casted
    casted.dtypes
+
+Selecting columns based on ``dtype``
+------------------------------------
+
+.. _basics.selectdtypes:
+
+.. versionadded:: 0.14.1
+
+The :meth:`~pandas.DataFrame.select_dtypes` method implements subsetting of columns
+based on their ``dtype``.
+
+First, let's create a :class:`~pandas.DataFrame` with a slew of different
+dtypes:
+
+.. ipython:: python
+
+   df = DataFrame({'string': list('abc'),
+                   'int64': list(range(1, 4)),
+                   'uint8': np.arange(3, 6).astype('u1'),
+                   'float64': np.arange(4.0, 7.0),
+                   'bool1': [True, False, True],
+                   'bool2': [False, True, False],
+                   'dates': pd.date_range('now', periods=3).values})
+   df['tdeltas'] = df.dates.diff()
+   df['uint64'] = np.arange(3, 6).astype('u8')
+   df['other_dates'] = pd.date_range('20130101', periods=3).values
+   df
+
+
+``select_dtypes`` has two parameters ``include`` and ``exclude`` that allow you to
+say "give me the columns WITH these dtypes" (``include``) and/or "give me the
+columns WITHOUT these dtypes" (``exclude``).
+
+For example, to select ``bool`` columns
+
+.. ipython:: python
+
+   df.select_dtypes(include=[bool])
+
+You can also pass the name of a dtype in the `numpy dtype hierarchy
+<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__:
+
+.. ipython:: python
+
+   df.select_dtypes(include=['bool'])
+
+:meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes.
+
+For example, to select all numeric and boolean columns while excluding unsigned
+integers
+
+.. ipython:: python
+
+   df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])
+
+To select string columns you must use the ``object`` dtype:
+
+.. ipython:: python
+
+   df.select_dtypes(include=['object'])
+
+To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you
+can define a function that returns a tree of child dtypes:
+
+.. ipython:: python
+
+   def subdtypes(dtype):
+       subs = dtype.__subclasses__()
+       if not subs:
+           return dtype
+       return [dtype, [subdtypes(dt) for dt in subs]]
+
+All numpy dtypes are subclasses of ``numpy.generic``:
+
+.. ipython:: python
+
+    subdtypes(np.generic)
+
+.. note::
+
+   The ``include`` and ``exclude`` parameters must be non-string sequences.
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -91,6 +91,8 @@ Enhancements
 
 
 
+- Add :meth:`~pandas.DataFrame.select_dtypes` method to allow selection of
+  columns based on dtype (:issue:`7316`). See :ref:`the docs <basics.selectdtypes>`.
 
 
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -1603,6 +1603,66 @@ def _get_fill_func(method):
 #----------------------------------------------------------------------
 # Lots of little utilities
 
+def _validate_date_like_dtype(dtype):
+    """Check that a date-like ``dtype`` is generic or nanosecond based.
+
+    Parameters
+    ----------
+    dtype : object
+        Passed to ``np.datetime_data``; its ``name`` and ``type`` attributes
+        are used in the error message (presumably a ``numpy.dtype`` -- confirm
+        against callers).
+
+    Raises
+    ------
+    TypeError
+        * If ``np.datetime_data`` raises a ``ValueError`` for ``dtype``; the
+          original message is carried over
+    ValueError
+        * If the first element returned by ``np.datetime_data`` is neither
+          ``'generic'`` nor ``'ns'``
+    """
+    try:
+        unit = np.datetime_data(dtype)[0]
+    except ValueError as err:
+        raise TypeError('%s' % err)
+    if unit not in ('generic', 'ns'):
+        raise ValueError('%r is too specific of a frequency, try passing %r'
+                         % (dtype.name, dtype.type.__name__))
+
+
+def _invalidate_string_dtypes(dtype_set):
+    """Reject string-like dtypes for ``DataFrame.select_dtypes()``.
+
+    Parameters
+    ----------
+    dtype_set : set or frozenset
+        The dtype types to check against the module-level ``_string_dtypes``.
+
+    Raises
+    ------
+    TypeError
+        * If ``dtype_set`` shares any element with ``_string_dtypes``
+    """
+    if dtype_set & _string_dtypes:
+        raise TypeError("string dtypes are not allowed, use 'object' instead")
+
+
+def _get_dtype_from_object(dtype):
+    """Get a numpy dtype.type-style object.
+
+    Parameters
+    ----------
+    dtype : type, numpy.dtype, string or other object
+        A ``numpy.generic`` subclass is returned as is, a ``numpy.dtype``
+        yields its ``type`` attribute, and a string is first looked up as an
+        attribute of the ``numpy`` module (``'datetime'`` and ``'timedelta'``
+        get ``'64'`` appended beforehand). Anything else, including a string
+        that is not a ``numpy`` attribute, is passed to ``np.dtype`` and the
+        result is resolved again by this function.
+
+    Raises
+    ------
+    ValueError
+        * Propagated from ``_validate_date_like_dtype`` for a ``numpy.dtype``
+          input; only the ``TypeError`` it raises is suppressed
+
+    Notes
+    -----
+    If nothing can be found, returns ``object``.
+    NOTE(review): presumably this relies on ``np.dtype`` mapping unknown
+    Python types to the object dtype -- confirm.
+    """
+    # type object from a dtype
+    if isinstance(dtype, type) and issubclass(dtype, np.generic):
+        return dtype
+    elif isinstance(dtype, np.dtype):  # dtype object
+        try:
+            _validate_date_like_dtype(dtype)
+        except TypeError:
+            # should still pass if we don't have a datelike
+            pass
+        return dtype.type
+    elif isinstance(dtype, compat.string_types):
+        # bare 'datetime'/'timedelta' are looked up under their '64' names
+        if dtype == 'datetime' or dtype == 'timedelta':
+            dtype += '64'
+        try:
+            # resolve whatever numpy exposes under this name
+            return _get_dtype_from_object(getattr(np, dtype))
+        except AttributeError:
+            # the string is not the name of a numpy attribute; fall through
+            # and let np.dtype() interpret it below
+            pass
+    # everything else: let numpy build a dtype object, then take the
+    # ``np.dtype`` branch above on the recursive call
+    return _get_dtype_from_object(np.dtype(dtype))
+
+
+# dtype types that the binary and text string types resolve to; used by
+# ``_invalidate_string_dtypes`` to reject string dtypes
+_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type,
+                                                        compat.text_type)))
+
+
+def _get_info_slice(obj, indexer):
+    """Build an indexing tuple that applies `indexer` to the info axis.
+
+    Parameters
+    ----------
+    obj : object
+        Must expose ``_info_axis_number`` and ``ndim``.
+    indexer : object
+        Placed at position ``obj._info_axis_number`` of the result.
+
+    Returns
+    -------
+    key : tuple
+        ``obj.ndim`` entries, all ``slice(None)`` except for `indexer` at the
+        info axis position.
+
+    Raises
+    ------
+    TypeError
+        * If `obj` has no ``_info_axis_number`` attribute
+    """
+    if hasattr(obj, '_info_axis_number'):
+        key = [slice(None)] * obj.ndim
+        key[obj._info_axis_number] = indexer
+        return tuple(key)
+    raise TypeError('object of type %r has no info axis' %
+                    type(obj).__name__)
+
 
 def _maybe_box(indexer, values, obj, key):
 
@@ -1613,6 +1673,7 @@ def _maybe_box(indexer, values, obj, key):
     # return the value
     return values
 
+
 def _maybe_box_datetimelike(value):
     # turn a datetime like into a Timestamp/timedelta as needed
 
@@ -1797,6 +1858,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
 
     return value
 
+
 def _possibly_infer_to_datetimelike(value):
     # we might have a array (or single object) that is datetime like,
     # and no dtype is passed don't change the value unless we find a
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -12,6 +12,7 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0212,W0231,W0703,W0622
 
+import functools
 import collections
 import itertools
 import sys
@@ -25,19 +26,18 @@
 from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
                                 _default_index, _maybe_upcast, _is_sequence,
                                 _infer_dtype_from_scalar, _values_from_object,
-                                is_list_like)
+                                is_list_like, _get_dtype)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (_maybe_droplevels,
                                   _convert_to_index_sliceable,
-                                  _check_bool_indexer, _maybe_convert_indices)
+                                  _check_bool_indexer)
 from pandas.core.internals import (BlockManager,
                                    create_block_manager_from_arrays,
                                    create_block_manager_from_blocks)
 from pandas.core.series import Series
 import pandas.computation.expressions as expressions
 from pandas.computation.eval import eval as _eval
-from pandas.computation.scope import _ensure_scope
 from numpy import percentile as _quantile
 from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
                           OrderedDict, raise_with_traceback)
@@ -1867,6 +1867,118 @@ def eval(self, expr, **kwargs):
         kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
         return _eval(expr, **kwargs)
 
+    def select_dtypes(self, include=None, exclude=None):
+        """Return a subset of a DataFrame including/excluding columns based on
+        their ``dtype``.
+
+        Parameters
+        ----------
+        include, exclude : list-like
+            A list of dtypes or strings to be included/excluded. You must pass
+            in a non-empty sequence for at least one of these.
+
+        Raises
+        ------
+        ValueError
+            * If both of ``include`` and ``exclude`` are empty
+            * If ``include`` and ``exclude`` have overlapping elements
+        TypeError
+            * If either of ``include`` or ``exclude`` is not a sequence
+            * If any kind of string dtype is passed in.
+
+        Returns
+        -------
+        subset : DataFrame
+            The subset of the frame including the dtypes in ``include`` and
+            excluding the dtypes in ``exclude``.
+
+        Notes
+        -----
+        * To select all *numeric* types use the numpy dtype ``numpy.number``
+        * To select strings you must use the ``object`` dtype, but note that
+          this will return *all* object dtype columns
+        * See the `numpy dtype hierarchy
+          <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'),
+        ...                    'b': [True, False] * 3,
+        ...                    'c': [1.0, 2.0] * 3})
+        >>> df
+                a      b  c
+        0  0.3962   True  1
+        1  0.1459  False  2
+        2  0.2623   True  1
+        3  0.0764  False  2
+        4 -0.9703   True  1
+        5 -1.2094  False  2
+        >>> df.select_dtypes(include=['float64'])
+           c
+        0  1
+        1  2
+        2  1
+        3  2
+        4  1
+        5  2
+        >>> df.select_dtypes(exclude=['floating'])
+               b
+        0   True
+        1  False
+        2   True
+        3  False
+        4   True
+        5  False
+        """
+        include, exclude = include or (), exclude or ()
+        if not (com.is_list_like(include) and com.is_list_like(exclude)):
+            raise TypeError('include and exclude must both be non-string'
+                            ' sequences')
+        selection = tuple(map(frozenset, (include, exclude)))
+
+        if not any(selection):
+            raise ValueError('at least one of include or exclude must be '
+                             'nonempty')
+
+        # convert the myriad valid dtypes object to a single representation
+        include, exclude = map(lambda x:
+                               frozenset(map(com._get_dtype_from_object, x)),
+                               selection)
+        for dtypes in (include, exclude):
+            com._invalidate_string_dtypes(dtypes)
+
+        # can't both include AND exclude!
+        if not include.isdisjoint(exclude):
+            raise ValueError('include and exclude overlap on %s'
+                             % (include & exclude))
+
+        def is_selected(column_type):
+            # an empty ``include`` admits every column and an empty
+            # ``exclude`` rejects none (both being empty was ruled out above)
+            is_subtype = functools.partial(issubclass, column_type)
+            if include and not any(map(is_subtype, include)):
+                return False
+            return not any(map(is_subtype, exclude))
+
+        # build the mask by position rather than by column label, so that
+        # duplicated column labels with different dtypes do not overwrite
+        # each other's flag
+        keep = [is_selected(dtype.type) for dtype in self.dtypes]
+        dtype_indexer = Series(keep, index=self.columns, dtype=bool)
+        return self.loc[com._get_info_slice(self, dtype_indexer)]
+
+
     def _box_item_values(self, key, values):
         items = self.columns[self.columns.get_loc(key)]
         if values.ndim == 2:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py

Original file line number	Diff line number	Diff line change
`@@ -91,6 +91,8 @@ Enhancements`
`91`	`91`
`92`	`92`
`93`	`93`
	`94`	+- Add :meth:`~pandas.DataFrame.select_dtypes` method to allow selection of
	`95`	+ columns based on dtype (:issue:`7316`). See :ref:`the docs <basics.selectdtypes>`.
`94`	`96`
`95`	`97`
`96`	`98`