|
12 | 12 | # pylint: disable=E1101,E1103
|
13 | 13 | # pylint: disable=W0212,W0231,W0703,W0622
|
14 | 14 |
|
| 15 | +import functools |
15 | 16 | import collections
|
16 | 17 | import itertools
|
17 | 18 | import sys
|
|
25 | 26 | from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
|
26 | 27 | _default_index, _maybe_upcast, _is_sequence,
|
27 | 28 | _infer_dtype_from_scalar, _values_from_object,
|
28 |
| - is_list_like) |
| 29 | + is_list_like, _get_dtype) |
29 | 30 | from pandas.core.generic import NDFrame, _shared_docs
|
30 | 31 | from pandas.core.index import Index, MultiIndex, _ensure_index
|
31 | 32 | from pandas.core.indexing import (_maybe_droplevels,
|
32 | 33 | _convert_to_index_sliceable,
|
33 |
| - _check_bool_indexer, _maybe_convert_indices) |
| 34 | + _check_bool_indexer) |
34 | 35 | from pandas.core.internals import (BlockManager,
|
35 | 36 | create_block_manager_from_arrays,
|
36 | 37 | create_block_manager_from_blocks)
|
37 | 38 | from pandas.core.series import Series
|
38 | 39 | import pandas.computation.expressions as expressions
|
39 | 40 | from pandas.computation.eval import eval as _eval
|
40 |
| -from pandas.computation.scope import _ensure_scope |
41 | 41 | from numpy import percentile as _quantile
|
42 | 42 | from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
|
43 | 43 | OrderedDict, raise_with_traceback)
|
@@ -1867,6 +1867,118 @@ def eval(self, expr, **kwargs):
|
1867 | 1867 | kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
|
1868 | 1868 | return _eval(expr, **kwargs)
|
1869 | 1869 |
|
def select_dtypes(self, include=None, exclude=None):
    """Pick out the columns of this frame by their ``dtype``.

    A column is kept when its dtype is a subtype of at least one entry of
    ``include`` (if ``include`` is given) and of no entry of ``exclude``
    (if ``exclude`` is given).

    Parameters
    ----------
    include, exclude : list-like
        Dtypes, or strings naming dtypes, to keep / to drop. At least one
        of the two must be a non-empty sequence.

    Raises
    ------
    ValueError
        * If ``include`` and ``exclude`` are both empty
        * If ``include`` and ``exclude`` share any element
        * If any kind of string dtype is passed in.
    TypeError
        * If ``include`` or ``exclude`` is not a sequence

    Returns
    -------
    subset : DataFrame
        The columns whose dtype is covered by ``include`` and not covered
        by ``exclude``.

    Notes
    -----
    * Pass the numpy dtype ``numpy.number`` to select every *numeric*
      column
    * Strings can only be selected through the ``object`` dtype, which
      returns *all* object dtype columns
    * See the `numpy dtype hierarchy
      <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__

    Examples
    --------
    >>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'),
    ...                    'b': [True, False] * 3,
    ...                    'c': [1.0, 2.0] * 3})
    >>> df
            a      b  c
    0  0.3962   True  1
    1  0.1459  False  2
    2  0.2623   True  1
    3  0.0764  False  2
    4 -0.9703   True  1
    5 -1.2094  False  2
    >>> df.select_dtypes(include=['float64'])
       c
    0  1
    1  2
    2  1
    3  2
    4  1
    5  2
    >>> df.select_dtypes(exclude=['floating'])
           b
    0   True
    1  False
    2   True
    3  False
    4   True
    5  False
    """
    # ``or`` maps None (and any other falsy argument) to an empty tuple
    include = include or ()
    exclude = exclude or ()
    if not com.is_list_like(include) or not com.is_list_like(exclude):
        raise TypeError('include and exclude must both be non-string'
                        ' sequences')

    raw_include = frozenset(include)
    raw_exclude = frozenset(exclude)
    if not (raw_include or raw_exclude):
        raise ValueError('at least one of include or exclude must be '
                         'nonempty')

    # normalise every accepted dtype spelling to a single representation
    to_dtype = com._get_dtype_from_object
    include = frozenset(map(to_dtype, raw_include))
    exclude = frozenset(map(to_dtype, raw_exclude))
    com._invalidate_string_dtypes(include)
    com._invalidate_string_dtypes(exclude)

    # a dtype cannot be requested and rejected at the same time
    overlap = include & exclude
    if overlap:
        raise ValueError('include and exclude overlap on %s' % overlap)

    # Each mask starts out all-True when its filter is empty, so an empty
    # filter never removes a column; a non-empty filter starts all-False
    # and is filled in column by column below. Both filters being empty
    # has already been rejected above.
    has_include = bool(include)
    has_exclude = bool(exclude)
    keep_included = Series(not has_include, index=self.columns)
    keep_not_excluded = Series(not has_exclude, index=self.columns)

    for column, dtype in self.dtypes.iteritems():
        # is_subtype_of(t) is True when this column's scalar type derives
        # from t
        is_subtype_of = functools.partial(issubclass, dtype.type)
        if has_include:
            keep_included[column] = any(map(is_subtype_of, include))
        if has_exclude:
            keep_not_excluded[column] = not any(map(is_subtype_of, exclude))

    # keep a column only if it passed both filters
    column_mask = keep_included & keep_not_excluded
    return self.loc[com._get_info_slice(self, column_mask)]
| 1981 | + |
1870 | 1982 | def _box_item_values(self, key, values):
|
1871 | 1983 | items = self.columns[self.columns.get_loc(key)]
|
1872 | 1984 | if values.ndim == 2:
|
|
0 commit comments