12 changes: 12 additions & 0 deletions doc/source/reference/io.rst
@@ -25,6 +25,18 @@ Flat file
    DataFrame.to_csv
    read_fwf
 
+.. currentmodule:: pandas.io.parsers
+
+.. autosummary::
+   :toctree: api/
+
+   TextFileReader
+   TextFileReader.read
+   TextFileReader.get_chunk
+   TextFileReader.close
+
+.. currentmodule:: pandas
+
 Clipboard
 ~~~~~~~~~
 .. autosummary::
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -14,6 +14,11 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+Documentation improvements
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- :class:`pandas.io.parsers.TextFileReader` class and its methods are now fully documented in the API reference, making it easier to discover how to work with chunked reading of CSV files (:issue:`62574`)
+
 .. _whatsnew_300.enhancements.string_dtype:
 
 Dedicated string data type by default
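For context, the entries above document the object returned by ``pd.read_csv`` when ``chunksize`` (or ``iterator=True``) is passed. A minimal sketch of that workflow; the filename ``large.csv`` is illustrative and not part of the change:

import pandas as pd

# read_csv with chunksize returns a pandas.io.parsers.TextFileReader
# instead of a DataFrame; it is both an iterator and a context manager.
# "large.csv" is an illustrative filename, not taken from the PR.
total_rows = 0
with pd.read_csv("large.csv", chunksize=10_000) as reader:
    first = reader.get_chunk()   # explicitly pull the next chunk (a DataFrame)
    total_rows += len(first)
    for chunk in reader:         # iterate over the remaining chunks
        total_rows += len(chunk)
print(total_rows)                # row count, computed without loading the whole file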
6 changes: 6 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pandas/_typing.py
@@ -84,7 +84,7 @@
 # numpy compatible types
 NumpyValueArrayLike: TypeAlias = ScalarLike_co | npt.ArrayLike
 # Name "npt._ArrayLikeInt_co" is not defined [name-defined]
-NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None  # type: ignore[name-defined]
+NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None
 
 
 P = ParamSpec("P")
2 changes: 1 addition & 1 deletion pandas/core/array_algos/quantile.py
@@ -102,7 +102,7 @@ def quantile_with_mask(
             interpolation=interpolation,
         )
 
-        result = np.asarray(result)  # type: ignore[assignment]
+        result = np.asarray(result)
         result = result.T
 
     return result
7 changes: 2 additions & 5 deletions pandas/core/arrays/_mixins.py
@@ -146,11 +146,8 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:

             dt64_values = arr.view(dtype)
             return DatetimeArray._simple_new(dt64_values, dtype=dtype)
-        elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
-            from pandas.core.arrays import TimedeltaArray
-
-            td64_values = arr.view(dtype)
-            return TimedeltaArray._simple_new(td64_values, dtype=dtype)
         elif isinstance(dtype, ExtensionDtype):
             raise NotImplementedError(f"view not implemented for {dtype}")
         return arr.view(dtype=dtype)
 
     def take(
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/_arrow_utils.py
@@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask(
         mask = pyarrow.BooleanArray.from_buffers(
             pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
         )
-        mask = np.asarray(mask)  # type: ignore[assignment]
+        mask = np.asarray(mask)
     else:
         mask = np.ones(len(arr), dtype=bool)
     return data, mask
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
@@ -804,7 +804,7 @@ def _add_offset(self, offset: BaseOffset) -> Self:
         try:
             res_values = offset._apply_array(values._ndarray)
             if res_values.dtype.kind == "i":
-                res_values = res_values.view(values.dtype)
+                res_values = res_values.view(values._ndarray.dtype)
         except NotImplementedError:
             if get_option("performance_warnings"):
                 warnings.warn(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/nanops.py
@@ -898,7 +898,7 @@ def _get_counts_nanvar(
         # error: Incompatible types in assignment (expression has type
         # "float", variable has type "Union[floating[Any], ndarray[Any,
         # dtype[floating[Any]]]]")
-        count = np.nan  # type: ignore[assignment]
+        count = np.nan
         d = np.nan
     else:
         # count is not narrowed by is_float check
226 changes: 175 additions & 51 deletions pandas/core/reshape/encoding.py
@@ -51,53 +51,117 @@ def get_dummies(
"""
Convert categorical variable into dummy/indicator variables.

Each variable is converted in as many 0/1 variables as there are different
values. Columns in the output are each named after a value; if the input is
a DataFrame, the name of the original variable is prepended to the value.
This function converts categorical data into binary (0/1) data, also known as
one-hot encoding or dummy variables. It's commonly used in statistical modeling
and machine learning. For more details, see the :ref:`reshaping` section in
the user guide.

Parameters
----------
data : array-like, Series, or DataFrame
Data of which to get dummy indicators.
Data to encode. If a DataFrame, can work on multiple columns.
prefix : str, list of str, or dict of str, default None
A string to be prepended to DataFrame column names.
Pass a list with length equal to the number of columns
when calling get_dummies on a DataFrame. Alternatively, `prefix`
can be a dictionary mapping column names to prefixes.
String to prepend to column names.
* If a string, the same prefix is used for all columns
* If a list, it should have the same length as the number of columns
* If a dict, maps column names to prefixes
* If None, no prefix is used
prefix_sep : str, list of str, or dict of str, default '_'
Should you choose to prepend DataFrame column names with a prefix, this
is the separator/delimiter to use between the two. Alternatively,
`prefix_sep` can be a list with length equal to the number of columns,
or a dictionary mapping column names to separators.
Separator between prefix and dummy column names.
* If a string, the same separator is used for all columns
* If a list, should have same length as number of columns
* If a dict, maps column names to separators
dummy_na : bool, default False
If True, a NaN indicator column will be added even if no NaN values are present.
If False, NA values are encoded as all zero.
Add a column to indicate NaN values:
* If True, creates NA column even if no NaN values present
* If False, NA values are encoded as all zeros
columns : list-like, default None
Column names in the DataFrame to be encoded.
If `columns` is None then all the columns with
`object`, `string`, or `category` dtype will be converted.
Which columns to encode:
* If None, encodes all object, string, and category columns
* If list-like, encodes only specified columns
sparse : bool, default False
Whether the dummy-encoded columns should be backed by
a :class:`SparseArray` (True) or a regular NumPy array (False).
If True, return SparseArray (save memory for many zeros)
If False, return regular NumPy array
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
Whether to drop the first category level:
* If True, drops first level (avoid collinearity in models)
* If False, keeps all levels
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.
Data type for dummy columns. Must be a single dtype.

Returns
-------
DataFrame
Dummy-coded data. If `data` contains other columns than the
dummy-coded one(s), these will be prepended, unaltered, to the result.
Dummy-coded data:
* Each categorical level becomes a new column of 1s and 0s
* Original non-encoded columns are included unchanged
* Each new column name combines the original column name,
prefix_sep, and the encoded level

See Also
--------
Series.str.get_dummies : Convert Series of strings to dummy codes.
:func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
Series.str.get_dummies : Convert string Series to dummy codes.
from_dummies : Convert dummy codes back to categorical DataFrame.
DataFrame.astype : Convert dtypes of DataFrame columns.
pandas.Categorical : Represent categorical data.

Notes
-----
* Dummy variables are commonly used in statistical models and machine
learning to convert categorical data into a format suitable for
numerical processing.
* The `drop_first` option is useful for avoiding the "dummy variable trap"
where perfectly correlated dummy variables can cause problems in
regression models.
* For sparse matrices, using `sparse=True` can significantly reduce
memory usage when data has many zeros.

Examples
--------
Basic usage for a single column:

>>> s = pd.Series(list('abca'))
>>> pd.get_dummies(s)
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0

With NaN values:

>>> s = pd.Series(list('abcaa'))
>>> s.loc[3] = np.nan
>>> pd.get_dummies(s, dummy_na=True)
a b c NaN
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0

With `drop_first=True`:

>>> pd.get_dummies(s, drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0

With DataFrame input and custom prefixes:

>>> df = pd.DataFrame({
... 'A': ['a', 'b', 'a'],
... 'B': ['b', 'a', 'c'],
... 'C': [1, 2, 3]
... })
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 1 0 0 1 0
1 2 0 1 1 0 0
2 3 1 0 0 0 1
Reference :ref:`the user guide <reshaping.dummies>` for more examples.

Examples
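A side note on the ``sparse`` parameter documented above: a minimal sketch of the memory effect, with an illustrative Series that is not part of the change:

import pandas as pd

# Each dummy column is mostly False here, which is the case where
# SparseArray-backed columns pay off. Illustrative data, not from the PR.
s = pd.Series(list("abcdefghij") * 1_000)  # 10,000 rows, 10 categories

dense = pd.get_dummies(s)                # regular NumPy-backed bool columns
sparse = pd.get_dummies(s, sparse=True)  # SparseArray-backed columns

# Sparse columns store only the positions of the True entries, so the
# deep memory footprint should be noticeably smaller for this data.
print(dense.memory_usage(deep=True).sum())
print(sparse.memory_usage(deep=True).sum())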
@@ -372,48 +436,108 @@ def from_dummies(
"""
Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

Inverts the operation performed by :func:`~pandas.get_dummies`.
This function converts dummy/indicator variables (typically 0s and 1s) back into
categorical variables, essentially inverting :func:`~pandas.get_dummies`. For
more details, see the :ref:`reshaping` section in the user guide.

.. versionadded:: 1.5.0

Parameters
----------
data : DataFrame
Data which contains dummy-coded variables in form of integer columns of
1's and 0's.
DataFrame containing dummy-coded variables (columns of 0s and 1s).
Each group of dummy columns represents one original categorical variable.
sep : str, default None
Separator used in the column names of the dummy categories they are
character indicating the separation of the categorical names from the prefixes.
For example, if your column names are 'prefix_A' and 'prefix_B',
you can strip the underscore by specifying sep='_'.
Separator used in the dummy column names between the prefix and category.
For example, if columns are 'color_red', 'color_blue', use sep='_' to
identify 'color' as the original variable name and ['red', 'blue'] as
the categories.
default_category : None, Hashable or dict of Hashables, default None
The default category is the implied category when a value has none of the
listed categories specified with a one, i.e. if all dummies in a row are
zero. Can be a single value for all variables or a dict directly mapping
the default categories to a prefix of a variable. The default category
will be coerced to the dtype of ``data.columns`` if such coercion is
lossless, and will raise otherwise.
Category to use for rows where all dummy values are 0:
* If None, raises error when a row has all zeros
* If Hashable, uses this value for all variables
* If dict, maps each variable prefix to its default category
Value will be coerced to match column dtype if possible.

Returns
-------
DataFrame
Categorical data decoded from the dummy input-data.
A DataFrame with categorical columns decoded from dummy variables.
Each group of dummy columns is converted back to a single
categorical column.

See Also
--------
get_dummies : Convert categorical variable(s) to dummy variables.
Categorical : Pandas Categorical type for categorical data.
DataFrame.astype : Convert DataFrame columns to different types.

Notes
-----
* The function assumes each group of dummy columns represents one
original categorical variable
* Column names must follow the pattern: prefix + sep + category
* Each row should have at most one 1 in each group of dummies
(unless using default_category)
* NA values are not allowed in the dummy columns

Examples
--------
Convert dummy columns back to a single categorical column:

>>> df = pd.DataFrame({
... 'color_red': [1, 0, 0],
... 'color_blue': [0, 1, 0],
... 'color_green': [0, 0, 1],
... 'size': [1, 2, 3]
... })
>>> pd.from_dummies(df, sep='_')
size color
0 1 red
1 2 blue
2 3 green

With a default category for rows of all zeros:

>>> df = pd.DataFrame({
... 'color_red': [1, 0, 0, 0],
... 'color_blue': [0, 1, 0, 0],
... 'color_green': [0, 0, 1, 0]
... })
>>> pd.from_dummies(df, sep='_', default_category='unknown')
color
0 red
1 blue
2 green
3 unknown

With different defaults for different variables:

>>> df = pd.DataFrame({
... 'color_red': [1, 0, 0],
... 'color_blue': [0, 0, 0],
... 'size_S': [0, 1, 0],
... 'size_M': [0, 0, 0]
... })
>>> defaults = {'color': 'unknown', 'size': 'L'}
>>> pd.from_dummies(df, sep='_', default_category=defaults)
color size
0 red L
1 unknown S
2 unknown L

Raises
------
ValueError
* When the input ``DataFrame`` ``data`` contains NA values.
* When the input ``DataFrame`` ``data`` contains column names with separators
that do not match the separator specified with ``sep``.
* When a ``dict`` passed to ``default_category`` does not include an implied
category for each prefix.
* When a value in ``data`` has more than one category assigned to it.
* When ``default_category=None`` and a value in ``data`` has no category
assigned to it.
* When input contains NA values
* When column names don't match the sep pattern
* When default_category dict is missing categories
* When a row has multiple 1s in one dummy group
* When a row has all 0s and no default_category
TypeError
* When the input ``data`` is not of type ``DataFrame``.
* When the input ``DataFrame`` ``data`` contains non-dummy data.
* When the passed ``sep`` is of a wrong data type.
* When input is not a DataFrame
* When columns don't contain dummy data
* When sep is not a string
* When the passed ``default_category`` is of a wrong data type.

See Also
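The ``Raises`` conditions above are easiest to see in action; a minimal sketch of the two most common ``ValueError`` cases, with illustrative data that is not part of the change:

import pandas as pd

# Two 1s within one dummy group: from_dummies cannot pick a single category.
multi = pd.DataFrame({"color_red": [1, 1], "color_blue": [0, 1]})
try:
    pd.from_dummies(multi, sep="_")
except ValueError as err:
    print(err)  # multi-assignment error

# An all-zeros row with default_category=None is also rejected.
zeros = pd.DataFrame({"color_red": [1, 0], "color_blue": [0, 0]})
try:
    pd.from_dummies(zeros, sep="_")
except ValueError as err:
    print(err)  # no category assigned and no default given

# Supplying default_category resolves the all-zeros row instead of raising.
print(pd.from_dummies(zeros, sep="_", default_category="unknown"))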