Skip to content

Commit a4c9d4a

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-tests-fixtures
2 parents 6c7a2ca + 73b5578 commit a4c9d4a

File tree

7 files changed

+101
-18
lines changed

7 files changed

+101
-18
lines changed

doc/source/user_guide/style.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@
351351
"\n",
352352
"- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n",
353353
"- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n",
354-
"- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
354+
"- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes: [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
355355
"\n",
356356
"[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n",
357357
"[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n",

pandas/core/base.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,34 @@ def value_counts(
10491049
4.0 1
10501050
NaN 1
10511051
Name: count, dtype: int64
1052+
1053+
**Categorical Dtypes**
1054+
1055+
Rows with a categorical type will be counted as one group
1056+
if they have the same categories and order.
1057+
In the example below, even though ``a``, ``c``, and ``d``
1058+
all have the same data type of ``category``,
1059+
only ``c`` and ``d`` will be counted as one group
1060+
since ``a`` doesn't have the same categories.
1061+
1062+
>>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]})
1063+
>>> df = df.astype({"a": "category", "c": "category", "d": "category"})
1064+
>>> df
1065+
a b c d
1066+
0 1 2 3 3
1067+
1068+
>>> df.dtypes
1069+
a category
1070+
b object
1071+
c category
1072+
d category
1073+
dtype: object
1074+
1075+
>>> df.dtypes.value_counts()
1076+
category 2
1077+
category 1
1078+
object 1
1079+
Name: count, dtype: int64
10521080
"""
10531081
return algorithms.value_counts_internal(
10541082
self,

pandas/core/series.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,14 +1815,30 @@ def _set_name(
18151815
Parrot 30.0
18161816
Parrot 20.0
18171817
Name: Max Speed, dtype: float64
1818+
1819+
We can pass a list of values to group the Series data by custom labels:
1820+
18181821
>>> ser.groupby(["a", "b", "a", "b"]).mean()
18191822
a 210.0
18201823
b 185.0
18211824
Name: Max Speed, dtype: float64
1825+
1826+
Grouping by numeric labels yields similar results:
1827+
1828+
>>> ser.groupby([0, 1, 0, 1]).mean()
1829+
0 210.0
1830+
1 185.0
1831+
Name: Max Speed, dtype: float64
1832+
1833+
We can group by a level of the index:
1834+
18221835
>>> ser.groupby(level=0).mean()
18231836
Falcon 370.0
18241837
Parrot 25.0
18251838
Name: Max Speed, dtype: float64
1839+
1840+
We can group by a condition applied to the Series values:
1841+
18261842
>>> ser.groupby(ser > 100).mean()
18271843
Max Speed
18281844
False 25.0
@@ -1845,11 +1861,16 @@ def _set_name(
18451861
Parrot Captive 30.0
18461862
Wild 20.0
18471863
Name: Max Speed, dtype: float64
1864+
18481865
>>> ser.groupby(level=0).mean()
18491866
Animal
18501867
Falcon 370.0
18511868
Parrot 25.0
18521869
Name: Max Speed, dtype: float64
1870+
1871+
We can also group by the 'Type' level of the hierarchical index
1872+
to get the mean speed for each type:
1873+
18531874
>>> ser.groupby(level="Type").mean()
18541875
Type
18551876
Captive 210.0
@@ -1865,12 +1886,17 @@ def _set_name(
18651886
b 3
18661887
dtype: int64
18671888
1889+
To include `NA` values in the group keys, set `dropna=False`:
1890+
18681891
>>> ser.groupby(level=0, dropna=False).sum()
18691892
a 3
18701893
b 3
18711894
NaN 3
18721895
dtype: int64
18731896
1897+
We can also group by a custom list with NaN values to handle
1898+
missing group labels:
1899+
18741900
>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
18751901
>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
18761902
>>> ser.groupby(["a", "b", "a", np.nan]).mean()

pandas/io/parsers/base_parser.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from copy import copy
55
import csv
66
from enum import Enum
7+
import itertools
78
from typing import (
89
TYPE_CHECKING,
910
Any,
@@ -271,7 +272,7 @@ def _maybe_make_multi_index_columns(
271272

272273
@final
273274
def _make_index(
274-
self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
275+
self, alldata, columns, indexnamerow: list[Scalar] | None = None
275276
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
276277
index: Index | None
277278
if isinstance(self.index_col, list) and len(self.index_col):
@@ -326,7 +327,11 @@ def _agg_index(self, index) -> Index:
326327
converters = self._clean_mapping(self.converters)
327328
clean_dtypes = self._clean_mapping(self.dtype)
328329

329-
for i, arr in enumerate(index):
330+
if self.index_names is not None:
331+
names: Iterable = self.index_names
332+
else:
333+
names = itertools.cycle([None])
334+
for i, (arr, name) in enumerate(zip(index, names)):
330335
if self._should_parse_dates(i):
331336
arr = date_converter(
332337
arr,
@@ -369,12 +374,17 @@ def _agg_index(self, index) -> Index:
369374
arr, _ = self._infer_types(
370375
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
371376
)
372-
arrays.append(arr)
373-
374-
names = self.index_names
375-
index = ensure_index_from_sequences(arrays, names)
377+
if cast_type is not None:
378+
# Don't perform RangeIndex inference
379+
idx = Index(arr, name=name, dtype=cast_type)
380+
else:
381+
idx = ensure_index_from_sequences([arr], [name])
382+
arrays.append(idx)
376383

377-
return index
384+
if len(arrays) == 1:
385+
return arrays[0]
386+
else:
387+
return MultiIndex.from_arrays(arrays)
378388

379389
@final
380390
def _set_noconvert_dtype_columns(
@@ -704,12 +714,11 @@ def _get_empty_meta(
704714
dtype_dict: defaultdict[Hashable, Any]
705715
if not is_dict_like(dtype):
706716
# if dtype == None, default will be object.
707-
default_dtype = dtype or object
708-
dtype_dict = defaultdict(lambda: default_dtype)
717+
dtype_dict = defaultdict(lambda: dtype)
709718
else:
710719
dtype = cast(dict, dtype)
711720
dtype_dict = defaultdict(
712-
lambda: object,
721+
lambda: None,
713722
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
714723
)
715724

@@ -726,8 +735,14 @@ def _get_empty_meta(
726735
if (index_col is None or index_col is False) or index_names is None:
727736
index = default_index(0)
728737
else:
729-
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
730-
index = ensure_index_from_sequences(data, names=index_names)
738+
# TODO: We could return default_index(0) if dtype_dict[name] is None
739+
data = [
740+
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
741+
]
742+
if len(data) == 1:
743+
index = data[0]
744+
else:
745+
index = MultiIndex.from_arrays(data)
731746
index_col.sort()
732747

733748
for i, n in enumerate(index_col):

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ def read(
338338
data = {k: v for k, (i, v) in zip(names, data_tups)}
339339

340340
date_data = self._do_date_conversions(names, data)
341-
index, column_names = self._make_index(date_data, alldata, names)
341+
index, column_names = self._make_index(alldata, names)
342342

343343
return index, column_names, date_data
344344

pandas/io/parsers/python_parser.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -312,9 +312,7 @@ def read(
312312
conv_data = self._convert_data(data)
313313
conv_data = self._do_date_conversions(columns, conv_data)
314314

315-
index, result_columns = self._make_index(
316-
conv_data, alldata, columns, indexnamerow
317-
)
315+
index, result_columns = self._make_index(alldata, columns, indexnamerow)
318316

319317
return index, result_columns, conv_data
320318

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
3030
)
3131

32+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
33+
3234

3335
@pytest.mark.parametrize("dtype", [str, object])
3436
@pytest.mark.parametrize("check_orig", [True, False])
@@ -614,6 +616,7 @@ def test_string_inference_object_dtype(all_parsers, dtype):
614616
tm.assert_frame_equal(result, expected)
615617

616618

619+
@xfail_pyarrow
617620
def test_accurate_parsing_of_large_integers(all_parsers):
618621
# GH#52505
619622
data = """SYMBOL,MOMENT,ID,ID_DEAL
@@ -624,7 +627,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
624627
AMZN,20230301181139587,2023552585717889759,2023552585717263360
625628
MSFT,20230301181139587,2023552585717889863,2023552585717263361
626629
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
627-
orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
630+
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
628631
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
629632
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
630633
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
@@ -646,3 +649,16 @@ def test_dtypes_with_usecols(all_parsers):
646649
values = ["1", "4"]
647650
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
648651
tm.assert_frame_equal(result, expected)
652+
653+
654+
def test_index_col_with_dtype_no_rangeindex(all_parsers):
655+
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
656+
result = all_parsers.read_csv(
657+
data,
658+
header=None,
659+
names=["start", "stop", "bin_id"],
660+
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
661+
index_col="bin_id",
662+
).index
663+
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
664+
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)