Skip to content

Commit 3860a11

Browse files
Merge remote-tracking branch 'upstream/main' into string-option-enable-env
2 parents 4c3042d + d6c9941 commit 3860a11

File tree

82 files changed

+386
-214
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+386
-214
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
304304
-i "pandas.api.types.is_re PR07,SA01" \
305305
-i "pandas.api.types.is_re_compilable PR07,SA01" \
306306
-i "pandas.api.types.is_sparse SA01" \
307-
-i "pandas.api.types.is_string_dtype SA01" \
308307
-i "pandas.api.types.is_timedelta64_ns_dtype SA01" \
309308
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
310309
-i "pandas.api.types.union_categoricals RT03,SA01" \

doc/source/getting_started/comparison/comparison_with_sql.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,7 @@ DELETE
505505
DELETE FROM tips
506506
WHERE tip > 9;
507507
508-
In pandas we select the rows that should remain instead of deleting them:
508+
In pandas we select the rows that should remain instead of deleting the rows that should be removed:
509509

510510
.. ipython:: python
511511

doc/source/user_guide/pyarrow.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,11 @@ PyArrow also provides IO reading functionality that has been integrated into sev
159159
functions provide an ``engine`` keyword that can dispatch to PyArrow to accelerate reading from an IO source.
160160

161161
* :func:`read_csv`
162+
* :func:`read_feather`
162163
* :func:`read_json`
163164
* :func:`read_orc`
164-
* :func:`read_feather`
165+
* :func:`read_parquet`
166+
* :func:`read_table` (experimental)
165167

166168
.. ipython:: python
167169

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ I/O
583583
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
584584
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
585585
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
586+
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
586587

587588
Period
588589
^^^^^^

pandas/core/apply.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -486,20 +486,14 @@ def compute_dict_like(
486486
cols = df[key]
487487

488488
if cols.ndim == 1:
489-
series_list = [obj._gotitem(key, ndim=1, subset=cols)]
489+
series = obj._gotitem(key, ndim=1, subset=cols)
490+
results.append(getattr(series, op_name)(how, **kwargs))
491+
keys.append(key)
490492
else:
491-
series_list = []
492-
for index in range(cols.shape[1]):
493-
col = cols.iloc[:, index]
494-
493+
for _, col in cols.items():
495494
series = obj._gotitem(key, ndim=1, subset=col)
496-
series_list.append(series)
497-
498-
for series in series_list:
499-
result = getattr(series, op_name)(how, **kwargs)
500-
results.append(result)
501-
keys.append(key)
502-
495+
results.append(getattr(series, op_name)(how, **kwargs))
496+
keys.append(key)
503497
else:
504498
results = [
505499
getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)

pandas/core/dtypes/common.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,11 @@ def is_string_dtype(arr_or_dtype) -> bool:
558558
boolean
559559
Whether or not the array or dtype is of the string dtype.
560560
561+
See Also
562+
--------
563+
api.types.is_string_dtype : Check whether the provided array or dtype
564+
is of the string dtype.
565+
561566
Examples
562567
--------
563568
>>> from pandas.api.types import is_string_dtype

pandas/core/frame.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ class DataFrame(NDFrame, OpsMixin):
531531
will perform column selection instead.
532532
dtype : dtype, default None
533533
Data type to force. Only a single dtype is allowed. If None, infer.
534+
If ``data`` is a DataFrame, then ``dtype`` is ignored.
534535
copy : bool or None, default None
535536
Copy data from inputs.
536537
For dict data, the default of None behaves like ``copy=True``. For DataFrame

pandas/core/series.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc]
256256
Data type for the output Series. If not specified, this will be
257257
inferred from `data`.
258258
See the :ref:`user guide <basics.dtypes>` for more usages.
259+
If ``data`` is a Series, then ``dtype`` is ignored.
259260
name : Hashable, default None
260261
The name to give to the Series.
261262
copy : bool, default False

pandas/io/stata.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,19 @@ def __init__(self) -> None:
983983
np.float64(struct.unpack("<d", float64_max)[0]),
984984
),
985985
}
986+
self.OLD_VALID_RANGE = {
987+
"b": (-128, 126),
988+
"h": (-32768, 32766),
989+
"l": (-2147483648, 2147483646),
990+
"f": (
991+
np.float32(struct.unpack("<f", float32_min)[0]),
992+
np.float32(struct.unpack("<f", float32_max)[0]),
993+
),
994+
"d": (
995+
np.float64(struct.unpack("<d", float64_min)[0]),
996+
np.float64(struct.unpack("<d", float64_max)[0]),
997+
),
998+
}
986999

9871000
self.OLD_TYPE_MAPPING = {
9881001
98: 251, # byte
@@ -994,7 +1007,7 @@ def __init__(self) -> None:
9941007

9951008
# These missing values are the generic '.' in Stata, and are used
9961009
# to replace nans
997-
self.MISSING_VALUES = {
1010+
self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = {
9981011
"b": 101,
9991012
"h": 32741,
10001013
"l": 2147483621,
@@ -1808,11 +1821,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18081821
replacements = {}
18091822
for i in range(len(data.columns)):
18101823
fmt = self._typlist[i]
1811-
if fmt not in self.VALID_RANGE:
1812-
continue
1824+
if self._format_version <= 111:
1825+
if fmt not in self.OLD_VALID_RANGE:
1826+
continue
18131827

1814-
fmt = cast(str, fmt) # only strs in VALID_RANGE
1815-
nmin, nmax = self.VALID_RANGE[fmt]
1828+
fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE
1829+
nmin, nmax = self.OLD_VALID_RANGE[fmt]
1830+
else:
1831+
if fmt not in self.VALID_RANGE:
1832+
continue
1833+
1834+
fmt = cast(str, fmt) # only strs in VALID_RANGE
1835+
nmin, nmax = self.VALID_RANGE[fmt]
18161836
series = data.iloc[:, i]
18171837

18181838
# appreciably faster to do this with ndarray instead of Series
@@ -1827,7 +1847,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18271847
umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
18281848
replacement = Series(series, dtype=object)
18291849
for j, um in enumerate(umissing):
1830-
missing_value = StataMissingValue(um)
1850+
if self._format_version <= 111:
1851+
missing_value = StataMissingValue(
1852+
float(self.MISSING_VALUES[fmt])
1853+
)
1854+
else:
1855+
missing_value = StataMissingValue(um)
18311856

18321857
loc = missing_loc[umissing_loc == j]
18331858
replacement.iloc[loc] = missing_value

pandas/tests/apply/test_frame_apply.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -368,18 +368,18 @@ def test_apply_mixed_dtype_corner():
368368
result = df[:0].apply(np.mean, axis=1)
369369
# the result here is actually kind of ambiguous, should it be a Series
370370
# or a DataFrame?
371-
expected = Series(np.nan, index=pd.Index([], dtype="int64"))
371+
expected = Series(dtype=np.float64)
372372
tm.assert_series_equal(result, expected)
373373

374374

375375
def test_apply_mixed_dtype_corner_indexing():
376376
df = DataFrame({"A": ["foo"], "B": [1.0]})
377377
result = df.apply(lambda x: x["A"], axis=1)
378-
expected = Series(["foo"], index=[0])
378+
expected = Series(["foo"], index=range(1))
379379
tm.assert_series_equal(result, expected)
380380

381381
result = df.apply(lambda x: x["B"], axis=1)
382-
expected = Series([1.0], index=[0])
382+
expected = Series([1.0], index=range(1))
383383
tm.assert_series_equal(result, expected)
384384

385385

@@ -1037,7 +1037,7 @@ def test_result_type(int_frame_const_col):
10371037

10381038
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
10391039
expected = df.copy()
1040-
expected.columns = [0, 1, 2]
1040+
expected.columns = range(3)
10411041
tm.assert_frame_equal(result, expected)
10421042

10431043

@@ -1047,7 +1047,7 @@ def test_result_type_shorter_list(int_frame_const_col):
10471047
df = int_frame_const_col
10481048
result = df.apply(lambda x: [1, 2], axis=1, result_type="expand")
10491049
expected = df[["A", "B"]].copy()
1050-
expected.columns = [0, 1]
1050+
expected.columns = range(2)
10511051
tm.assert_frame_equal(result, expected)
10521052

10531053

0 commit comments

Comments
 (0)