Skip to content

Commit 196792c

Browse files
Merge branch 'main' into fix_docstring_validation_errors
2 parents 6244320 + 9c776ae commit 196792c

File tree

12 files changed

+78
-45
lines changed

12 files changed

+78
-45
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -158,26 +158,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
158158
-i "pandas.Series.sparse.sp_values SA01" \
159159
-i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
160160
-i "pandas.Series.std PR01,RT03,SA01" \
161-
-i "pandas.Series.str.capitalize RT03" \
162-
-i "pandas.Series.str.casefold RT03" \
163-
-i "pandas.Series.str.center RT03,SA01" \
164-
-i "pandas.Series.str.decode PR07,RT03,SA01" \
165-
-i "pandas.Series.str.encode PR07,RT03,SA01" \
166-
-i "pandas.Series.str.ljust RT03,SA01" \
167-
-i "pandas.Series.str.lower RT03" \
168161
-i "pandas.Series.str.lstrip RT03" \
169162
-i "pandas.Series.str.match RT03" \
170163
-i "pandas.Series.str.normalize RT03,SA01" \
171164
-i "pandas.Series.str.partition RT03" \
172165
-i "pandas.Series.str.repeat SA01" \
173166
-i "pandas.Series.str.replace SA01" \
174-
-i "pandas.Series.str.rjust RT03,SA01" \
175167
-i "pandas.Series.str.rpartition RT03" \
176168
-i "pandas.Series.str.rstrip RT03" \
177169
-i "pandas.Series.str.strip RT03" \
178-
-i "pandas.Series.str.swapcase RT03" \
179-
-i "pandas.Series.str.title RT03" \
180-
-i "pandas.Series.str.upper RT03" \
181170
-i "pandas.Series.str.wrap RT03,SA01" \
182171
-i "pandas.Series.str.zfill RT03" \
183172
-i "pandas.Series.struct.dtypes SA01" \

doc/source/development/contributing_docstring.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ backticks. The following are considered inline code:
142142
143143
With several mistakes in the docstring.
144144
145-
It has a blank like after the signature ``def func():``.
145+
It has a blank line after the signature ``def func():``.
146146
147147
The text 'Some function' should go in the line after the
148148
opening quotes of the docstring, not in the same line.

pandas/core/arrays/string_arrow.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
130130

131131
def __init__(self, values) -> None:
132132
_chk_pyarrow_available()
133-
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
134-
values.type
133+
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
134+
pa.types.is_string(values.type)
135+
or (
136+
pa.types.is_dictionary(values.type)
137+
and (
138+
pa.types.is_string(values.type.value_type)
139+
or pa.types.is_large_string(values.type.value_type)
140+
)
141+
)
135142
):
136143
values = pc.cast(values, pa.large_string())
137144

138145
super().__init__(values)
139146
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
140147

141-
if not pa.types.is_large_string(self._pa_array.type) and not (
142-
pa.types.is_dictionary(self._pa_array.type)
143-
and pa.types.is_large_string(self._pa_array.type.value_type)
144-
):
148+
if not pa.types.is_large_string(self._pa_array.type):
145149
raise ValueError(
146150
"ArrowStringArray requires a PyArrow (chunked) array of "
147151
"large_string type"

pandas/core/dtypes/cast.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,8 @@ def convert_dtypes(
10231023
-------
10241024
np.dtype, or ExtensionDtype
10251025
"""
1026+
from pandas.core.arrays.string_ import StringDtype
1027+
10261028
inferred_dtype: str | DtypeObj
10271029

10281030
if (
@@ -1101,6 +1103,13 @@ def convert_dtypes(
11011103
# If we couldn't do anything else, then we retain the dtype
11021104
inferred_dtype = input_array.dtype
11031105

1106+
elif (
1107+
convert_string
1108+
and isinstance(input_array.dtype, StringDtype)
1109+
and input_array.dtype.na_value is np.nan
1110+
):
1111+
inferred_dtype = pandas_dtype_func("string")
1112+
11041113
else:
11051114
inferred_dtype = input_array.dtype
11061115

pandas/core/internals/blocks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,11 @@ def convert(self) -> list[Block]:
512512
convert_non_numeric=True,
513513
)
514514
refs = None
515-
if res_values is values:
515+
if (
516+
res_values is values
517+
or isinstance(res_values, NumpyExtensionArray)
518+
and res_values._ndarray is values
519+
):
516520
refs = self.refs
517521

518522
res_values = ensure_block_shape(res_values, self.ndim)

pandas/core/strings/accessor.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1749,6 +1749,18 @@ def pad(
17491749
Returns
17501750
-------
17511751
Series/Index of objects.
1752+
A Series or Index where the strings are modified by :meth:`str.%(method)s`.
1753+
1754+
See Also
1755+
--------
1756+
Series.str.rjust : Fills the left side of strings with an arbitrary
1757+
character.
1758+
Series.str.ljust : Fills the right side of strings with an arbitrary
1759+
character.
1760+
Series.str.center : Fills both sides of strings with an arbitrary
1761+
character.
1762+
Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1763+
characters.
17521764
17531765
Examples
17541766
--------
@@ -2024,11 +2036,19 @@ def decode(self, encoding, errors: str = "strict"):
20242036
Parameters
20252037
----------
20262038
encoding : str
2039+
Specifies the encoding to be used.
20272040
errors : str, optional
2041+
Specifies the error handling scheme.
2042+
Possible values are those supported by :meth:`bytes.decode`.
20282043
20292044
Returns
20302045
-------
20312046
Series or Index
2047+
A Series or Index with decoded strings.
2048+
2049+
See Also
2050+
--------
2051+
Series.str.encode : Encodes strings into bytes in a Series/Index.
20322052
20332053
Examples
20342054
--------
@@ -2063,11 +2083,19 @@ def encode(self, encoding, errors: str = "strict"):
20632083
Parameters
20642084
----------
20652085
encoding : str
2086+
Specifies the encoding to be used.
20662087
errors : str, optional
2088+
Specifies the error handling scheme.
2089+
Possible values are those supported by :meth:`str.encode`.
20672090
20682091
Returns
20692092
-------
20702093
Series/Index of objects
2094+
A Series or Index with strings encoded into bytes.
2095+
2096+
See Also
2097+
--------
2098+
Series.str.decode : Decodes bytes into strings in a Series/Index.
20712099
20722100
Examples
20732101
--------
@@ -3209,7 +3237,8 @@ def len(self):
32093237
32103238
Returns
32113239
-------
3212-
Series or Index of object
3240+
Series or Index of objects
3241+
A Series or Index where the strings are modified by :meth:`str.%(method)s`.
32133242
32143243
See Also
32153244
--------

pandas/io/parsers/readers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,18 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
268268
Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
269269
nrows : int, optional
270270
Number of rows of file to read. Useful for reading pieces of large files.
271+
Refers to the number of data rows in the returned DataFrame, excluding:
272+
273+
* The header row containing column names.
274+
* Rows before the header row, if ``header=1`` or larger.
275+
276+
Example usage:
277+
278+
* To read the first 999,999 (non-header) rows:
279+
``read_csv(..., nrows=999999)``
280+
281+
* To read rows 1,000,000 through 1,999,999:
282+
``read_csv(..., skiprows=1000000, nrows=999999)``
271283
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
272284
Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
273285
per-column ``NA`` values. By default the following values are interpreted as

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
8888
ArrowStringArray(arr)
8989

9090

91-
@pytest.mark.xfail(
92-
reason="dict conversion does not seem to be implemented for large string in arrow"
93-
)
91+
@pytest.mark.parametrize("string_type", ["string", "large_string"])
9492
@pytest.mark.parametrize("chunked", [True, False])
95-
def test_constructor_valid_string_type_value_dictionary(chunked):
93+
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
9694
pa = pytest.importorskip("pyarrow")
9795

98-
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
96+
arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
9997
if chunked:
10098
arr = pa.chunked_array(arr)
10199

102100
arr = ArrowStringArray(arr)
103-
assert pa.types.is_string(arr._pa_array.type.value_type)
101+
# dictionary type gets converted to a dense large string array
102+
assert pa.types.is_large_string(arr._pa_array.type)
104103

105104

106105
def test_constructor_from_list():

pandas/tests/frame/methods/test_convert_dtypes.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,15 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
import pandas._testing as tm
108

119

1210
class TestConvertDtypes:
13-
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1511
@pytest.mark.parametrize(
1612
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1713
)
18-
def test_convert_dtypes(
19-
self, convert_integer, expected, string_storage, using_infer_string
20-
):
14+
def test_convert_dtypes(self, convert_integer, expected, string_storage):
2115
# Specific types are tested in tests/series/test_dtypes.py
2216
# Just check that it works for DataFrame here
2317
df = pd.DataFrame(
@@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self):
182176
result = expected.convert_dtypes(dtype_backend="pyarrow")
183177
tm.assert_series_equal(result, expected)
184178

185-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
186179
def test_convert_dtypes_avoid_block_splitting(self):
187180
# GH#55341
188181
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
@@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self):
197190
tm.assert_frame_equal(result, expected)
198191
assert result._mgr.nblocks == 2
199192

200-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
201193
def test_convert_dtypes_from_arrow(self):
202194
# GH#56581
203195
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers):
463463
tm.assert_frame_equal(result, expected)
464464

465465

466-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
467466
def test_dtype_backend_string(all_parsers, string_storage):
468467
# GH#36712
469468
pa = pytest.importorskip("pyarrow")
@@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
507506
tm.assert_frame_equal(result, expected)
508507

509508

510-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
511509
def test_dtype_backend_pyarrow(all_parsers, request):
512510
# GH#36712
513511
pa = pytest.importorskip("pyarrow")

0 commit comments

Comments
 (0)