
Commit f2d140d

Merge remote-tracking branch 'upstream/main' into pyarrow-nameerror
2 parents: b17510c + 106f33c

File tree: 11 files changed (+138, -20 lines)

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion

@@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
-        -i "pandas.api.types.is_re_compilable PR07,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \

doc/source/conf.py

Lines changed: 5 additions & 1 deletion

@@ -242,7 +242,6 @@
     "external_links": [],
     "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
-    "twitter_url": "https://twitter.com/pandas_dev",
     "analytics": {
         "plausible_analytics_domain": "pandas.pydata.org",
         "plausible_analytics_url": "https://views.scientific-python.org/js/script.js",
@@ -258,6 +257,11 @@
     # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be)
     "show_version_warning_banner": False,
     "icon_links": [
+        {
+            "name": "X",
+            "url": "https://x.com/pandas_dev",
+            "icon": "fa-brands fa-square-x-twitter",
+        },
         {
             "name": "Mastodon",
             "url": "https://fosstodon.org/@pandas_dev",

doc/source/user_guide/reshaping.rst

Lines changed: 1 addition & 1 deletion

@@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` argument.
 .. image:: ../_static/reshaping_melt.png
 
 The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt`
-are useful to massage a :class:`DataFrame` into a format where one or more columns
+are useful to reshape a :class:`DataFrame` into a format where one or more columns
 are *identifier variables*, while all other columns, considered *measured
 variables*, are "unpivoted" to the row axis, leaving just two non-identifier
 columns, "variable" and "value". The names of those columns can be customized

pandas/_libs/index.pyi

Lines changed: 3 additions & 0 deletions

@@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
 class MaskedUInt8Engine(MaskedIndexEngine): ...
 class MaskedBoolEngine(MaskedUInt8Engine): ...
 
+class StringObjectEngine(ObjectEngine):
+    def __init__(self, values: object, na_value) -> None: ...
+
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
     offsets: np.ndarray  # np.ndarray[..., ndim=1]

pandas/_libs/index.pyx

Lines changed: 25 additions & 0 deletions

@@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine):
             raise KeyError(val)
         return str(val)
 
+
+cdef class StringObjectEngine(ObjectEngine):
+
+    cdef:
+        object na_value
+        bint uses_na
+
+    def __init__(self, ndarray values, na_value):
+        super().__init__(values)
+        self.na_value = na_value
+        self.uses_na = na_value is C_NA
+
+    cdef bint _checknull(self, object val):
+        if self.uses_na:
+            return val is C_NA
+        else:
+            return util.is_nan(val)
+
+    cdef _check_type(self, object val):
+        if isinstance(val, str):
+            return val
+        elif self._checknull(val):
+            return self.na_value
+        else:
+            raise KeyError(val)
 
 
 cdef class DatetimeEngine(Int64Engine):
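
Note (illustration only, not part of the commit): a hedged sketch of the index behaviour the new StringObjectEngine is meant to support, based on the tests added in pandas/tests/indexes/string/test_indexing.py below. It assumes the pd.NA-backed "string" dtype.

import pandas as pd

idx = pd.Index(["a", "b", pd.NA], dtype="string")
print(idx.get_loc("b"))    # 1
print(idx.get_loc(pd.NA))  # 2 -- pd.NA is this dtype's na_value sentinel
try:
    # other null-likes (e.g. float("nan")) are not treated as this dtype's
    # missing value and raise KeyError, per the new tests
    idx.get_loc(float("nan"))
except KeyError:
    print("KeyError")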

pandas/core/dtypes/inference.py

Lines changed: 5 additions & 0 deletions

@@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool:
     Parameters
     ----------
     obj : The object to check
+        The object to check if the object can be compiled into a regex pattern instance.
 
     Returns
     -------
     bool
         Whether `obj` can be compiled as a regex pattern.
 
+    See Also
+    --------
+    api.types.is_re : Check if the object is a regex pattern instance.
+
     Examples
     --------
     >>> from pandas.api.types import is_re_compilable
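
Note (illustration only, not part of the commit): a quick demonstration of the documented function together with api.types.is_re, which the new See Also entry cross-references.

import re
from pandas.api.types import is_re, is_re_compilable

print(is_re_compilable(".*"))     # True: the string compiles into a regex pattern
print(is_re_compilable(1))        # False: cannot be compiled
print(is_re(re.compile(".*")))    # True: already a compiled pattern
print(is_re(".*"))                # False: a plain string, not a compiled pattern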

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion

@@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any:
         return result
 
     @final
-    def squeeze(self, axis: Axis | None = None):
+    def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
         """
         Squeeze 1 dimensional axis objects into scalars.
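
Note (illustration only, not part of the commit): a short sketch of the three return types the new annotation covers.

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(type(df.squeeze()).__name__)         # DataFrame: neither axis has length 1
print(type(df[["a"]].squeeze()).__name__)  # Series: the single column is squeezed away
print(df.loc[[0], ["a"]].squeeze())        # 1 -- scalar, both axes have length 1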

pandas/core/indexes/base.py

Lines changed: 1 addition & 2 deletions

@@ -876,7 +876,7 @@ def _engine(
             # ndarray[Any, Any]]" has no attribute "_ndarray"  [union-attr]
             target_values = self._data._ndarray  # type: ignore[union-attr]
         elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
-            return libindex.StringEngine(target_values)
+            return libindex.StringObjectEngine(target_values, self.dtype.na_value)  # type: ignore[union-attr]
 
         # error: Argument 1 to "ExtensionEngine" has incompatible type
         # "ndarray[Any, Any]"; expected "ExtensionArray"
@@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool:
     def get_indexer_non_unique(
         self, target
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
-        target = ensure_index(target)
         target = self._maybe_cast_listlike_indexer(target)
 
         if not self._should_compare(target) and not self._should_partial_index(target):
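
Note (illustration only, not part of the commit): a hedged sketch of what routing string dtypes through StringObjectEngine means for get_indexer, again following the new tests and assuming the pd.NA-backed "string" dtype.

import pandas as pd

idx = pd.Index(["a", "b", pd.NA], dtype="string")
# the dtype's own na_value sentinel is matched; unknown labels get -1
print(idx.get_indexer(["a", pd.NA, "c"]))  # [ 0  2 -1]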

pandas/core/reshape/melt.py

Lines changed: 2 additions & 2 deletions

@@ -51,9 +51,9 @@ def melt(
     """
     Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
 
-    This function is useful to massage a DataFrame into a format where one
+    This function is useful to reshape a DataFrame into a format where one
     or more columns are identifier variables (`id_vars`), while all other
-    columns, considered measured variables (`value_vars`), are "unpivoted" to
+    columns are considered measured variables (`value_vars`), and are "unpivoted" to
     the row axis, leaving just two non-identifier columns, 'variable' and
     'value'.
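
Note (illustration only, not part of the commit): the reworded docstring in action, with a minimal wide-to-long unpivot.

import pandas as pd

wide = pd.DataFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})
# "id" is the identifier variable (id_vars); "x" and "y" are the measured
# variables (value_vars) that get unpivoted to the row axis.
print(wide.melt(id_vars="id", value_vars=["x", "y"]))
#    id variable  value
# 0   1        x     10
# 1   2        x     20
# 2   1        y     30
# 3   2        y     40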

pandas/tests/indexes/string/test_indexing.py

Lines changed: 93 additions & 11 deletions

@@ -6,6 +6,51 @@
 import pandas._testing as tm
 
 
+def _isnan(val):
+    try:
+        return val is not pd.NA and np.isnan(val)
+    except TypeError:
+        return False
+
+
+class TestGetLoc:
+    def test_get_loc(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        assert index.get_loc("b") == 1
+
+    def test_get_loc_raises(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError, match="d"):
+            index.get_loc("d")
+
+    def test_get_loc_invalid_value(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError, match="1"):
+            index.get_loc(1)
+
+    def test_get_loc_non_unique(self, any_string_dtype):
+        index = Index(["a", "b", "a"], dtype=any_string_dtype)
+        result = index.get_loc("a")
+        expected = np.array([True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError):
+            index.get_loc(nulls_fixture)
+
+    def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
+        index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
+        if any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
+        ):
+            with pytest.raises(KeyError):
+                index.get_loc(nulls_fixture)
+        else:
+            assert index.get_loc(nulls_fixture) == 2
+
+
 class TestGetIndexer:
     @pytest.mark.parametrize(
         "method,expected",
@@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype):
                 ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
             )
 
+    @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
+    def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
+        # NaT and Decimal("NaN") from null_fixture are not supported for string dtype
+        index = Index(["a", "b", null], dtype=any_string_dtype)
+        result = index.get_indexer(["a", null, "c"])
+        if using_infer_string:
+            expected = np.array([0, 2, -1], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            expected = np.array([0, -1, -1], dtype=np.intp)
+        else:
+            expected = np.array([0, 2, -1], dtype=np.intp)
 
-class TestGetIndexerNonUnique:
-    @pytest.mark.xfail(reason="TODO(infer_string)", strict=False)
-    def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture):
-        index = Index(["a", "b", None], dtype=any_string_dtype)
-        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+        tm.assert_numpy_array_equal(result, expected)
 
-        expected_indexer = np.array([2], dtype=np.intp)
-        expected_missing = np.array([], dtype=np.intp)
+
+class TestGetIndexerNonUnique:
+    @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
+    def test_get_indexer_non_unique_nas(
+        self, any_string_dtype, null, using_infer_string
+    ):
+        index = Index(["a", "b", null], dtype=any_string_dtype)
+        indexer, missing = index.get_indexer_non_unique(["a", null])
+
+        if using_infer_string:
+            expected_indexer = np.array([0, 2], dtype=np.intp)
+            expected_missing = np.array([], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            expected_indexer = np.array([0, -1], dtype=np.intp)
+            expected_missing = np.array([1], dtype=np.intp)
+        else:
+            expected_indexer = np.array([0, 2], dtype=np.intp)
+            expected_missing = np.array([], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected_indexer)
         tm.assert_numpy_array_equal(missing, expected_missing)
 
         # actually non-unique
-        index = Index(["a", None, "b", None], dtype=any_string_dtype)
-        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
-
-        expected_indexer = np.array([1, 3], dtype=np.intp)
+        index = Index(["a", null, "b", null], dtype=any_string_dtype)
+        indexer, missing = index.get_indexer_non_unique(["a", null])
+
+        if using_infer_string:
+            expected_indexer = np.array([0, 1, 3], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            pass
+        else:
+            expected_indexer = np.array([0, 1, 3], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected_indexer)
         tm.assert_numpy_array_equal(missing, expected_missing)
