From 7b43a938f8e1b1f8597bf9e63911335cd31dc0a0 Mon Sep 17 00:00:00 2001 From: pan-vlados Date: Wed, 18 Dec 2024 04:17:41 +0300 Subject: [PATCH 1/8] GH1074 Add type hint Series[list[str]] for Series.str.split with expand=False --- pandas-stubs/_typing.pyi | 6 ++++-- pandas-stubs/core/series.pyi | 6 +++--- pandas-stubs/core/strings.pyi | 4 ++++ tests/test_frame.py | 6 +++--- tests/test_series.py | 3 +++ 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi index c4ee25c4b..56b83e176 100644 --- a/pandas-stubs/_typing.pyi +++ b/pandas-stubs/_typing.pyi @@ -547,7 +547,8 @@ S1 = TypeVar( | Period | Interval | CategoricalDtype - | BaseOffset, + | BaseOffset + | list[str], ) S2 = TypeVar( @@ -566,7 +567,8 @@ S2 = TypeVar( | Period | Interval | CategoricalDtype - | BaseOffset, + | BaseOffset + | list[str], ) IndexingInt: TypeAlias = ( diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi index 2907cec00..f2bb6d938 100644 --- a/pandas-stubs/core/series.pyi +++ b/pandas-stubs/core/series.pyi @@ -242,15 +242,15 @@ class Series(IndexOpsMixin[S1], NDFrame): copy: bool = ..., ) -> Series[float]: ... @overload - def __new__( # type: ignore[overload-overlap] + def __new__( cls, - data: Sequence[Never], + data: Sequence[str], index: Axes | None = ..., *, dtype: Dtype = ..., name: Hashable = ..., copy: bool = ..., - ) -> Series[Any]: ... + ) -> Series[str]: ... @overload def __new__( cls, diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index 7e0dc880a..522504b4e 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -65,6 +65,10 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM]): self, pat: str = ..., *, n: int = ..., expand: Literal[True], regex: bool = ... ) -> _TS: ... @overload + def split( + self, pat: str = ..., *, n: int = ..., expand: Literal[False], regex: bool = ... + ) -> Series[list[str]]: ... + @overload def split( self, pat: str = ..., *, n: int = ..., expand: bool = ..., regex: bool = ... ) -> T: ... diff --git a/tests/test_frame.py b/tests/test_frame.py index d8cf99f84..8810a06b9 100644 --- a/tests/test_frame.py +++ b/tests/test_frame.py @@ -3767,9 +3767,9 @@ class MyDict(TypedDict): def test_series_empty_dtype() -> None: - """Test for the creation of a Series from an empty list GH571 to map to a Series[Any].""" + """Test for the creation of a Series from an empty list GH571 to map to a Series[str].""" new_tab: Sequence[Never] = [] # need to be typehinted to please mypy - check(assert_type(pd.Series(new_tab), "pd.Series[Any]"), pd.Series) - check(assert_type(pd.Series([]), "pd.Series[Any]"), pd.Series) + check(assert_type(pd.Series(new_tab), "pd.Series[str]"), pd.Series) + check(assert_type(pd.Series([]), "pd.Series[str]"), pd.Series) # ensure that an empty string does not get matched to Sequence[Never] check(assert_type(pd.Series(""), "pd.Series[str]"), pd.Series) diff --git a/tests/test_series.py b/tests/test_series.py index f35fd20aa..ac4d78b2e 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -1556,6 +1556,9 @@ def test_string_accessors(): check(assert_type(s.str.split("a"), pd.Series), pd.Series) # GH 194 check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame) + check( + assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"), pd.Series + ) check(assert_type(s.str.startswith("a"), "pd.Series[bool]"), pd.Series, np.bool_) check( assert_type(s.str.startswith(("a", "b")), "pd.Series[bool]"), From bb3e78f8afea41670f6c2445f1343657c904fbf5 Mon Sep 17 00:00:00 2001 From: pan-vlados Date: Wed, 18 Dec 2024 21:21:37 +0300 Subject: [PATCH 2/8] Updates: - fix Index.str.split method return wrong result; - add test for Index.str.split method with expand=False; - return changes performed in pull request #1029. --- pandas-stubs/core/indexes/base.pyi | 4 +++- pandas-stubs/core/series.pyi | 24 +++++++++++++++++++++++- pandas-stubs/core/strings.pyi | 7 +++++-- tests/test_frame.py | 6 +++--- tests/test_indexes.py | 3 +++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/pandas-stubs/core/indexes/base.pyi b/pandas-stubs/core/indexes/base.pyi index 8f44bc5e3..763aae027 100644 --- a/pandas-stubs/core/indexes/base.pyi +++ b/pandas-stubs/core/indexes/base.pyi @@ -261,7 +261,9 @@ class Index(IndexOpsMixin[S1]): **kwargs, ) -> Self: ... @property - def str(self) -> StringMethods[Self, MultiIndex, np_ndarray_bool]: ... + def str( + self, + ) -> StringMethods[Self, MultiIndex, np_ndarray_bool, Index[list[str]]]: ... def is_(self, other) -> bool: ... def __len__(self) -> int: ... def __array__(self, dtype=...) -> np.ndarray: ... diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi index f2bb6d938..b91d3842b 100644 --- a/pandas-stubs/core/series.pyi +++ b/pandas-stubs/core/series.pyi @@ -242,6 +242,26 @@ class Series(IndexOpsMixin[S1], NDFrame): copy: bool = ..., ) -> Series[float]: ... @overload + def __new__( # type: ignore[overload-overlap] + cls, + data: Sequence[Never], + index: Axes | None = ..., + *, + dtype: Dtype = ..., + name: Hashable = ..., + copy: bool = ..., + ) -> Series[Any]: ... + @overload + def __new__( + cls, + data: Sequence[list[str]], + index: Axes | None = ..., + *, + dtype: Dtype = ..., + name: Hashable = ..., + copy: bool = ..., + ) -> Series[list[str]]: ... + @overload def __new__( cls, data: Sequence[str], @@ -1199,7 +1219,9 @@ class Series(IndexOpsMixin[S1], NDFrame): ) -> Series[S1]: ... def to_period(self, freq: _str | None = ..., copy: _bool = ...) -> DataFrame: ... @property - def str(self) -> StringMethods[Series, DataFrame, Series[bool]]: ... + def str( + self, + ) -> StringMethods[Series, DataFrame, Series[bool], Series[list[str]]]: ... @property def dt(self) -> CombinedDatetimelikeProperties: ... @property diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index 522504b4e..cefe321bb 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -15,6 +15,7 @@ import numpy as np import pandas as pd from pandas import ( DataFrame, + Index, MultiIndex, Series, ) @@ -28,10 +29,12 @@ from pandas._typing import ( # The _TS type is what is used for the result of str.split with expand=True _TS = TypeVar("_TS", DataFrame, MultiIndex) +# The _TS2 type is what is used for the result of str.split with expand=False +_TS2 = TypeVar("_TS2", Series[list[str]], Index[list[str]]) # The _TM type is what is used for the result of str.match _TM = TypeVar("_TM", Series[bool], np_ndarray_bool) -class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM]): +class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]): def __init__(self, data: T) -> None: ... def __getitem__(self, key: slice | int) -> T: ... def __iter__(self) -> T: ... @@ -67,7 +70,7 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM]): @overload def split( self, pat: str = ..., *, n: int = ..., expand: Literal[False], regex: bool = ... - ) -> Series[list[str]]: ... + ) -> _TS2: ... @overload def split( self, pat: str = ..., *, n: int = ..., expand: bool = ..., regex: bool = ... diff --git a/tests/test_frame.py b/tests/test_frame.py index 8810a06b9..d8cf99f84 100644 --- a/tests/test_frame.py +++ b/tests/test_frame.py @@ -3767,9 +3767,9 @@ class MyDict(TypedDict): def test_series_empty_dtype() -> None: - """Test for the creation of a Series from an empty list GH571 to map to a Series[str].""" + """Test for the creation of a Series from an empty list GH571 to map to a Series[Any].""" new_tab: Sequence[Never] = [] # need to be typehinted to please mypy - check(assert_type(pd.Series(new_tab), "pd.Series[str]"), pd.Series) - check(assert_type(pd.Series([]), "pd.Series[str]"), pd.Series) + check(assert_type(pd.Series(new_tab), "pd.Series[Any]"), pd.Series) + check(assert_type(pd.Series([]), "pd.Series[Any]"), pd.Series) # ensure that an empty string does not get matched to Sequence[Never] check(assert_type(pd.Series(""), "pd.Series[str]"), pd.Series) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index aab49c405..fc4a7efb0 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -113,6 +113,9 @@ def test_str_split() -> None: ind = pd.Index(["a-b", "c-d"]) check(assert_type(ind.str.split("-"), "pd.Index[str]"), pd.Index) check(assert_type(ind.str.split("-", expand=True), pd.MultiIndex), pd.MultiIndex) + check( + assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), pd.Index + ) def test_str_match() -> None: From f77583cd44440c16f199659f963069eddf73ac66 Mon Sep 17 00:00:00 2001 From: Vladislav Date: Wed, 18 Dec 2024 22:22:20 +0300 Subject: [PATCH 3/8] Update tests/test_indexes.py Co-authored-by: Irv Lustig --- tests/test_indexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index fc4a7efb0..b4140ab7f 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -114,7 +114,7 @@ def test_str_split() -> None: check(assert_type(ind.str.split("-"), "pd.Index[str]"), pd.Index) check(assert_type(ind.str.split("-", expand=True), pd.MultiIndex), pd.MultiIndex) check( - assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), pd.Index + assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), pd.Index, list ) From d93390d9bec37279612973513a3f035b93c66a08 Mon Sep 17 00:00:00 2001 From: Vladislav Date: Wed, 18 Dec 2024 22:22:33 +0300 Subject: [PATCH 4/8] Update tests/test_series.py Co-authored-by: Irv Lustig --- tests/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_series.py b/tests/test_series.py index ac4d78b2e..df23d5333 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -1553,7 +1553,7 @@ def test_string_accessors(): check(assert_type(s.str.rstrip(), pd.Series), pd.Series) check(assert_type(s.str.slice(0, 4, 2), pd.Series), pd.Series) check(assert_type(s.str.slice_replace(0, 2, "XX"), pd.Series), pd.Series) - check(assert_type(s.str.split("a"), pd.Series), pd.Series) + check(assert_type(s.str.split("a"), "pd.Series[list[str]]"), pd.Series, list) # GH 194 check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame) check( From 577d375a217877419c8d3e26ef9b7036059d37f9 Mon Sep 17 00:00:00 2001 From: Vladislav Date: Wed, 18 Dec 2024 22:22:45 +0300 Subject: [PATCH 5/8] Update tests/test_series.py Co-authored-by: Irv Lustig --- tests/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_series.py b/tests/test_series.py index df23d5333..d2bf9e348 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -1557,7 +1557,7 @@ def test_string_accessors(): # GH 194 check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame) check( - assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"), pd.Series + assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"), pd.Series, list ) check(assert_type(s.str.startswith("a"), "pd.Series[bool]"), pd.Series, np.bool_) check( From e46842a18222bdcd4c312975bebf69fb6e07fb74 Mon Sep 17 00:00:00 2001 From: pan-vlados Date: Wed, 18 Dec 2024 22:26:33 +0300 Subject: [PATCH 6/8] Updates: - combine two str.split overloads and keep only _TS and _TS2; - fix test_indexes.py test for test_str_split(). --- pandas-stubs/core/strings.pyi | 6 +----- tests/test_indexes.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index cefe321bb..df208ef5e 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -69,13 +69,9 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]): ) -> _TS: ... @overload def split( - self, pat: str = ..., *, n: int = ..., expand: Literal[False], regex: bool = ... + self, pat: str = ..., *, n: int = ..., expand: Literal[False] = ..., regex: bool = ... ) -> _TS2: ... @overload - def split( - self, pat: str = ..., *, n: int = ..., expand: bool = ..., regex: bool = ... - ) -> T: ... - @overload def rsplit(self, pat: str = ..., *, n: int = ..., expand: Literal[True]) -> _TS: ... @overload def rsplit(self, pat: str = ..., *, n: int = ..., expand: bool = ...) -> T: ... diff --git a/tests/test_indexes.py b/tests/test_indexes.py index b4140ab7f..1712e6c28 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -111,7 +111,7 @@ def test_difference_none() -> None: def test_str_split() -> None: # GH 194 ind = pd.Index(["a-b", "c-d"]) - check(assert_type(ind.str.split("-"), "pd.Index[str]"), pd.Index) + check(assert_type(ind.str.split("-"), "pd.Index[list[str]]"), pd.Index) check(assert_type(ind.str.split("-", expand=True), pd.MultiIndex), pd.MultiIndex) check( assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), pd.Index, list From 7ba4a4571a05a278cb63913b8307650908d3eded Mon Sep 17 00:00:00 2001 From: pan-vlados Date: Wed, 18 Dec 2024 22:32:14 +0300 Subject: [PATCH 7/8] pre-commit fixes --- pandas-stubs/core/strings.pyi | 7 ++++++- tests/test_indexes.py | 4 +++- tests/test_series.py | 4 +++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index df208ef5e..4ea794a5d 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -69,7 +69,12 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]): ) -> _TS: ... @overload def split( - self, pat: str = ..., *, n: int = ..., expand: Literal[False] = ..., regex: bool = ... + self, + pat: str = ..., + *, + n: int = ..., + expand: Literal[False] = ..., + regex: bool = ..., ) -> _TS2: ... @overload def rsplit(self, pat: str = ..., *, n: int = ..., expand: Literal[True]) -> _TS: ... diff --git a/tests/test_indexes.py b/tests/test_indexes.py index 1712e6c28..a70c45228 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -114,7 +114,9 @@ def test_str_split() -> None: check(assert_type(ind.str.split("-"), "pd.Index[list[str]]"), pd.Index) check(assert_type(ind.str.split("-", expand=True), pd.MultiIndex), pd.MultiIndex) check( - assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), pd.Index, list + assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), + pd.Index, + list, ) diff --git a/tests/test_series.py b/tests/test_series.py index d2bf9e348..41dbaadfd 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -1557,7 +1557,9 @@ def test_string_accessors(): # GH 194 check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame) check( - assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"), pd.Series, list + assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"), + pd.Series, + list, ) check(assert_type(s.str.startswith("a"), "pd.Series[bool]"), pd.Series, np.bool_) check( From 667cef321f338603eb1f77e073919eecf973f7c0 Mon Sep 17 00:00:00 2001 From: pan-vlados Date: Wed, 18 Dec 2024 23:00:55 +0300 Subject: [PATCH 8/8] Add type hints and tests for str.rsplit() for expand=False --- pandas-stubs/core/strings.pyi | 4 +++- tests/test_indexes.py | 14 +++++++++++++- tests/test_series.py | 7 ++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index 4ea794a5d..b952ced0d 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -79,7 +79,9 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]): @overload def rsplit(self, pat: str = ..., *, n: int = ..., expand: Literal[True]) -> _TS: ... @overload - def rsplit(self, pat: str = ..., *, n: int = ..., expand: bool = ...) -> T: ... + def rsplit( + self, pat: str = ..., *, n: int = ..., expand: Literal[False] = ... + ) -> _TS2: ... @overload def partition(self, sep: str = ...) -> pd.DataFrame: ... @overload diff --git a/tests/test_indexes.py b/tests/test_indexes.py index a70c45228..468908ad5 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -111,7 +111,7 @@ def test_difference_none() -> None: def test_str_split() -> None: # GH 194 ind = pd.Index(["a-b", "c-d"]) - check(assert_type(ind.str.split("-"), "pd.Index[list[str]]"), pd.Index) + check(assert_type(ind.str.split("-"), "pd.Index[list[str]]"), pd.Index, list) check(assert_type(ind.str.split("-", expand=True), pd.MultiIndex), pd.MultiIndex) check( assert_type(ind.str.split("-", expand=False), "pd.Index[list[str]]"), @@ -120,6 +120,18 @@ def test_str_split() -> None: ) +def test_str_rsplit() -> None: + # GH 1074 + ind = pd.Index(["a-b", "c-d"]) + check(assert_type(ind.str.rsplit("-"), "pd.Index[list[str]]"), pd.Index, list) + check(assert_type(ind.str.rsplit("-", expand=True), pd.MultiIndex), pd.MultiIndex) + check( + assert_type(ind.str.rsplit("-", expand=False), "pd.Index[list[str]]"), + pd.Index, + list, + ) + + def test_str_match() -> None: i = pd.Index( ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"] diff --git a/tests/test_series.py b/tests/test_series.py index 41dbaadfd..1aca73c50 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -1548,8 +1548,13 @@ def test_string_accessors(): check(assert_type(s.str.rindex("p"), pd.Series), pd.Series) check(assert_type(s.str.rjust(80), pd.Series), pd.Series) check(assert_type(s.str.rpartition("p"), pd.DataFrame), pd.DataFrame) - check(assert_type(s.str.rsplit("a"), pd.Series), pd.Series) + check(assert_type(s.str.rsplit("a"), "pd.Series[list[str]]"), pd.Series, list) check(assert_type(s.str.rsplit("a", expand=True), pd.DataFrame), pd.DataFrame) + check( + assert_type(s.str.rsplit("a", expand=False), "pd.Series[list[str]]"), + pd.Series, + list, + ) check(assert_type(s.str.rstrip(), pd.Series), pd.Series) check(assert_type(s.str.slice(0, 4, 2), pd.Series), pd.Series) check(assert_type(s.str.slice_replace(0, 2, "XX"), pd.Series), pd.Series)