Skip to content

Commit 72dea2a

Browse files
skritsotalakis, Stelios Kritsotalakis, dangotbanned
authored
feat: str.split (#1932) (#2054)
--------- Co-authored-by: Stelios Kritsotalakis <kstelios@DESKTOP-D65QO0G> Co-authored-by: Dan Redding <[email protected]>
1 parent 62f8a1f commit 72dea2a

File tree

13 files changed

+193
-0
lines changed

13 files changed

+193
-0
lines changed

docs/api-reference/expr_str.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
- replace
1212
- replace_all
1313
- slice
14+
- split
1415
- starts_with
1516
- strip_chars
1617
- tail

docs/api-reference/series_str.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
- replace
1212
- replace_all
1313
- slice
14+
- split
1415
- starts_with
1516
- strip_chars
1617
- tail

narwhals/_arrow/expr_str.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowExpr:
6767
self._compliant_expr, "str", "slice", offset=offset, length=length
6868
)
6969

70+
def split(self: Self, by: str) -> ArrowExpr:
    """Split each string value of the expression by `by`.

    The work is forwarded to the `split` method of the series-level `str`
    namespace through `reuse_series_namespace_implementation`.

    Arguments:
        by: Separator passed through to the series implementation.

    Returns:
        A new expression wrapping the delegated call.
    """
    namespace, method = "str", "split"
    return reuse_series_namespace_implementation(
        self._compliant_expr, namespace, method, by=by
    )
74+
7075
def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002
7176
return reuse_series_namespace_implementation(
7277
self._compliant_expr, "str", "to_datetime", format=format

narwhals/_arrow/series_str.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowSeries:
7272
)
7373
)
7474

75+
def split(self: Self, by: str) -> ArrowSeries:
    """Split each string value of the series by `by`.

    Applies `pc.split_pattern` to the native series and wraps the result
    back into a compliant series.

    Arguments:
        by: Separator handed to `pc.split_pattern`.

    Returns:
        A new series built from the split result.
    """
    series = self._compliant_series
    return series._from_native_series(
        pc.split_pattern(series._native_series, by)  # type: ignore[call-overload]
    )
78+
7579
def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002
7680
native = self._compliant_series._native_series
7781
format = parse_datetime_format(native) if format is None else format

narwhals/_dask/expr_str.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ def slice(self: Self, offset: int, length: int | None) -> DaskExpr:
8181
length=length,
8282
)
8383

84+
def split(self: Self, by: str) -> DaskExpr:
    """Split each string value of the expression by `by`.

    Registers a call named "split" that invokes `.str.split(pat=by)` on the
    input it receives.

    Arguments:
        by: Separator forwarded as the `pat` argument.

    Returns:
        A new expression created by `_from_call`.
    """
    compliant = self._compliant_expr
    return compliant._from_call(
        lambda _input, by: _input.str.split(pat=by), "split", by=by
    )
90+
8491
def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002
8592
return self._compliant_expr._from_call(
8693
lambda _input, format: dd.to_datetime(_input, format=format), # noqa: A006

narwhals/_duckdb/expr_str.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ def func(_input: duckdb.Expression) -> duckdb.Expression:
5454

5555
return self._compliant_expr._from_call(func, "slice")
5656

57+
def split(self: Self, by: str) -> DuckDBExpr:
    """Split each string value of the expression by `by`.

    Builds a `str_split` function expression whose second argument is the
    separator wrapped in `lit`.

    Arguments:
        by: Separator passed to `str_split` as a literal.

    Returns:
        A new expression created by `_from_call`.
    """

    def func(_input: duckdb.Expression) -> duckdb.Expression:
        return FunctionExpression("str_split", _input, lit(by))

    return self._compliant_expr._from_call(func, "split")
62+
5763
def len_chars(self: Self) -> DuckDBExpr:
5864
return self._compliant_expr._from_call(
5965
lambda _input: FunctionExpression("length", _input), "len_chars"

narwhals/_pandas_like/expr_str.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeExpr:
9393
self._compliant_expr, "str", "slice", offset=offset, length=length
9494
)
9595

96+
def split(self: Self, by: str) -> PandasLikeExpr:
    """Split each string value of the expression by `by`.

    The work is forwarded to the `split` method of the series-level `str`
    namespace through `reuse_series_namespace_implementation`.

    Arguments:
        by: Separator passed through to the series implementation.

    Returns:
        A new expression wrapping the delegated call.
    """
    namespace, method = "str", "split"
    return reuse_series_namespace_implementation(
        self._compliant_expr, namespace, method, by=by
    )
100+
96101
def to_datetime(self: Self, format: str | None) -> PandasLikeExpr: # noqa: A002
97102
return reuse_series_namespace_implementation(
98103
self._compliant_expr,

narwhals/_pandas_like/series_str.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
from typing import TYPE_CHECKING
44

5+
from narwhals._pandas_like.utils import get_dtype_backend
56
from narwhals._pandas_like.utils import to_datetime
7+
from narwhals.utils import Implementation
68

79
if TYPE_CHECKING:
810
from typing_extensions import Self
@@ -61,6 +63,27 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeSeries:
6163
self._compliant_series._native_series.str.slice(start=offset, stop=stop),
6264
)
6365

66+
def split(self: Self, by: str) -> PandasLikeSeries:
    """Split each string value of the series by `by`.

    For every implementation except cuDF, the series dtype must resolve to
    the "pyarrow" dtype backend; otherwise the operation is rejected.

    Arguments:
        by: Separator forwarded as the `pat` argument of `.str.split`.

    Returns:
        A new series built from the native `.str.split` result.

    Raises:
        TypeError: If the implementation is not cuDF and the dtype backend
            is not "pyarrow".
    """
    series = self._compliant_series
    native = series._native_series
    implementation = series._implementation

    if (
        implementation is not Implementation.CUDF
        and get_dtype_backend(native.dtype, implementation) != "pyarrow"
    ):  # pragma: no cover
        msg = (
            "This operation requires a pyarrow-backed series. "
            "Please refer to https://narwhals-dev.github.io/narwhals/api-reference/narwhals/#narwhals.maybe_convert_dtypes "
            "and ensure you are using dtype_backend='pyarrow'. "
            "Additionally, make sure you have pandas version 1.5+ and pyarrow installed. "
        )
        raise TypeError(msg)

    return series._from_native_series(native.str.split(pat=by))
86+
6487
def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002
6588
return self._compliant_series._from_native_series(
6689
to_datetime(self._compliant_series._implementation)(

narwhals/_spark_like/expr_str.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ def func(_input: Column) -> Column:
9090

9191
return self._compliant_expr._from_call(func, "slice")
9292

93+
def split(self: Self, by: str) -> SparkLikeExpr:
    """Split each string value of the expression by the literal substring `by`.

    Arguments:
        by: Substring to split by. It is matched literally, not as a
            regular expression.

    Returns:
        A new expression created by `_from_call`.
    """
    import re

    # `F.split` interprets its pattern as a (Java) regular expression, so a
    # separator such as "." or "|" would otherwise be treated as a regex
    # metacharacter. Escaping keeps the substring semantics of the public API.
    pattern = re.escape(by)

    def func(_input: Column) -> Column:
        return self._compliant_expr._F.split(_input, pattern)

    return self._compliant_expr._from_call(func, "split")
98+
9399
def to_uppercase(self: Self) -> SparkLikeExpr:
94100
return self._compliant_expr._from_call(
95101
self._compliant_expr._F.upper, "to_uppercase"

narwhals/expr_str.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,40 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT:
271271
self._expr._metadata,
272272
)
273273

274+
def split(self: Self, by: str) -> ExprT:
    r"""Split the string values of an expression by a substring.

    Arguments:
        by: Substring to split by.

    Returns:
        A new expression.

    Examples:
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_native = pl.DataFrame({"s": ["foo bar", "foo_bar"]})
        >>> df = nw.from_native(df_native)
        >>> df.with_columns(nw.col("s").str.split("_").alias("s_split"))
        ┌────────────────────────────┐
        |     Narwhals DataFrame     |
        |----------------------------|
        |shape: (2, 2)               |
        |┌─────────┬────────────────┐|
        |│ s       ┆ s_split        │|
        |│ ---     ┆ ---            │|
        |│ str     ┆ list[str]      │|
        |╞═════════╪════════════════╡|
        |│ foo bar ┆ ["foo bar"]    │|
        |│ foo_bar ┆ ["foo", "bar"] │|
        |└─────────┴────────────────┘|
        └────────────────────────────┘
    """
    # Build a new expression of the same class, carrying over the metadata
    # and deferring the backend-specific `str.split` call.
    expr_type = self._expr.__class__
    metadata = self._expr._metadata
    return expr_type(
        lambda plx: self._expr._to_compliant_expr(plx).str.split(by=by),
        metadata,
    )
307+
274308
def head(self: Self, n: int = 5) -> ExprT:
275309
r"""Take the first n elements of each string.
276310

0 commit comments

Comments
 (0)