Skip to content

Commit e028741

Browse files
authored
fix: Align str.to_titlecase with polars v1.35.0 behavior (#3238)
* fix: Align str.to_titlecase
* Update docstrings
1 parent e860bcc commit e028741

File tree

7 files changed

+12
-65
lines changed

7 files changed

+12
-65
lines changed

narwhals/_duckdb/expr_str.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def to_titlecase(self) -> DuckDBExpr:
3535

3636
def _to_titlecase(expr: Expression) -> Expression:
3737
extract_expr = F(
38-
"regexp_extract_all", F("lower", expr), lit(r"[a-z0-9]*[^a-z0-9]*")
38+
"regexp_extract_all", F("lower", expr), lit(r"[a-z]*[^a-z]*")
3939
)
4040
elem = col("_")
4141
capitalize = lambda_expr(

narwhals/_polars/expr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -393,10 +393,10 @@ class PolarsExprStringNamespace(
393393
def to_titlecase(self) -> PolarsExpr:
394394
native_expr = self.native
395395

396-
if BACKEND_VERSION < (1, 5):
396+
if BACKEND_VERSION < (1, 35):
397397
native_result = (
398398
native_expr.str.to_lowercase()
399-
.str.extract_all(r"[a-z0-9]*[^a-z0-9]*")
399+
.str.extract_all(r"[a-z]*[^a-z]*")
400400
.list.eval(pl.element().str.to_titlecase())
401401
.list.join("")
402402
)

narwhals/_spark_like/expr_str.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def _to_titlecase(expr: Column) -> Column:
5454
F = self.compliant._F
5555
lower_expr = F.lower(expr)
5656
extract_expr = F.regexp_extract_all(
57-
lower_expr, regexp=F.lit(r"[a-z0-9]*[^a-z0-9]*"), idx=0
57+
lower_expr, regexp=F.lit(r"[a-z]*[^a-z]*"), idx=0
5858
)
5959
capitalized_expr = F.transform(extract_expr, f=F.initcap)
6060
return F.array_join(capitalized_expr, delimiter="")

narwhals/expr_str.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -446,21 +446,8 @@ def to_titlecase(self) -> ExprT:
446446
This is a form of case transform where the first letter of each word is
447447
capitalized, with the rest of the word in lowercase.
448448
449-
Warning:
450-
Different backends might follow different rules to determine what a "word" is:
451-
452-
- duckdb, polars and spark-like use non-**alphanumeric** characters to
453-
define the word boundaries.
454-
- pandas-like, pyarrow and dask use non-**alphabetic** characters to define
455-
the word boundaries, matching the behavior of
456-
[`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title).
457-
458-
We can observe the difference with the string `"with123numbers"`:
459-
460-
- non-**alphanumeric** -> `"With123numbers"`
461-
- notice lowercase **n** after the digits
462-
- non-**alphabetic** -> `"With123Numbers"`
463-
- notice uppercase **N** after the digits
449+
Word boundaries are defined by non-**alphabetic** characters, matching the
450+
behavior of [`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title).
464451
465452
Examples:
466453
>>> import polars as pl

narwhals/series_str.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -399,17 +399,8 @@ def to_titlecase(self) -> SeriesT:
399399
This is a form of case transform where the first letter of each word is
400400
capitalized, with the rest of the word in lowercase.
401401
402-
Warning:
403-
Different backends might follow different rules to determine what a "word" is:
404-
405-
- polars uses **non-alphanumeric** characters to define the word boundaries.
406-
- pandas-like and pyarrow use **non-alphabetic** characters to define
407-
the word boundaries, matching the behavior of
408-
[`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title).
409-
410-
As an example of such difference, in the former case the string `"with123numbers"`
411-
is mapped to `"With123numbers"` (notice lowercase **n** after the digits), while
412-
in the latter to `"With123Numbers"` (notice uppercase **N** after the digits).
402+
Word boundaries are defined by non-**alphabetic** characters, matching the
403+
behavior of [`str.title`](https://docs.python.org/3/library/stdtypes.html#str.title).
413404
414405
Examples:
415406
>>> import pyarrow as pa

tests/expr_and_series/str/to_titlecase_test.py

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,7 @@
1515
]
1616
}
1717

18-
expected_non_alphabetic = {
19-
"a": [
20-
"E.T. Phone Home",
21-
"They'Re Bill'S Friends From The Uk",
22-
"To Infinity,And Beyond!",
23-
"With123Numbers",
24-
"__Dunder__Score_A1_.2B ?Three",
25-
]
26-
}
27-
expected_non_alphanumeric = {
28-
"a": [
29-
"E.T. Phone Home",
30-
"They'Re Bill'S Friends From The Uk",
31-
"To Infinity,And Beyond!",
32-
"With123numbers",
33-
"__Dunder__Score_A1_.2b ?Three",
34-
]
35-
}
36-
37-
NON_ALPHANUMERIC_BACKENDS = ("duckdb", "polars", "pyspark")
18+
expected = {"a": [s.title() for s in data["a"]]}
3819

3920

4021
def test_str_to_titlecase_expr(
@@ -47,25 +28,13 @@ def test_str_to_titlecase_expr(
4728
if "ibis" in str(constructor):
4829
request.applymarker(pytest.mark.xfail)
4930

50-
expected = (
51-
expected_non_alphanumeric
52-
if any(x in str(constructor) for x in NON_ALPHANUMERIC_BACKENDS)
53-
else expected_non_alphabetic
54-
)
55-
5631
df = nw.from_native(constructor(data))
5732
result_frame = df.select(nw.col("a").str.to_titlecase())
5833

5934
assert_equal_data(result_frame, expected)
6035

6136

6237
def test_str_to_titlecase_series(constructor_eager: ConstructorEager) -> None:
63-
expected = (
64-
expected_non_alphanumeric
65-
if any(x in str(constructor_eager) for x in NON_ALPHANUMERIC_BACKENDS)
66-
else expected_non_alphabetic
67-
)
68-
6938
df = nw.from_native(constructor_eager(data), eager_only=True)
7039
result_series = df["a"].str.to_titlecase()
7140

tests/utils_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ def func3(
524524

525525

526526
def test_requires() -> None:
527-
class SomeAccesssor:
527+
class SomeAccessor:
528528
_accessor: ClassVar[Accessor] = "str"
529529

530530
def __init__(self, compliant: ProbablyCompliant) -> None:
@@ -565,8 +565,8 @@ def repeat(self, n: int) -> str:
565565
return self.native * n
566566

567567
@property
568-
def str(self) -> SomeAccesssor:
569-
return SomeAccesssor(self)
568+
def str(self) -> SomeAccessor:
569+
return SomeAccessor(self)
570570

571571
v_05 = ProbablyCompliant("123", (0, 5))
572572
v_201 = ProbablyCompliant("123", (2, 0, 1))

0 commit comments

Comments
 (0)