
Commit 5ebc246

feat: add to_string method to SparkLikeExprDateTimeNamespace (#1842)
* feat: add `to_string` method to Spark `Expr.dt`
* refactor(spark): centralize datetime format conversion
* fix: raise on selectors addition (#1854)
* avoid checking datetime objects
* less hacky workaround
* fix get_native_namespace test

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
Co-authored-by: FBruzzesi <[email protected]>
1 parent c7255b8 commit 5ebc246

File tree

5 files changed: +126 -65 lines changed

narwhals/_spark_like/expr_dt.py

Lines changed: 72 additions & 1 deletion
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING

 from narwhals._duration import parse_interval_string
-from narwhals._spark_like.utils import UNITS_DICT
+from narwhals._spark_like.utils import UNITS_DICT, strptime_to_pyspark_format

 if TYPE_CHECKING:
     from sqlframe.base.column import Column
@@ -15,6 +15,40 @@ class SparkLikeExprDateTimeNamespace:
     def __init__(self, expr: SparkLikeExpr) -> None:
         self._compliant_expr = expr

+    def to_string(self, format: str) -> SparkLikeExpr:
+        F = self._compliant_expr._F  # noqa: N806
+
+        def _to_string(_input: Column) -> Column:
+            # Handle special formats
+            if format == "%G-W%V":
+                return self._format_iso_week(_input)
+            if format == "%G-W%V-%u":
+                return self._format_iso_week_with_day(_input)
+
+            format_, suffix = self._format_microseconds(_input, format)
+
+            # Convert Python format to PySpark format
+            pyspark_fmt = strptime_to_pyspark_format(format_)
+
+            result = F.date_format(_input, pyspark_fmt)
+            if "T" in format_:
+                # `strptime_to_pyspark_format` replaces "T" with " " since pyspark
+                # does not support the literal "T" in `date_format`.
+                # If no other spaces are in the given format, then we can revert this
+                # operation, otherwise we raise an exception.
+                if " " not in format_:
+                    result = F.replace(result, F.lit(" "), F.lit("T"))
+                else:  # pragma: no cover
+                    msg = (
+                        "`dt.to_string` with a format that contains both spaces and "
+                        "the literal 'T' is not supported for spark-like backends."
+                    )
+                    raise NotImplementedError(msg)
+
+            return F.concat(result, *suffix)
+
+        return self._compliant_expr._with_callable(_to_string)
+
     def date(self) -> SparkLikeExpr:
         return self._compliant_expr._with_callable(self._compliant_expr._F.to_date)

@@ -89,3 +123,40 @@ def replace_time_zone(self, time_zone: str | None) -> SparkLikeExpr:
         else:  # pragma: no cover
             msg = "`replace_time_zone` with non-null `time_zone` not yet implemented for spark-like"
             raise NotImplementedError(msg)
+
+    def _format_iso_week_with_day(self, _input: Column) -> Column:
+        """Format datetime as an ISO week string with day, e.g. 2020-W02-4."""
+        F = self._compliant_expr._F  # noqa: N806
+
+        year = F.date_format(_input, "yyyy")
+        week = F.lpad(F.weekofyear(_input).cast("string"), 2, "0")
+        day = F.dayofweek(_input)
+        # Spark's dayofweek is 1 (Sunday) through 7 (Saturday); shift to ISO's
+        # 1 (Monday) through 7 (Sunday) by moving Sunday from 1 to 7.
+        day = F.when(day == 1, 7).otherwise(day - 1)
+        return F.concat(year, F.lit("-W"), week, F.lit("-"), day.cast("string"))
+
+    def _format_iso_week(self, _input: Column) -> Column:
+        """Format datetime as an ISO week string, e.g. 2020-W02."""
+        F = self._compliant_expr._F  # noqa: N806
+
+        year = F.date_format(_input, "yyyy")
+        week = F.lpad(F.weekofyear(_input).cast("string"), 2, "0")
+        return F.concat(year, F.lit("-W"), week)
+
+    def _format_microseconds(
+        self, _input: Column, format: str
+    ) -> tuple[str, tuple[Column, ...]]:
+        """Split a trailing microseconds specifier off the format; no-op otherwise."""
+        F = self._compliant_expr._F  # noqa: N806
+
+        suffix: tuple[Column, ...]
+        if format.endswith((".%f", "%.f")):
+            import re
+
+            micros = F.unix_micros(_input) % 1_000_000
+            micros_str = F.lpad(micros.cast("string"), 6, "0")
+            suffix = (F.lit("."), micros_str)
+            format_ = re.sub(r"(.%|%.)f$", "", format)
+            return format_, suffix
+
+        return format, ()
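
For a sense of how the new namespace method surfaces to users, here is a minimal usage sketch. The SparkSession setup and sample data are illustrative assumptions, not part of the commit:

# Hypothetical usage of the new `dt.to_string` on the pyspark backend.
# Requires narwhals and pyspark; the session config and data are made up.
from datetime import datetime

import narwhals as nw
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
native = spark.createDataFrame([(datetime(2020, 1, 9, 12, 34, 56),)], ["a"])

result = nw.from_native(native).select(
    iso_week=nw.col("a").dt.to_string("%G-W%V"),          # "2020-W02"
    iso_week_day=nw.col("a").dt.to_string("%G-W%V-%u"),   # "2020-W02-4" (Thursday)
    stamp=nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S"),  # "T" restored after formatting
)
print(result.to_native().collect())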

narwhals/_spark_like/expr_str.py

Lines changed: 1 addition & 31 deletions
@@ -3,6 +3,7 @@
 from functools import partial
 from typing import TYPE_CHECKING

+from narwhals._spark_like.utils import strptime_to_pyspark_format
 from narwhals.utils import _is_naive_format

 if TYPE_CHECKING:
@@ -112,34 +113,3 @@ def to_datetime(self, format: str | None) -> SparkLikeExpr:
         return self._compliant_expr._with_callable(
             lambda expr: function(F.replace(expr, F.lit("T"), F.lit(" ")))
         )
-
-
-def strptime_to_pyspark_format(format: str) -> str:
-    """Converts a Python strptime datetime format string to a PySpark datetime format string."""
-    # Mapping from Python strptime format to PySpark format
-
-    # see https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-    # and https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
-    format_mapping = {
-        "%Y": "y",  # Year with century
-        "%y": "y",  # Year without century
-        "%m": "M",  # Month
-        "%d": "d",  # Day of the month
-        "%H": "H",  # Hour (24-hour clock) 0-23
-        "%I": "h",  # Hour (12-hour clock) 1-12
-        "%M": "m",  # Minute
-        "%S": "s",  # Second
-        "%f": "S",  # Microseconds -> Milliseconds
-        "%p": "a",  # AM/PM
-        "%a": "E",  # Abbreviated weekday name
-        "%A": "E",  # Full weekday name
-        "%j": "D",  # Day of the year
-        "%z": "Z",  # Timezone offset
-        "%s": "X",  # Unix timestamp
-    }
-
-    # Replace Python format specifiers with PySpark specifiers
-    pyspark_format = format
-    for py_format, spark_format in format_mapping.items():
-        pyspark_format = pyspark_format.replace(py_format, spark_format)
-    return pyspark_format.replace("T", " ")
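
The "T"-to-space replacement retained in `to_datetime` above mirrors what the (now centralized) format converter does. A small pure-Python sketch of that normalization, with illustrative values:

# Sketch only: string-level analogue of F.replace(expr, F.lit("T"), F.lit(" ")).
# The input value and pattern below are illustrative.
iso_input = "2020-01-09T12:34:56"
normalized = iso_input.replace("T", " ")  # what F.replace does column-wise
pattern = "yyyy-MM-dd HH:mm:ss"  # strptime_to_pyspark_format("%Y-%m-%dT%H:%M:%S")
assert normalized == "2020-01-09 12:34:56"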

narwhals/_spark_like/utils.py

Lines changed: 41 additions & 1 deletion
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from importlib import import_module
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence, overload

 from narwhals.exceptions import UnsupportedDTypeError
 from narwhals.utils import Implementation, isinstance_or_issubclass
@@ -33,6 +33,26 @@
     "ns": "nanosecond",
 }

+# see https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+# and https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
+DATETIME_PATTERNS_MAPPING = {
+    "%Y": "yyyy",  # Year with century (4 digits)
+    "%y": "yy",  # Year without century (2 digits)
+    "%m": "MM",  # Month (01-12)
+    "%d": "dd",  # Day of the month (01-31)
+    "%H": "HH",  # Hour (24-hour clock) (00-23)
+    "%I": "hh",  # Hour (12-hour clock) (01-12)
+    "%M": "mm",  # Minute (00-59)
+    "%S": "ss",  # Second (00-59)
+    "%f": "S",  # Microseconds -> Milliseconds
+    "%p": "a",  # AM/PM
+    "%a": "E",  # Abbreviated weekday name
+    "%A": "E",  # Full weekday name
+    "%j": "D",  # Day of the year
+    "%z": "Z",  # Timezone offset
+    "%s": "X",  # Unix timestamp
+}
+

 class WindowInputs:
     __slots__ = ("expr", "order_by", "partition_by")
@@ -250,3 +270,23 @@ def import_window(implementation: Implementation, /) -> type[Any]:
     return import_module(
         f"sqlframe.{_BaseSession().execution_dialect_name}.window"
     ).Window
+
+
+@overload
+def strptime_to_pyspark_format(format: None) -> None: ...
+
+
+@overload
+def strptime_to_pyspark_format(format: str) -> str: ...
+
+
+def strptime_to_pyspark_format(format: str | None) -> str | None:
+    """Converts a Python strptime datetime format string to a PySpark datetime format string."""
+    if format is None:  # pragma: no cover
+        return None
+
+    # Replace Python format specifiers with PySpark specifiers
+    pyspark_format = format
+    for py_format, spark_format in DATETIME_PATTERNS_MAPPING.items():
+        pyspark_format = pyspark_format.replace(py_format, spark_format)
+    return pyspark_format.replace("T", " ")
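
A quick sketch of the helper's expected behavior, derived from DATETIME_PATTERNS_MAPPING above (the assertions are illustrative, not the project's tests):

# Expected input/output pairs for strptime_to_pyspark_format.
from narwhals._spark_like.utils import strptime_to_pyspark_format

assert strptime_to_pyspark_format("%Y-%m-%d") == "yyyy-MM-dd"
# "T" is always swapped for a space; `dt.to_string` restores it afterwards
# when the format contains no other spaces.
assert strptime_to_pyspark_format("%Y-%m-%dT%H:%M:%S") == "yyyy-MM-dd HH:mm:ss"
assert strptime_to_pyspark_format(None) is None  # None overload passes through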

tests/expr_and_series/dt/replace_time_zone_test.py

Lines changed: 1 addition & 6 deletions
@@ -48,19 +48,14 @@ def test_replace_time_zone(
     assert_equal_data(result_str, expected)


-def test_replace_time_zone_none(
-    constructor: Constructor, request: pytest.FixtureRequest
-) -> None:
+def test_replace_time_zone_none(constructor: Constructor) -> None:
     if (
         ("pyarrow" in str(constructor) and is_windows())
         or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,))
         or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,))
         or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,))
     ):
         pytest.skip()
-    if any(x in str(constructor) for x in ("pyspark",)):
-        # pyspark: needs `to_string`
-        request.applymarker(pytest.mark.xfail)
     data = {
         "a": [
             datetime(2020, 1, 1, tzinfo=timezone.utc),
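
With `to_string` now implemented, the pyspark xfail above can be removed. A schematic of the round-trip this implies, shown with pandas so the snippet runs anywhere (the real test iterates over every constructor; this is not its literal body):

# Illustrative sketch: replace_time_zone(None) drops the tzinfo, and the
# newly added dt.to_string renders the result for comparison.
from datetime import datetime, timezone

import pandas as pd
import narwhals as nw

df = pd.DataFrame({"a": [datetime(2020, 1, 1, tzinfo=timezone.utc)]})
result = nw.from_native(df).select(
    nw.col("a").dt.replace_time_zone(None).dt.to_string("%Y-%m-%dT%H:%M:%S")
)
print(result.to_native())  # 2020-01-01T00:00:00, tz information removed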

tests/expr_and_series/dt/to_string_test.py

Lines changed: 11 additions & 26 deletions
@@ -39,11 +39,7 @@ def test_dt_to_string_series(constructor_eager: ConstructorEager, fmt: str) -> N
     "fmt", ["%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%G-W%V-%u", "%G-W%V"]
 )
 @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
-def test_dt_to_string_expr(
-    constructor: Constructor, fmt: str, request: pytest.FixtureRequest
-) -> None:
-    if "pyspark" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_dt_to_string_expr(constructor: Constructor, fmt: str) -> None:
     input_frame = nw.from_native(constructor(data))

     expected_col = [datetime.strftime(d, fmt) for d in data["a"]]
@@ -115,23 +111,19 @@ def test_dt_to_string_iso_local_datetime_expr(
     expected: str,
     request: pytest.FixtureRequest,
 ) -> None:
-    if (
-        ("pyspark" in str(constructor))
-        or "duckdb" in str(constructor)
-        or "ibis" in str(constructor)
-    ):
+    if "duckdb" in str(constructor) or "ibis" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = constructor({"a": [data]})

-    result = nw.from_native(df).with_columns(
-        _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f")).alias("b")
+    result = nw.from_native(df).select(
+        b=_clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S.%f"))
     )
-    assert_equal_data(result, {"a": [data], "b": [_clean_string(expected)]})
+    assert_equal_data(result, {"b": [_clean_string(expected)]})

-    result = nw.from_native(df).with_columns(
-        _clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f")).alias("b")
+    result = nw.from_native(df).select(
+        b=_clean_string_expr(nw.col("a").dt.to_string("%Y-%m-%dT%H:%M:%S%.f"))
     )
-    assert_equal_data(result, {"a": [data], "b": [_clean_string(expected)]})
+    assert_equal_data(result, {"b": [_clean_string(expected)]})


 @pytest.mark.parametrize(("data", "expected"), [(datetime(2020, 1, 9), "2020-01-09")])
@@ -147,15 +139,8 @@ def test_dt_to_string_iso_local_date_series(
 @pytest.mark.parametrize(("data", "expected"), [(datetime(2020, 1, 9), "2020-01-09")])
 @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows")
 def test_dt_to_string_iso_local_date_expr(
-    constructor: Constructor,
-    data: datetime,
-    expected: str,
-    request: pytest.FixtureRequest,
+    constructor: Constructor, data: datetime, expected: str
 ) -> None:
-    if "pyspark" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
     df = constructor({"a": [data]})
-    result = nw.from_native(df).with_columns(
-        nw.col("a").dt.to_string("%Y-%m-%d").alias("b")
-    )
-    assert_equal_data(result, {"a": [data], "b": [expected]})
+    result = nw.from_native(df).select(b=nw.col("a").dt.to_string("%Y-%m-%d"))
+    assert_equal_data(result, {"b": [expected]})
