From 1c195fe8a752d9e0dde7f32f6ee9a49e8ed7343c Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:01:21 +0100 Subject: [PATCH 1/8] feat: disallow how=horizontal for lazy concat --- narwhals/_compliant/namespace.py | 9 ++++++++- narwhals/_dask/namespace.py | 20 +------------------- narwhals/_duckdb/namespace.py | 5 +---- narwhals/_spark_like/namespace.py | 9 +-------- narwhals/functions.py | 23 +++++++++++++++++------ narwhals/stable/v1/__init__.py | 2 +- tests/frame/concat_test.py | 11 ++++------- 7 files changed, 33 insertions(+), 46 deletions(-) diff --git a/narwhals/_compliant/namespace.py b/narwhals/_compliant/namespace.py index a8c10a5fc5..692050f786 100644 --- a/narwhals/_compliant/namespace.py +++ b/narwhals/_compliant/namespace.py @@ -78,7 +78,7 @@ def concat( self, items: Iterable[CompliantFrameT], *, - how: Literal["horizontal", "vertical", "diagonal"], + how: Literal["vertical", "diagonal"], ) -> CompliantFrameT: ... def when( self, predicate: CompliantExprT @@ -184,3 +184,10 @@ def from_numpy( if is_numpy_array_2d(data): return self._dataframe.from_numpy(data, schema=schema, context=self) return self._series.from_numpy(data, context=self) + + def concat( + self, + items: Iterable[EagerDataFrameT], + *, + how: Literal["horizontal", "vertical", "diagonal"], + ) -> EagerDataFrameT: ... diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 88ef002a24..bd0fdfa27d 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -154,7 +154,7 @@ def concat( self: Self, items: Iterable[DaskLazyFrame], *, - how: Literal["horizontal", "vertical", "diagonal"], + how: Literal["vertical", "diagonal"], ) -> DaskLazyFrame: if not items: msg = "No items to concatenate" # pragma: no cover @@ -178,24 +178,6 @@ def concat( backend_version=self._backend_version, version=self._version, ) - if how == "horizontal": - all_column_names: list[str] = [ - column for frame in dfs for column in frame.columns - ] - if len(all_column_names) != len(set(all_column_names)): # pragma: no cover - duplicates = [ - i for i in all_column_names if all_column_names.count(i) > 1 - ] - msg = ( - f"Columns with name(s): {', '.join(duplicates)} " - "have more than one occurrence" - ) - raise AssertionError(msg) - return DaskLazyFrame( - dd.concat(dfs, axis=1, join="outer"), - backend_version=self._backend_version, - version=self._version, - ) if how == "diagonal": return DaskLazyFrame( dd.concat(dfs, axis=0, join="outer"), diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index cb7ec12ef2..5c8223b06a 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -63,11 +63,8 @@ def concat( self: Self, items: Iterable[DuckDBLazyFrame], *, - how: Literal["horizontal", "vertical", "diagonal"], + how: Literal["vertical", "diagonal"], ) -> DuckDBLazyFrame: - if how == "horizontal": - msg = "horizontal concat not supported for duckdb. Please join instead" - raise TypeError(msg) if how == "diagonal": msg = "Not implemented yet" raise NotImplementedError(msg) diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index c9bd25e43e..aa1c8e583b 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -192,16 +192,9 @@ def concat( self: Self, items: Iterable[SparkLikeLazyFrame], *, - how: Literal["horizontal", "vertical", "diagonal"], + how: Literal["vertical", "diagonal"], ) -> SparkLikeLazyFrame: dfs = [item._native_frame for item in items] - if how == "horizontal": - msg = ( - "Horizontal concatenation is not supported for LazyFrame backed by " - "a PySpark DataFrame." - ) - raise NotImplementedError(msg) - if how == "vertical": cols_0 = dfs[0].columns for i, df in enumerate(dfs[1:], start=1): diff --git a/narwhals/functions.py b/narwhals/functions.py index 56457d3f31..9014ca55ab 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -26,6 +26,7 @@ from narwhals.dependencies import is_numpy_array from narwhals.dependencies import is_numpy_array_2d from narwhals.dependencies import is_pyarrow_table +from narwhals.exceptions import InvalidOperationError from narwhals.expr import Expr from narwhals.series import Series from narwhals.translate import from_native @@ -105,12 +106,13 @@ def concat( - vertical: Concatenate vertically. Column names must match. - horizontal: Concatenate horizontally. If lengths don't match, then - missing rows are filled with null values. + missing rows are filled with null values. This is only supported + when all inputs are (eager) DataFrames. - diagonal: Finds a union between the column schemas and fills missing column values with null. Returns: - A new DataFrame, Lazyframe resulting from the concatenation. + A new DataFrame or Lazyframe resulting from the concatenation. Raises: TypeError: The items to concatenate should either all be eager, or all lazy @@ -177,16 +179,25 @@ def concat( |z: [[null,null],["x","y"]]| └──────────────────────────┘ """ - if how not in {"horizontal", "vertical", "diagonal"}: # pragma: no cover - msg = "Only vertical, horizontal and diagonal concatenations are supported." - raise NotImplementedError(msg) + from narwhals.dataframe import LazyFrame + if not items: - msg = "No items to concatenate" + msg = "No items to concatenate." raise ValueError(msg) items = list(items) validate_laziness(items) + if how not in {"horizontal", "vertical", "diagonal"}: # pragma: no cover + msg = "Only vertical, horizontal and diagonal concatenations are supported." + raise NotImplementedError(msg) first_item = items[0] plx = first_item.__narwhals_namespace__() + if isinstance(first_item, LazyFrame) and how == "horizontal": + msg = ( + "Horizontal concatenation is not supported for LazyFrames.\n\n" + "Hint: you may want to use `join` instead." + ) + raise InvalidOperationError(msg) + return first_item._with_compliant( plx.concat([df._compliant_frame for df in items], how=how), ) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index deb33e6481..2fd1037ec6 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2061,7 +2061,7 @@ def concat( def concat( items: Iterable[LazyFrame[IntoFrameT]], *, - how: Literal["horizontal", "vertical", "diagonal"] = "vertical", + how: Literal["vertical", "diagonal"] = "vertical", ) -> LazyFrame[IntoFrameT]: ... diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index bbdd306fe3..230a25550f 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -4,19 +4,16 @@ import narwhals.stable.v1 as nw from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_concat_horizontal( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_concat_horizontal(constructor_eager: ConstructorEager) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} - df_left = nw.from_native(constructor(data)).lazy() + df_left = nw.from_native(constructor_eager(data), eager_only=True) data_right = {"c": [6, 12, -1], "d": [0, -4, 2]} - df_right = nw.from_native(constructor(data_right)).lazy() + df_right = nw.from_native(constructor_eager(data_right), eager_only=True) result = nw.concat([df_left, df_right], how="horizontal") expected = { From 5211a036b3c1bbea48264d032c43f7a48e65d58a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:05:56 +0100 Subject: [PATCH 2/8] stable api docstring --- narwhals/stable/v1/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 2fd1037ec6..5c4cc8d396 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2086,12 +2086,13 @@ def concat( - vertical: Concatenate vertically. Column names must match. - horizontal: Concatenate horizontally. If lengths don't match, then - missing rows are filled with null values. + missing rows are filled with null values. This is only supported + when all inputs are (eager) DataFrames. - diagonal: Finds a union between the column schemas and fills missing column values with null. Returns: - A new DataFrame, Lazyframe resulting from the concatenation. + A new DataFrame or Lazyframe resulting from the concatenation. Raises: TypeError: The items to concatenate should either all be eager, or all lazy From f981deef15acdb8e6e24a335fc3bbd5765642f6d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:16:12 +0100 Subject: [PATCH 3/8] coverage --- tests/frame/concat_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 230a25550f..3dbcbed350 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -3,6 +3,7 @@ import pytest import narwhals.stable.v1 as nw +from narwhals.exceptions import InvalidOperationError from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -27,6 +28,8 @@ def test_concat_horizontal(constructor_eager: ConstructorEager) -> None: with pytest.raises(ValueError, match="No items"): nw.concat([]) + with pytest.raises(InvalidOperationError): + nw.concat([df_left.lazy()], how="horizontal") def test_concat_vertical(constructor: Constructor) -> None: From d053845858d542c8e6fe42a2335c891e22c0e425 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 10 Apr 2025 09:38:48 +0100 Subject: [PATCH 4/8] skip hist_test for now --- tests/series_only/hist_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/series_only/hist_test.py b/tests/series_only/hist_test.py index f28747ccde..a72265f2bc 100644 --- a/tests/series_only/hist_test.py +++ b/tests/series_only/hist_test.py @@ -13,6 +13,10 @@ from tests.utils import ConstructorEager from tests.utils import assert_equal_data +pytest.skip( + reason="https://github.com/narwhals-dev/narwhals/issues/2348", allow_module_level=True +) + data = { "int": [0, 1, 2, 3, 4, 5, 6], } From ee52813d612b0e8d036344d5d5b5e9a42e6457c0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:29:11 +0100 Subject: [PATCH 5/8] test: try an `xfail` instead of `skip` https://github.com/narwhals-dev/narwhals/pull/2341#discussion_r2037249525 --- tests/series_only/hist_test.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/series_only/hist_test.py b/tests/series_only/hist_test.py index a72265f2bc..8d8b30adb1 100644 --- a/tests/series_only/hist_test.py +++ b/tests/series_only/hist_test.py @@ -13,10 +13,11 @@ from tests.utils import ConstructorEager from tests.utils import assert_equal_data -pytest.skip( - reason="https://github.com/narwhals-dev/narwhals/issues/2348", allow_module_level=True +xfail_hist = pytest.mark.xfail( + reason="https://github.com/narwhals-dev/narwhals/issues/2348", strict=False ) + data = { "int": [0, 1, 2, 3, 4, 5, 6], } @@ -80,6 +81,7 @@ ] +@xfail_hist @pytest.mark.parametrize("params", bins_and_expected) @pytest.mark.parametrize("include_breakpoint", [True, False]) @pytest.mark.filterwarnings( @@ -165,6 +167,7 @@ def test_hist_bin( assert_equal_data(result, expected) +@xfail_hist @pytest.mark.parametrize("params", counts_and_expected) @pytest.mark.parametrize("include_breakpoint", [True, False]) @pytest.mark.filterwarnings( @@ -236,6 +239,7 @@ def test_hist_count( ) +@xfail_hist @pytest.mark.filterwarnings( "ignore:`Series.hist` is being called from the stable API although considered an unstable feature." ) @@ -272,6 +276,7 @@ def test_hist_count_no_spread( assert_equal_data(result, expected) +@xfail_hist @pytest.mark.filterwarnings( "ignore:`Series.hist` is being called from the stable API although considered an unstable feature." ) @@ -287,6 +292,7 @@ def test_hist_bin_and_bin_count() -> None: s.hist(bins=[1, 3], bin_count=4) +@xfail_hist @pytest.mark.filterwarnings( "ignore:`Series.hist` is being called from the stable API although considered an unstable feature." ) @@ -335,6 +341,7 @@ def test_hist_small_bins( s["values"].hist(bins=[1, 3], bin_count=4) +@xfail_hist @pytest.mark.filterwarnings( "ignore:`Series.hist` is being called from the stable API although considered an unstable feature." ) @@ -369,6 +376,7 @@ def test_hist_non_monotonic(constructor_eager: ConstructorEager) -> None: st.floats(min_value=0.001, max_value=1_000, allow_nan=False), max_size=50 ), ) +@xfail_hist @pytest.mark.filterwarnings( "ignore:`Series.hist` is being called from the stable API although considered an unstable feature.", "ignore:invalid value encountered in cast:RuntimeWarning", @@ -425,6 +433,7 @@ def test_hist_bin_hypotheis( ), bin_count=st.integers(min_value=0, max_value=1_000), ) +@xfail_hist @pytest.mark.skipif( POLARS_VERSION < (1, 15), reason="hist(bin_count=...) behavior significantly changed after this version", From 13b0fb147d9dd84836551fedfabc182a377bca85 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:50:11 +0100 Subject: [PATCH 6/8] test: be more specific --- tests/frame/concat_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 3dbcbed350..1e663212d8 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import re + import pytest import narwhals.stable.v1 as nw @@ -28,7 +30,8 @@ def test_concat_horizontal(constructor_eager: ConstructorEager) -> None: with pytest.raises(ValueError, match="No items"): nw.concat([]) - with pytest.raises(InvalidOperationError): + pattern = re.compile(r"horizontal.+not supported.+lazyframe", re.IGNORECASE) + with pytest.raises(InvalidOperationError, match=pattern): nw.concat([df_left.lazy()], how="horizontal") From f1938d66a83f31a84b0a4f6068b3be03fe22f45e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:54:56 +0100 Subject: [PATCH 7/8] refactor: Reuse `is_narwhals_lazyframe` --- narwhals/functions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/narwhals/functions.py b/narwhals/functions.py index e78fca6f87..367d088b15 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -153,7 +153,7 @@ def concat(items: Iterable[FrameT], *, how: ConcatMethod = "vertical") -> FrameT |z: [[null,null],["x","y"]]| └──────────────────────────┘ """ - from narwhals.dataframe import LazyFrame + from narwhals.dependencies import is_narwhals_lazyframe if not items: msg = "No items to concatenate." @@ -164,14 +164,13 @@ def concat(items: Iterable[FrameT], *, how: ConcatMethod = "vertical") -> FrameT msg = "Only vertical, horizontal and diagonal concatenations are supported." raise NotImplementedError(msg) first_item = items[0] - plx = first_item.__narwhals_namespace__() - if isinstance(first_item, LazyFrame) and how == "horizontal": + if is_narwhals_lazyframe(first_item) and how == "horizontal": msg = ( "Horizontal concatenation is not supported for LazyFrames.\n\n" "Hint: you may want to use `join` instead." ) raise InvalidOperationError(msg) - + plx = first_item.__narwhals_namespace__() return first_item._with_compliant( plx.concat([df._compliant_frame for df in items], how=how), ) From 0dd9cc23d7c5bea3bdcfcc215ad8e958f9e2b191 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:56:13 +0100 Subject: [PATCH 8/8] docs: big F --- narwhals/functions.py | 2 +- narwhals/stable/v1/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/functions.py b/narwhals/functions.py index 367d088b15..a9b01a2e02 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -86,7 +86,7 @@ def concat(items: Iterable[FrameT], *, how: ConcatMethod = "vertical") -> FrameT values with null. Returns: - A new DataFrame or Lazyframe resulting from the concatenation. + A new DataFrame or LazyFrame resulting from the concatenation. Raises: TypeError: The items to concatenate should either all be eager, or all lazy diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 184de55297..92245b2671 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2075,7 +2075,7 @@ def concat(items: Iterable[FrameT], *, how: ConcatMethod = "vertical") -> FrameT values with null. Returns: - A new DataFrame or Lazyframe resulting from the concatenation. + A new DataFrame or LazyFrame resulting from the concatenation. Raises: TypeError: The items to concatenate should either all be eager, or all lazy