narwhals-dev
diff --git a/‎.github/workflows/downstream_tests.yml‎
Lines changed: 21 additions & 0 deletions b/‎.github/workflows/downstream_tests.yml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎.github/workflows/extremes.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/extremes.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/pytest.yml‎
Lines changed: 9 additions & 4 deletions b/‎.github/workflows/pytest.yml‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎docs/api-reference/sql.md‎
Lines changed: 15 additions & 0 deletions b/‎docs/api-reference/sql.md‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎docs/concepts/order_dependence.md‎
Lines changed: 12 additions & 0 deletions b/‎docs/concepts/order_dependence.md‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/generating_sql.md‎
Lines changed: 28 additions & 26 deletions b/‎docs/generating_sql.md‎
Lines changed: 28 additions & 26 deletions
diff --git a/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions b/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎narwhals/_arrow/group_by.py‎
Lines changed: 39 additions & 37 deletions b/‎narwhals/_arrow/group_by.py‎
Lines changed: 39 additions & 37 deletions
@@ -35,6 +35,8 @@ jobs:
       - name: install-altair-dev
         run: |
           cd altair
+          # Temporary pin until it's addressed upstream.
+          uv pip install "pandas<3" --system
           uv pip install -e ".[dev, all]" --system
       - name: install-narwhals-dev
         run: |
@@ -86,6 +88,10 @@ jobs:
         run: |
           cd marimo
           . .venv/bin/activate
+          # Temporary pin until it's addressed upstream.
+          uv pip install "pandas<3"
+          # Temporary pin to get CI green
+          uv pip install "sqlglot<28.7.0"
           uv pip install -e ".[dev]"
           which python
       - name: install-narwhals-dev
@@ -237,6 +243,8 @@ jobs:
         run: |
           cd tea-tasting
           uv sync --group test
+          # Temporary pin to get CI green
+          uv pip install "sqlglot<28.7.0"
       - name: install-narwhals-dev
         run: |
           cd tea-tasting
@@ -282,6 +290,8 @@ jobs:
       - name: install-tubular-dev
         run: |
           cd tubular
+          # Temporary pin until it's addressed upstream.
+          uv pip install "pandas<3" --system
           uv pip install -e ".[dev]" --system
       - name: install-narwhals-dev
         run: |
@@ -367,6 +377,8 @@ jobs:
       - name: install-deps
         run: |
           cd hierarchicalforecast
+          # Temporary pin until it's addressed upstream.
+          uv pip install "pandas<3" --system
           uv pip install . --group dev --group polars --system
       - name: install-narwhals-dev
         run: |
@@ -409,6 +421,8 @@ jobs:
       - name: install-formulaic-dev
         run: |
           cd formulaic
+          # Temporary pin until it's addressed upstream.
+          hatch run uv pip install "pandas<3"
           hatch run uv pip install -e ".[arrow,calculus]"
       - name: install-narwhals-dev
         run: |
@@ -452,6 +466,8 @@ jobs:
           uv venv -p ${{ matrix.python-version }}
           . .venv/bin/activate
           uv pip install . --group dev
+          # Temporary pin to get CI green
+          uv pip install "sqlglot<28.7.0"
           uv pip install pytest pytest-cov pytest-snapshot pandas polars "ibis-framework[duckdb,mysql,postgres,sqlite]>=9.5.0" chatlas shiny
       - name: install-narwhals-dev
         run: |
@@ -499,6 +515,9 @@ jobs:
         run: |
           cd validoopsie
           uv sync --dev --upgrade
+          # Temporary pin until it's addressed upstream.
+          uv remove pandas --group dev
+          uv add "pandas<3"
       - name: install-narwhals-dev
         run: |
           cd validoopsie
@@ -545,6 +564,8 @@ jobs:
       - name: install-deps
         run: |
           cd darts
+          # Temporary pin until it's addressed upstream.
+          uv pip install "pandas<3"  --system
           uv pip install \
           -r requirements/core.txt \
           -r requirements/dev.txt \
 
@@ -169,7 +169,7 @@ jobs:
       - name: Assert nightlies dependencies
         run: |
           DEPS=$(uv pip freeze)
-          echo "$DEPS" | grep -E 'pandas.*(dev|rc)'
+          echo "$DEPS" | grep -E 'pandas.*(dev|rc|\+)'
           echo "$DEPS" | grep 'pyarrow.*dev'
           echo "$DEPS" | grep 'numpy.*dev'
           echo "$DEPS" | grep 'dask.*@'
 
@@ -53,7 +53,9 @@ jobs:
           cache-dependency-glob: "pyproject.toml"
       - name: install-reqs
         # we are not testing pyspark, modin, or dask on Windows here because nobody got time for that
-        run: uv pip install -e ".[ibis]" --group core-tests --group extra --system
+        # TODO(FBruzzesi): Temporarily pin sqlglot to <28.6.0 to avoid breaking changes in SQLFrame
+        # See https://github.com/eakmanrq/sqlframe/issues/577
+        run: uv pip install -e ".[ibis]" --group core-tests --group extra "sqlglot<28.6.0" --system
       - name: install-test-plugin
         run: uv pip install -e test-plugin/. --system
       - name: show-deps
@@ -85,7 +87,9 @@ jobs:
           cache-suffix: pytest-full-coverage-${{ matrix.python-version }}
           cache-dependency-glob: "pyproject.toml"
       - name: install-reqs
-        run: uv pip install -e ".[dask, modin, ibis]" --group core-tests --group extra --system
+        # TODO(FBruzzesi): Temporarily pin sqlglot to <28.6.0 to avoid breaking changes in SQLFrame
+        # See https://github.com/eakmanrq/sqlframe/issues/577
+        run: uv pip install -e ".[dask, modin, ibis]" --group core-tests --group extra "sqlglot<28.6.0" --system
       - name: install-test-plugin
         run: uv pip install -e test-plugin/. --system
       - name: show-deps
@@ -153,8 +157,9 @@ jobs:
           cache-suffix: python-314-${{ matrix.python-version }}
           cache-dependency-glob: "pyproject.toml"
       - name: install-reqs
-        # Use `--pre` as duckdb stable not compatible with 3.14
-        run: uv pip install -e . --group tests --pre pandas polars pyarrow duckdb sqlframe  --system
+        # TODO(FBruzzesi): Temporarily pin sqlglot to <28.6.0 to avoid breaking changes in SQLFrame
+        # See https://github.com/eakmanrq/sqlframe/issues/577
+        run: uv pip install -e . --group tests pandas polars pyarrow duckdb sqlframe "sqlglot<28.6.0" --system
       - name: show-deps
         run: uv pip freeze
       - name: Run pytest
 
@@ -0,0 +1,15 @@
+# `narwhals.sql`
+
+::: narwhals.sql
+    handler: python
+    options:
+      members:
+        - table
+
+::: narwhals.sql.SQLTable
+    handler: python
+    options:
+      members:
+        - to_sql
+      show_source: false
+      show_bases: false
@@ -17,6 +17,7 @@ such as:
 - `cum_sum`, `cum_min`, ...
 - `rolling_sum`, `rolling_min`, ...
 - `is_first_distinct`, `is_last_distinct`.
+- `first`, `last`.
 
 When row-order is defined, as is the case for `DataFrame`, these operations pose
 no issue.
@@ -50,3 +51,14 @@ When writing an order-dependent function, if you want it to be executable by `La
 (and not just `DataFrame`), make sure that all order-dependent expressions are followed
 by `over` with `order_by` specified. If you forget to, don't worry, Narwhals will
 give you a loud and clear error message.
+
+## Aggregations
+
+To make `nw.col('a').first()` valid in the lazy case, you have the choice between writing:
+
+- `nw.col('a').first().over(order_by='i')`.
+- `nw.col('a').first(order_by='i')`.
+
+The first option produces a new column of the same length as the original dataframe,
+whereas the second produces a scalar. If you're using `first` in a group-by context, where
+you're required to provide aggregations, then we recommend using the second option.
@@ -5,54 +5,62 @@ For example, what's the SQL equivalent to:
 
 ```python exec="1" source="above" session="generating-sql"
 import narwhals as nw
-from narwhals.typing import IntoFrameT
+from narwhals.typing import FrameT
 
 
-def avg_monthly_price(df_native: IntoFrameT) -> IntoFrameT:
+def avg_monthly_price(df: FrameT) -> FrameT:
     return (
-        nw.from_native(df_native)
-        .group_by(nw.col("date").dt.truncate("1mo"))
+        df.group_by(nw.col("date").dt.truncate("1mo"))
         .agg(nw.col("price").mean())
         .sort("date")
-        .to_native()
     )
 ```
 
 ?
 
-There are several ways to find out.
+Narwhals provides you with a `narwhals.sql` module to do just that!
 
-## Via DuckDB
+!!! info
+    `narwhals.sql` currently requires DuckDB to be installed.
+
+## `narwhals.sql`
 
 You can generate SQL (in DuckDB's dialect) directly with `narwhals.sql`.
 
 ```python exec="1" source="above" session="generating-sql" result="sql"
-import duckdb
+import narwhals as nw
+from narwhals.sql import table
 
-conn = duckdb.connect()
-conn.sql("""CREATE TABLE prices (date DATE, price DOUBLE);""")
+prices = table("prices", {"date": nw.Date, "price": nw.Float64})
 
-df = nw.from_native(conn.table("prices"))
-print(avg_monthly_price(df).sql_query())
+result = (
+    prices.group_by(nw.col("date").dt.truncate("1mo"))
+    .agg(nw.col("price").mean())
+    .sort("date")
+)
+print(result.to_sql())
 ```
 
-To make it look a bit prettier, or to then transpile it to other SQL dialects, we can pass it to [SQLGlot](https://github.com/tobymao/sqlglot):
+To make it look a bit prettier, you can pass `pretty=True`, but
+note that this currently requires [sqlparse](https://github.com/andialbrecht/sqlparse) to be installed.
 
 ```python exec="1" source="above" session="generating-sql" result="sql"
-import sqlglot
-
-print(sqlglot.transpile(avg_monthly_price(df).sql_query(), pretty=True)[0])
+print(result.to_sql(pretty=True))
 ```
 
+Note that the generated SQL follows DuckDB's dialect. To translate it to other dialects,
+you may want to look into [sqlglot](https://github.com/tobymao/sqlglot), or use one of the
+solutions below (which also use sqlglot).
+
 ## Via Ibis
 
-We can also use Ibis to generate SQL:
+You can also use Ibis or SQLFrame to generate SQL:
 
 ```python exec="1" source="above" session="generating-sql" result="sql"
 import ibis
 
-t = ibis.table({"date": "date", "price": "double"}, name="prices")
-print(ibis.to_sql(avg_monthly_price(t)))
+df = nw.from_native(ibis.table({"date": "date", "price": "double"}, name="prices"))
+print(ibis.to_sql(avg_monthly_price(df).to_native()))
 ```
 
 ## Via SQLFrame
@@ -66,11 +74,5 @@ session = StandaloneSession.builder.getOrCreate()
 session.catalog.add_table("prices", column_mapping={"date": "date", "price": "float"})
 df = nw.from_native(session.read.table("prices"))
 
-print(avg_monthly_price(df).sql(dialect="duckdb"))
-```
-
-Or, to print the SQL code in a different dialect (say, databricks):
-
-```python exec="1" source="above" session="generating-sql" result="sql"
-print(avg_monthly_price(df).sql(dialect="databricks"))
+print(avg_monthly_price(df).to_native().sql(dialect="duckdb"))
 ```
@@ -70,6 +70,7 @@ nav:
     - api-reference/dtypes.md
     - api-reference/exceptions.md
     - api-reference/selectors.md
+    - api-reference/sql.md
     - api-reference/testing.md
     - api-reference/typing.md
     - api-reference/utils.md
 
@@ -6,7 +6,11 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 
-from narwhals._arrow.utils import cast_to_comparable_string_types, extract_py_scalar
+from narwhals._arrow.utils import (
+    BACKEND_VERSION,
+    cast_to_comparable_string_types,
+    extract_py_scalar,
+)
 from narwhals._compliant import EagerGroupBy
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._utils import generate_temporary_column_name, requires
@@ -71,12 +75,11 @@ def __init__(
         self._df = df
         frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
         self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame
-        self._grouped = pa.TableGroupBy(self.compliant.native, self._keys)
         self._drop_null_keys = drop_null_keys
 
     def _configure_agg(
-        self, grouped: pa.TableGroupBy, expr: ArrowExpr, /
-    ) -> tuple[pa.TableGroupBy, Aggregation, AggregateOptions | None]:
+        self, expr: ArrowExpr, /
+    ) -> tuple[Aggregation, AggregateOptions | None]:
         option: AggregateOptions | None = None
         function_name = self._leaf_name(expr)
         kwargs = self._kwargs(expr)
@@ -91,50 +94,49 @@ def _configure_agg(
             option = pc.ScalarAggregateOptions(min_count=0)
         elif function_name in self._OPTION_ORDERED:
             ignore_nulls = kwargs.get("ignore_nulls", False)
-            grouped, option = self._ordered_agg(
-                grouped, function_name, ignore_nulls=ignore_nulls
-            )
-        return grouped, self._remap_expr_name(function_name), option
-
-    def _ordered_agg(
-        self,
-        grouped: pa.TableGroupBy,
-        name: NarwhalsAggregation,
-        /,
-        *,
-        ignore_nulls: bool,
-    ) -> tuple[pa.TableGroupBy, AggregateOptions]:
-        """The default behavior of `pyarrow` raises when `first` or `last` are used.
-
-        You'd see an error like:
+            option = pc.ScalarAggregateOptions(skip_nulls=ignore_nulls)
+        return self._remap_expr_name(function_name), option
 
-            ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported
-
-        We need to **disable** multi-threading to use them, but the ability to do so
-        wasn't possible before `14.0.0` ([pyarrow-36709])
-
-        [pyarrow-36709]: https://github.com/apache/arrow/issues/36709
-        """
-        backend_version = self.compliant._backend_version
-        if backend_version >= (14, 0) and grouped._use_threads:
-            native = self.compliant.native
-            grouped = pa.TableGroupBy(native, grouped.keys, use_threads=False)
-        elif backend_version < (14, 0):  # pragma: no cover
+    def _configure_grouped(self, *exprs: ArrowExpr) -> pa.TableGroupBy:
+        order_by = ()
+        use_threads = True
+        for expr in exprs:
+            md = next(expr._metadata.op_nodes_reversed())
+            if md.name not in self._OPTION_ORDERED:
+                continue
+            # [pyarrow-36709]: https://github.com/apache/arrow/issues/36709
+            use_threads = False
+            if _current_order_by := md.kwargs.get("order_by", ()):
+                if order_by and _current_order_by != order_by:
+                    msg = f"Only one `order_by` can be specified in `group_by`. Found both {order_by} and {_current_order_by}."
+                    raise NotImplementedError(msg)
+                order_by = _current_order_by
+        if not use_threads and BACKEND_VERSION < (14,):  # pragma: no cover
             msg = (
-                f"Using `{name}()` in a `group_by().agg(...)` context is only available in 'pyarrow>=14.0.0', "
-                f"found version {requires._unparse_version(backend_version)!r}.\n\n"
+                f"Using `first/last` in a `group_by().agg(...)` context is only available in 'pyarrow>=14.0.0', "
+                f"found version {requires._unparse_version(BACKEND_VERSION)!r}.\n\n"
                 f"See https://github.com/apache/arrow/issues/36709"
             )
             raise NotImplementedError(msg)
-        return grouped, pc.ScalarAggregateOptions(skip_nulls=ignore_nulls)
+        if order_by:
+            return pa.TableGroupBy(
+                self.compliant.sort(*order_by, descending=False, nulls_last=False).native,
+                self._keys,
+                use_threads=use_threads,
+            )
+        if not use_threads:
+            return pa.TableGroupBy(self.compliant.native, self._keys, use_threads=False)
+        # TODO(unassigned): combine with `return` above once PyArrow 15 is the minimum.
+        return pa.TableGroupBy(self.compliant.native, self._keys)
 
     def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
         self._ensure_all_simple(exprs)
+        grouped = self._configure_grouped(*exprs)
+
         aggs: list[tuple[str, Aggregation, AggregateOptions | None]] = []
         expected_pyarrow_column_names: list[str] = self._keys.copy()
         new_column_names: list[str] = self._keys.copy()
         exclude = (*self._keys, *self._output_key_names)
-        grouped = self._grouped
 
         for expr in exprs:
             output_names, aliases = evaluate_output_names_and_aliases(
@@ -153,7 +155,7 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
                 aggs.append((self._keys[0], "count", pc.CountOptions(mode="all")))
                 continue
 
-            grouped, function_name, option = self._configure_agg(grouped, expr)
+            function_name, option = self._configure_agg(expr)
             new_column_names.extend(aliases)
             expected_pyarrow_column_names.extend(
                 [f"{output_name}_{function_name}" for output_name in output_names]