Changes from 32 commits

Commits (104 total)
b4141af
feat: Make expressions printable, rewrite internals
MarcoGorelli Sep 25, 2025
0cf73ca
coverage
MarcoGorelli Sep 25, 2025
52f978e
typing
MarcoGorelli Sep 25, 2025
f6ce196
coverage
MarcoGorelli Sep 25, 2025
6169b23
typing again
MarcoGorelli Sep 25, 2025
c31c5e9
revert accidental change
MarcoGorelli Sep 25, 2025
ed29d1c
skip old polars
MarcoGorelli Sep 25, 2025
f90c13b
old vs
MarcoGorelli Sep 26, 2025
2943784
fix dataframe to numpy
MarcoGorelli Sep 26, 2025
dea9e3e
document ExprNode
MarcoGorelli Sep 26, 2025
4048ae6
safer `col`, fix typing
MarcoGorelli Sep 26, 2025
906f7fb
:art:
MarcoGorelli Sep 26, 2025
a457bf0
exclude too
MarcoGorelli Sep 26, 2025
f29d8ad
typing
MarcoGorelli Sep 26, 2025
11890a9
mypy
MarcoGorelli Sep 26, 2025
07ed5ee
remove unnecessary check
MarcoGorelli Sep 26, 2025
06eafaa
wait how tf doe thi work
MarcoGorelli Sep 26, 2025
53c048f
grossly simplify broadcast
MarcoGorelli Sep 26, 2025
1bc6c95
simplify
MarcoGorelli Sep 26, 2025
5784048
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Sep 27, 2025
c2209d2
cov
MarcoGorelli Sep 27, 2025
6810813
post merge fixup
MarcoGorelli Sep 27, 2025
48a9dfd
even simpler!
MarcoGorelli Sep 27, 2025
5ba10ed
assign variable
MarcoGorelli Sep 27, 2025
8cde5d2
replace/replace_all typing
MarcoGorelli Sep 27, 2025
19a5c99
yay remove type ignore
MarcoGorelli Sep 27, 2025
b758710
wooah we can support per-group broadcasting
MarcoGorelli Sep 27, 2025
64ccba4
test repr
MarcoGorelli Sep 27, 2025
474a2df
dask cmon man
MarcoGorelli Sep 27, 2025
12a6637
minor things
MarcoGorelli Sep 27, 2025
c602133
minor things
MarcoGorelli Sep 27, 2025
0996ffe
simplify
MarcoGorelli Sep 27, 2025
4dd0e8f
Merge branch 'main' into nodes-rewrite
MarcoGorelli Sep 28, 2025
0b6d2e5
post merge fixup
MarcoGorelli Sep 28, 2025
dfbdeee
reduce selectors diff
MarcoGorelli Sep 30, 2025
1ebf11e
document ExprNodes
MarcoGorelli Sep 30, 2025
ca6ddb9
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Sep 30, 2025
bd1ccea
dask fix
MarcoGorelli Sep 30, 2025
18137e1
simplify
MarcoGorelli Sep 30, 2025
255e828
wip simpler
MarcoGorelli Sep 30, 2025
ae48754
simpler typing
MarcoGorelli Sep 30, 2025
6c054cc
who even needs `is_compliant_expr` anymore?
MarcoGorelli Sep 30, 2025
7d37394
dask fixup
MarcoGorelli Sep 30, 2025
1771c8e
polars compat
MarcoGorelli Sep 30, 2025
8815b19
split clip into clip_lower,clip_upper,clip
MarcoGorelli Sep 30, 2025
3ed89a9
complete the split
MarcoGorelli Sep 30, 2025
a7a3bb2
:art:
MarcoGorelli Sep 30, 2025
e982fd0
typing
MarcoGorelli Sep 30, 2025
90a3302
skip old dask for fill_null
MarcoGorelli Sep 30, 2025
e8fa7f1
coverage, simplify
MarcoGorelli Sep 30, 2025
7a23876
old dask
MarcoGorelli Sep 30, 2025
b37dc66
it gets simpler
MarcoGorelli Sep 30, 2025
c2089de
remove even more chaff
MarcoGorelli Sep 30, 2025
ed80088
remove several Any
MarcoGorelli Sep 30, 2025
74985f4
typing
MarcoGorelli Sep 30, 2025
5aa3d71
expressifiable_args -> kwargs
MarcoGorelli Oct 1, 2025
3c075e6
more precise typing, remove another `Any`
MarcoGorelli Oct 1, 2025
f022d1c
more typing fixes
MarcoGorelli Oct 1, 2025
9bcb61d
more typing fixes
MarcoGorelli Oct 1, 2025
9203e48
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 6, 2025
32346ab
groupby fix
MarcoGorelli Oct 6, 2025
51d258e
first last
MarcoGorelli Oct 6, 2025
f6ab8d6
fix is_native, with_row_index
MarcoGorelli Oct 6, 2025
068e75e
typing, docs (thanks Francesco!)
MarcoGorelli Oct 6, 2025
db0303a
more slow testv
MarcoGorelli Oct 6, 2025
d47e522
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 9, 2025
d0b3acd
remove unnecessary extract_native
MarcoGorelli Oct 9, 2025
a50395a
typing
MarcoGorelli Oct 9, 2025
9a356c7
revert `to_frame()/select/get_column` change (can do it later)
MarcoGorelli Oct 10, 2025
6e9eab4
__call__ -> _to_compliant_expr
MarcoGorelli Oct 10, 2025
6d71c41
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 10, 2025
a8bfefe
skip as appropriate
MarcoGorelli Oct 10, 2025
8ee0100
coverage, docs
MarcoGorelli Oct 10, 2025
52d58c8
one more
MarcoGorelli Oct 10, 2025
e987618
update docs, use `cls`, coverage, take over_node_order_by and partiti…
MarcoGorelli Oct 11, 2025
edc117d
WIP: refactor ExprMetadata
FBruzzesi Oct 11, 2025
639b996
kind = node.kind in from_node and with_node
FBruzzesi Oct 11, 2025
680926f
Merge branch 'nodes-rewrite' into experimental/linked-list
FBruzzesi Oct 11, 2025
52dd7f6
no cover `iter_nodes`
FBruzzesi Oct 11, 2025
e8ab3f6
rm iter_nodes method
FBruzzesi Oct 11, 2025
322354b
Merge pull request #7 from narwhals-dev/experimental/linked-list
MarcoGorelli Oct 11, 2025
baa4702
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 11, 2025
3ff84ac
pass `prev` to `combine_metadata`
MarcoGorelli Oct 11, 2025
7443955
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 12, 2025
b0a78e3
fixup
MarcoGorelli Oct 12, 2025
d9a30b9
Merge branch 'main' into nodes-rewrite
MarcoGorelli Oct 15, 2025
a353940
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 16, 2025
1962d53
ceil, floor
MarcoGorelli Oct 16, 2025
eb1f74e
split out `_with_node` into `_with_over_node` and `_append_node`
MarcoGorelli Oct 16, 2025
fd87898
simplify ExprMetadata.from_node
MarcoGorelli Oct 16, 2025
0840868
simplify further
MarcoGorelli Oct 16, 2025
a2dbd2e
clearer names
MarcoGorelli Oct 16, 2025
95c9a66
Merge branch 'nodes-rewrite' of github.com:MarcoGorelli/narwhals into…
MarcoGorelli Oct 16, 2025
c9c46c0
dask fixup
MarcoGorelli Oct 16, 2025
33e3078
typing
MarcoGorelli Oct 16, 2025
90468de
raise developer-facing assertionerror in _metadata
MarcoGorelli Oct 16, 2025
a657cbb
cvg
MarcoGorelli Oct 16, 2025
0592804
correctly respect arguments metadata in `with_filtration`, add test
MarcoGorelli Oct 17, 2025
5d150f3
mark `filter` not implemented for dask
MarcoGorelli Oct 17, 2025
739e0d5
fixup
MarcoGorelli Oct 17, 2025
34229c1
keep `filter` in `CompliantSeries` for now
MarcoGorelli Oct 17, 2025
f52335c
Merge branch 'main' into nodes-rewrite
MarcoGorelli Oct 19, 2025
b582e92
Merge remote-tracking branch 'upstream/main' into nodes-rewrite
MarcoGorelli Oct 21, 2025
081e8c7
Merge branch 'nodes-rewrite' of github.com:MarcoGorelli/narwhals into…
MarcoGorelli Oct 21, 2025
111 changes: 85 additions & 26 deletions docs/how_it_works.md
@@ -76,8 +76,9 @@ pn = PandasLikeNamespace(
implementation=Implementation.PANDAS,
version=Version.MAIN,
)
print(nw.col("a")._to_compliant_expr(pn))
print(nw.col("a")(pn))
```

The result from the last line above is the same as we'd get from `pn.col('a')`, and it's
a `narwhals._pandas_like.expr.PandasLikeExpr` object, which we'll call `PandasLikeExpr` for
short.
@@ -177,7 +178,7 @@ The way you access the Narwhals-compliant wrapper depends on the object:

- `narwhals.DataFrame` and `narwhals.LazyFrame`: use the `._compliant_frame` attribute.
- `narwhals.Series`: use the `._compliant_series` attribute.
- `narwhals.Expr`: call the `._to_compliant_expr` method, and pass to it the Narwhals-compliant namespace associated with
- `narwhals.Expr`: call the `.__call__` method, and pass to it the Narwhals-compliant namespace associated with
the given backend.

🛑 BUT WAIT! What's a Narwhals-compliant namespace?
@@ -212,9 +213,10 @@ pn = PandasLikeNamespace(
implementation=Implementation.PANDAS,
version=Version.MAIN,
)
expr = (nw.col("a") + 1)._to_compliant_expr(pn)
expr = (nw.col("a") + 1)(pn)
print(expr)
```

If we then extract a Narwhals-compliant dataframe from `df` by
calling `._compliant_frame`, we get a `PandasLikeDataFrame` - and that's an object which we can pass `expr` to!

@@ -228,6 +230,7 @@ We can then view the underlying pandas Dataframe which was produced by calling `
```python exec="1" result="python" session="pandas_api_mapping" source="above"
print(result._native_frame)
```

which is the same as we'd have obtained by just using the Narwhals API directly:

```python exec="1" result="python" session="pandas_api_mapping" source="above"
@@ -238,49 +241,42 @@ print(nw.to_native(df.select(nw.col("a") + 1)))

Group-by is probably one of Polars' most significant innovations (on the syntax side) with respect
to pandas. We can write something like

```python
df: pl.DataFrame
df.group_by("a").agg((pl.col("c") > pl.col("b").mean()).max())
```

To do this in pandas, we need to either use `GroupBy.apply` (sloooow), or do some crazy manual
optimisations to get it to work.

In Narwhals, here's what we do:

- if somebody uses a simple group-by aggregation (e.g. `df.group_by('a').agg(nw.col('b').mean())`),
then on the pandas side we translate it to
```python
df: pd.DataFrame
df.groupby("a").agg({"b": ["mean"]})
```

```python
df: pd.DataFrame
df.groupby("a").agg({"b": ["mean"]})
```

- if somebody passes a complex group-by aggregation, then we use `apply` and raise a `UserWarning`, warning
users of the performance penalty and advising them to refactor their code so that the aggregation they perform
ends up being a simple one.

In order to tell whether an aggregation is simple, Narwhals uses the private `_depth` attribute of `PandasLikeExpr`:

```python exec="1" result="python" session="pandas_impl" source="above"
print(pn.col("a").mean())
print((pn.col("a") + 1).mean())
```

For simple aggregations, Narwhals can just look at `_depth` and `function_name` and figure out
which (efficient) elementary operation this corresponds to in pandas.
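
For a concrete sense of what this lowering looks like, here's a minimal sketch (made-up data; the pandas call matches only up to column-naming details):

```python
import narwhals as nw
import pandas as pd

pd_df = pd.DataFrame({"a": [1, 1, 2], "b": [4.0, 5.0, 6.0]})

# A "simple" aggregation: one elementary operation per column.
nw_result = (
    nw.from_native(pd_df)
    .group_by("a")
    .agg(nw.col("b").mean())
    .sort("a")
    .to_native()
)

# The efficient pandas call it corresponds to.
pd_result = pd_df.groupby("a", as_index=False).agg({"b": "mean"})

print(nw_result)
print(pd_result)
```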

## Expression Metadata

Let's try printing out a few expressions to the console to see what they show us:
Let's try printing out some compliant expressions' metadata to see what it shows us:

```python exec="1" result="python" session="metadata" source="above"
```python exec="1" result="python" session="pandas_impl" source="above"
import narwhals as nw

print(nw.col("a"))
print(nw.col("a").mean())
print(nw.col("a").mean().over("b"))
print(nw.col("a")(pn)._metadata)
print(nw.col("a").mean()(pn)._metadata)
print(nw.col("a").mean().over("b")(pn)._metadata)
```

Note how they tell us something about their metadata. This section is all about
making sense of what that all means, what the rules are, and what it enables.
This section is all about making sense of what that all means, what the rules are, and what it enables.

Here's a brief description of each piece of metadata:

@@ -293,8 +289,6 @@ Here's a brief description of each piece of metadata:
- `ExpansionKind.MULTI_UNNAMED`: Produces multiple outputs whose names depend
on the input dataframe. For example, `nw.nth(0, 1)` or `nw.selectors.numeric()`.

- `last_node`: Kind of the last operation in the expression. See
`narwhals._expression_parsing.ExprKind` for the various options.
- `has_windows`: Whether the expression already contains an `over(...)` statement.
- `n_orderable_ops`: How many order-dependent operations the expression contains.

@@ -311,6 +305,7 @@ Here's a brief description of each piece of metadata:
- `is_scalar_like`: Whether the output of the expression is always length-1.
- `is_literal`: Whether the expression doesn't depend on any column but instead
only on literal values, like `nw.lit(1)`.
- `nodes`: List of operations which this expression applies when evaluated.

#### Chaining

@@ -377,3 +372,67 @@ Narwhals triggers a broadcast in these situations:

Each backend is then responsible for doing its own broadcasting, as defined in each
`CompliantExpr.broadcast` method.
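
As a rough illustration of when a broadcast kicks in, here's a minimal sketch using the public API (made-up data): the scalar-like `nw.col("a").mean()` sits next to a full-length column, so it gets broadcast to the frame's length.

```python
import narwhals as nw
import pandas as pd

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}))

# `nw.col("a").mean()` is scalar-like (length 1). Because it appears alongside
# the full-length column "a", Narwhals broadcasts it to the frame's length
# before handing it to the backend.
print(df.with_columns(a_mean=nw.col("a").mean()).to_native())
```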

### Elementwise push-down

SQL is picky about `over` operations. For example:

- `sum(a) over (partition by b)` is valid.
- `sum(abs(a)) over (partition by b)` is valid.
- `abs(sum(a)) over (partition by b)` is not valid.

In Polars, however, all three of the following are valid:

- `pl.col('a').sum().over('b')`
- `pl.col('a').abs().sum().over('b')`
- `pl.col('a').sum().abs().over('b')`

How can we retain Polars' level of flexibility when translating to SQL engines?

The answer is: by rewriting expressions. Specifically, we push down `over` nodes past elementwise ones.
To see this, let's try printing the Narwhals equivalent of the last expression above (the one that SQL rejects):

```python exec="1" result="python" session="pushdown" source="above"
import narwhals as nw

print(nw.col("a").sum().abs().over("b"))
```

Note how Narwhals automatically inserted the `over` operation _before_ the `abs` one. In other words, instead
of doing

- `sum` -> `abs` -> `over`

it did

- `sum` -> `over` -> `abs`

thus allowing the expression to be valid for SQL engines!

This is what we refer to as "pushing down `over` nodes". The idea is:

- Elementwise operations operate row-by-row and don't depend on the rows around them.
- An `over` node partitions or orders a computation.
- Therefore, an elementwise operation followed by an `over` operation is the same
as doing the `over` operation followed by that same elementwise operation!
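
A quick way to convince yourself of the equivalence is to evaluate both orderings directly in Polars (a minimal sketch with made-up data):

```python
import polars as pl

df = pl.DataFrame({"a": [1, -2, 3, -4], "b": ["x", "x", "y", "y"]})

# `abs` is elementwise, so applying it before or after the window
# operation gives the same column.
print(
    df.with_columns(
        original=pl.col("a").sum().abs().over("b"),
        pushed_down=pl.col("a").sum().over("b").abs(),
    )
)
```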

Note that the pushdown also applies to any arguments to the elementwise operation.
For example, if we have

```python
(nw.col("a").sum() + nw.col("b").sum()).over("c")
```

then `+` is an elementwise operation and so can be swapped with `over`. We just need
to take care to apply the `over` operation to all the arguments of `+`, so that we
end up with

```python
nw.col("a").sum().over("c") + nw.col("b").sum().over("c")
```

In general, query optimisation is out-of-scope for Narwhals. We consider this
expression rewrite acceptable because:

- It's simple.
- It allows us to evaluate operations which otherwise wouldn't be allowed for certain backends.
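
To make the rewrite itself more tangible, here's a toy sketch of the pushdown over a simplified list-of-nodes representation. The `Node` class and `push_down_over` function are hypothetical and purely illustrative; they are not Narwhals' actual internals.

```python
from dataclasses import dataclass


@dataclass
class Node:
    name: str
    is_elementwise: bool


def push_down_over(nodes: list[Node], over: Node) -> list[Node]:
    # Toy illustration only: walk back over any trailing elementwise nodes,
    # then insert `over` just before them, so that
    # `sum -> abs` + `over` becomes `sum -> over -> abs`.
    split = len(nodes)
    while split > 0 and nodes[split - 1].is_elementwise:
        split -= 1
    return [*nodes[:split], over, *nodes[split:]]


nodes = [Node("col('a')", False), Node("sum", False), Node("abs", True)]
rewritten = push_down_over(nodes, Node("over('b')", False))
print([node.name for node in rewritten])
# ["col('a')", 'sum', "over('b')", 'abs']
```
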
7 changes: 2 additions & 5 deletions narwhals/_arrow/dataframe.py
@@ -10,7 +10,6 @@
from narwhals._arrow.series import ArrowSeries
from narwhals._arrow.utils import native_to_narwhals_dtype
from narwhals._compliant import EagerDataFrame
from narwhals._expression_parsing import ExprKind
from narwhals._utils import (
Implementation,
Version,
@@ -387,12 +386,10 @@ def join(
)

return self._with_native(
self.with_columns(
plx.lit(0, None).alias(key_token).broadcast(ExprKind.LITERAL)
)
self.with_columns(plx.lit(0, None).alias(key_token).broadcast())
.native.join(
other.with_columns(
plx.lit(0, None).alias(key_token).broadcast(ExprKind.LITERAL)
plx.lit(0, None).alias(key_token).broadcast()
).native,
keys=key_token,
right_keys=key_token,
20 changes: 2 additions & 18 deletions narwhals/_arrow/expr.py
@@ -20,8 +20,7 @@

from narwhals._arrow.dataframe import ArrowDataFrame
from narwhals._arrow.namespace import ArrowNamespace
from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries, ScalarKwargs
from narwhals._expression_parsing import ExprMetadata
from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries
from narwhals._utils import Version, _LimitedContext


@@ -32,23 +31,15 @@ def __init__(
self,
call: EvalSeries[ArrowDataFrame, ArrowSeries],
*,
depth: int,
function_name: str,
evaluate_output_names: EvalNames[ArrowDataFrame],
alias_output_names: AliasNames | None,
version: Version,
scalar_kwargs: ScalarKwargs | None = None,
implementation: Implementation | None = None,
implementation: Implementation = Implementation.PYARROW,
) -> None:
self._call = call
self._depth = depth
self._function_name = function_name
self._depth = depth
self._evaluate_output_names = evaluate_output_names
self._alias_output_names = alias_output_names
self._version = version
self._scalar_kwargs = scalar_kwargs or {}
self._metadata: ExprMetadata | None = None

@classmethod
def from_column_names(
@@ -57,7 +48,6 @@ def from_column_names(
/,
*,
context: _LimitedContext,
function_name: str = "",
) -> Self:
def func(df: ArrowDataFrame) -> list[ArrowSeries]:
try:
@@ -74,8 +64,6 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:

return cls(
func,
depth=0,
function_name=function_name,
evaluate_output_names=evaluate_column_names,
alias_output_names=None,
version=context._version,
@@ -93,8 +81,6 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:

return cls(
func,
depth=0,
function_name="nth",
evaluate_output_names=cls._eval_names_indices(column_indices),
alias_output_names=None,
version=context._version,
@@ -160,8 +146,6 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]:

return self.__class__(
func,
depth=self._depth + 1,
function_name=self._function_name + "->over",
evaluate_output_names=self._evaluate_output_names,
alias_output_names=self._alias_output_names,
version=self._version,
11 changes: 6 additions & 5 deletions narwhals/_arrow/group_by.py
@@ -71,10 +71,11 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:
output_names, aliases = evaluate_output_names_and_aliases(
expr, self.compliant, exclude
)

if expr._depth == 0:
md = expr._metadata
op_nodes_reversed = list(md.op_nodes_reversed())
if len(op_nodes_reversed) == 1:
# e.g. `agg(nw.len())`
if expr._function_name != "len": # pragma: no cover
if op_nodes_reversed[0].name != "len": # pragma: no cover
msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
raise AssertionError(msg)

Expand All @@ -85,8 +86,8 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame:

function_name = self._leaf_name(expr)
if function_name in {"std", "var"}:
assert "ddof" in expr._scalar_kwargs # noqa: S101
option: Any = pc.VarianceOptions(ddof=expr._scalar_kwargs["ddof"])
last_node = op_nodes_reversed[0]
option: Any = pc.VarianceOptions(**last_node.kwargs)
elif function_name in {"len", "n_unique"}:
option = pc.CountOptions(mode="all")
elif function_name == "count":