diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 2ae018ba79..7a48e7800a 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -88,7 +88,7 @@ jobs:
       - name: show-deps
         run: uv pip freeze
       - name: Run pytest
-        run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,modin[pyarrow],polars[eager],polars[lazy],dask,duckdb,sqlframe,ibis --durations=30
+        run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,modin[pyarrow],polars[eager],polars[lazy],dask,duckdb,sqlframe,ibis,daft --durations=30
       - name: Run doctests
         # reprs differ between versions, so we only run doctests on the latest Python
         if: matrix.python-version == '3.13'
diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py
index 487d22f29e..f304d049b3 100644
--- a/narwhals/_compliant/expr.py
+++ b/narwhals/_compliant/expr.py
@@ -69,7 +69,6 @@ class NativeExpr(Protocol):
     """

     def between(self, *args: Any, **kwds: Any) -> Any: ...
-    def isin(self, *args: Any, **kwds: Any) -> Any: ...


 class CompliantExpr(Protocol38[CompliantFrameT, CompliantSeriesOrNativeExprT_co]):
diff --git a/narwhals/_daft/__init__.py b/narwhals/_daft/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/narwhals/_daft/dataframe.py b/narwhals/_daft/dataframe.py
new file mode 100644
index 0000000000..6eba80036b
--- /dev/null
+++ b/narwhals/_daft/dataframe.py
@@ -0,0 +1,340 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Literal, cast
+
+import daft
+import daft.exceptions
+import daft.functions
+
+from narwhals._daft.utils import evaluate_exprs, lit, native_to_narwhals_dtype
+from narwhals._utils import (
+    Implementation,
+    ValidateBackendVersion,
+    Version,
+    check_column_names_are_unique,
+    not_implemented,
+    parse_columns_to_drop,
+)
+from narwhals.dependencies import get_daft
+from narwhals.exceptions import ColumnNotFoundError, DuplicateError
+from narwhals.typing import CompliantLazyFrame
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Mapping, Sequence
+    from types import ModuleType
+
+    from typing_extensions import Self, TypeIs
+
+    from narwhals._compliant.typing import CompliantDataFrameAny
+    from narwhals._daft.expr import DaftExpr
+    from narwhals._daft.group_by import DaftGroupBy
+    from narwhals._daft.namespace import DaftNamespace
+    from narwhals._utils import _LimitedContext
+    from narwhals.dataframe import LazyFrame
+    from narwhals.dtypes import DType
+    from narwhals.typing import JoinStrategy
+
+
+class DaftLazyFrame(
+    CompliantLazyFrame["DaftExpr", "daft.DataFrame", "LazyFrame[daft.DataFrame]"],
+    ValidateBackendVersion,
+):
+    _implementation = Implementation.DAFT
+
+    def __init__(
+        self,
+        native_dataframe: daft.DataFrame,
+        *,
+        version: Version,
+        validate_backend_version: bool = False,
+    ) -> None:
+        self._native_frame: daft.DataFrame = native_dataframe
+        self._version = version
+        self._cached_schema: dict[str, DType] | None = None
+        self._cached_columns: list[str] | None = None
+        if validate_backend_version:  # pragma: no cover
+            self._validate_backend_version()
+
+    @staticmethod
+    def _is_native(obj: daft.DataFrame | Any) -> TypeIs[daft.DataFrame]:
+        return isinstance(obj, daft.DataFrame)
+
+    @classmethod
+    def from_native(cls, data: daft.DataFrame, /, *, context: _LimitedContext) -> Self:
+        return cls(data, version=context._version)
+
+    def to_narwhals(self) -> LazyFrame[daft.DataFrame]:
+        return self._version.lazyframe(self, level="lazy")
+
+    def __native_namespace__(self) -> ModuleType:
+        return get_daft()  # type: ignore[no-any-return]
+
+    def __narwhals_namespace__(self) -> DaftNamespace:
+        from narwhals._daft.namespace import DaftNamespace
+
+        return DaftNamespace(version=self._version)
+
+    def __narwhals_lazyframe__(self) -> Self:
+        return self
+
+    def _with_version(self, version: Version) -> Self:
+        return self.__class__(self._native_frame, version=version)
+
+    def _with_native(self, df: daft.DataFrame) -> Self:
+        return self.__class__(df, version=self._version)
+
+    def _iter_columns(self) -> Iterator[daft.Expression]:
+        return iter(self._native_frame.columns)
+
+    @property
+    def columns(self) -> list[str]:
+        if self._cached_columns is None:
+            self._cached_columns = (
+                list(self.schema)
+                if self._cached_schema is not None
+                else self.native.column_names
+            )
+        return self._cached_columns
+
+    def collect(
+        self, backend: ModuleType | Implementation | str | None, **kwargs: Any
+    ) -> CompliantDataFrameAny:
+        if backend is None or backend is Implementation.PYARROW:
+            from narwhals._arrow.dataframe import ArrowDataFrame
+
+            return ArrowDataFrame(
+                native_dataframe=self._native_frame.to_arrow(),
+                validate_backend_version=True,
+                version=self._version,
+                validate_column_names=True,
+            )
+
+        if backend is Implementation.PANDAS:
+            from narwhals._pandas_like.dataframe import PandasLikeDataFrame
+
+            return PandasLikeDataFrame(
+                native_dataframe=self._native_frame.to_pandas(),
+                implementation=Implementation.PANDAS,
+                validate_backend_version=True,
+                version=self._version,
+                validate_column_names=True,
+            )
+
+        if backend is Implementation.POLARS:
+            import polars as pl  # ignore-banned-import
+
+            from narwhals._polars.dataframe import PolarsDataFrame
+
+            return PolarsDataFrame(
+                df=cast("pl.DataFrame", pl.from_arrow(self._native_frame.to_arrow())),
+                validate_backend_version=True,
+                version=self._version,
+            )
+
+        msg = f"Unsupported `backend` value: {backend}"  # pragma: no cover
+        raise ValueError(msg)  # pragma: no cover
+
+    def simple_select(self, *column_names: str) -> Self:
+        return self._with_native(self._native_frame.select(*column_names))
+
+    def aggregate(self, *exprs: DaftExpr) -> Self:
+        new_columns_map = evaluate_exprs(self, *exprs)
+        return self._with_native(
+            self._native_frame.agg([val.alias(col) for col, val in new_columns_map])
+        )
+
+    def select(self, *exprs: DaftExpr) -> Self:
+        new_columns_map = evaluate_exprs(self, *exprs)
+        if not new_columns_map:
+            msg = "At least one expression must be passed to LazyFrame.select"
+            raise ValueError(msg)
+        try:
+            return self._with_native(
+                self._native_frame.select(
+                    *(val.alias(col) for col, val in new_columns_map)
+                )
+            )
+        except daft.exceptions.DaftCoreException as e:
+            if "duplicate" in str(e):  # pragma: no cover
+                raise DuplicateError(e) from None
+            if "not found" in str(e):
+                msg = f"{e!s}\n\nHint: Did you mean one of these columns: {self.columns}?"
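+                # re-raise as narwhals' ColumnNotFoundError, hinting at the valid names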
+                raise ColumnNotFoundError(msg) from e
+            raise
+
+    def with_columns(self, *exprs: DaftExpr) -> Self:
+        new_columns_map = dict(evaluate_exprs(self, *exprs))
+        return self._with_native(self._native_frame.with_columns(new_columns_map))
+
+    def filter(self, predicate: DaftExpr) -> Self:
+        # `[0]` is safe as the predicate's expression only returns a single column
+        mask = predicate._call(self)[0]
+        return self._with_native(self._native_frame.filter(mask))
+
+    @property
+    def schema(self) -> dict[str, DType]:
+        if self._cached_schema is None:
+            # Note: prefer `self._cached_schema` over `functools.cached_property`
+            # due to Python 3.13 failures.
+            self._cached_schema = {
+                field.name: native_to_narwhals_dtype(field.dtype, self._version)
+                for field in (self._native_frame.schema())
+            }
+        return self._cached_schema
+
+    def collect_schema(self) -> dict[str, DType]:
+        return {
+            field.name: native_to_narwhals_dtype(field.dtype, self._version)
+            for field in self._native_frame.schema()
+        }
+
+    def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
+        columns_to_drop = parse_columns_to_drop(self, columns, strict=strict)
+        selection = [col for col in self.columns if col not in columns_to_drop]
+        return self._with_native(self._native_frame.select(*selection))
+
+    def head(self, n: int) -> Self:
+        return self._with_native(self._native_frame.limit(n))
+
+    def group_by(
+        self, keys: Sequence[str] | Sequence[DaftExpr], *, drop_null_keys: bool
+    ) -> DaftGroupBy:
+        from narwhals._daft.group_by import DaftGroupBy
+
+        return DaftGroupBy(self, keys, drop_null_keys=drop_null_keys)
+
+    def sort(self, *by: str, descending: bool | Sequence[bool], nulls_last: bool) -> Self:
+        return self._with_native(
+            self._native_frame.sort(
+                list(by),
+                desc=descending if isinstance(descending, bool) else list(descending),
+                nulls_first=not nulls_last,
+            )
+        )
+
+    def drop_nulls(self, subset: Sequence[str] | None) -> Self:
+        if subset:
+            return self._with_native(self._native_frame.drop_null(*subset))
+        return self._with_native(self._native_frame.drop_null())
+
+    def rename(self, mapping: Mapping[str, str]) -> Self:
+        selection = [
+            daft.col(col).alias(mapping[col]) if col in mapping else col
+            for col in self.columns
+        ]
+        return self._with_native(self.native.select(*selection))
+
+    def unique(self, subset: Sequence[str] | None, keep: str) -> Self:
+        # upstream issue:
+        # https://github.com/Eventual-Inc/Daft/issues/4151
+        if subset and set(subset) != set(self.columns):
+            msg = "`unique` with `subset` specified is not yet supported."
+            raise NotImplementedError(msg)
+        if keep == "none":
+            msg = "Only `keep='any'` is supported for `'daft'`."
+            raise NotImplementedError(msg)
+        return self._with_native(self._native_frame.unique())
+
+    def join(
+        self,
+        other: Self,
+        how: JoinStrategy,
+        left_on: Sequence[str] | None,
+        right_on: Sequence[str] | None,
+        suffix: str,
+    ) -> Self:
+        if how == "cross":
+            return self._with_native(
+                self.native.join(other.native, how="cross", prefix="", suffix=suffix)
+            )
+        left_columns = self.columns
+        right_columns = other.columns
+
+        right_on_ = list(right_on) if right_on is not None else []
+        left_on_ = list(left_on) if left_on is not None else []
+
+        # create a mapping for columns on other
+        # `right_on` columns will be renamed as `left_on`
+        # the remaining columns will either have the suffix added or be left unchanged.
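+        # (for `how="full"`, the suffix logic below also applies to the `right_on` keys:
+        # they stay on the right side, suffixed if they clash with a left column,
+        # rather than being collapsed into `left_on`)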
+        right_cols_to_rename = (
+            [c for c in right_columns if c not in right_on_]
+            if how != "full"
+            else right_columns
+        )
+
+        rename_mapping = {
+            **dict(zip(right_on_, left_on_)),
+            **{
+                colname: f"{colname}{suffix}" if colname in left_columns else colname
+                for colname in right_cols_to_rename
+            },
+        }
+        plx = self.__narwhals_namespace__()
+        other_native = other.select(
+            *[plx.col(old).alias(new) for old, new in rename_mapping.items()]
+        ).native
+        col_order = left_columns.copy()
+
+        if how in {"inner", "left", "cross"}:
+            col_order.extend(
+                rename_mapping[colname]
+                for colname in right_columns
+                if colname not in right_on_
+            )
+        elif how == "full":
+            col_order.extend(rename_mapping.values())
+
+        check_column_names_are_unique(col_order)
+
+        right_on_remapped = [rename_mapping[c] for c in right_on_]
+        how_native: Literal["inner", "left", "semi", "anti", "outer"] = (
+            "outer" if how == "full" else how
+        )
+
+        return self._with_native(
+            self.native.join(
+                other_native,
+                left_on=[daft.col(x) for x in left_on_],
+                right_on=[daft.col(x) for x in right_on_remapped],
+                how=how_native,
+            ).select(*col_order)
+        )
+
+    def unpivot(
+        self,
+        on: Sequence[str] | None,
+        index: Sequence[str] | None,
+        variable_name: str,
+        value_name: str,
+    ) -> Self:
+        index_ = [] if index is None else index
+        on_ = [c for c in self.columns if c not in index_] if on is None else on
+        return self._with_native(
+            self._native_frame.unpivot(
+                ids=index_, values=on_, variable_name=variable_name, value_name=value_name
+            )
+        )
+
+    def with_row_index(self, name: str, order_by: Sequence[str]) -> Self:
+        row_index_expr = (
+            (
+                daft.functions.row_number().over(
+                    daft.Window().partition_by(lit(1)).order_by(*order_by)
+                )
+                - 1
+            )
+            if order_by
+            else daft.functions.monotonically_increasing_id()
+        )
+        return self._with_native(
+            self.native.select(row_index_expr.alias(name), *self.columns)
+        )
+
+    gather_every = not_implemented.deprecated(
+        "`LazyFrame.gather_every` is deprecated and will be removed in a future version."
+    )
+    join_asof = not_implemented()
+    tail = not_implemented.deprecated(
+        "`LazyFrame.tail` is deprecated and will be removed in a future version."
+    )
+    explode = not_implemented()
diff --git a/narwhals/_daft/expr.py b/narwhals/_daft/expr.py
new file mode 100644
index 0000000000..a564df2565
--- /dev/null
+++ b/narwhals/_daft/expr.py
@@ -0,0 +1,612 @@
+from __future__ import annotations
+
+import operator
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Any, Callable, Literal, cast
+
+from daft import Window, coalesce, col, lit
+from daft.functions import row_number
+
+from narwhals._compliant import LazyExpr
+from narwhals._compliant.window import WindowInputs
+from narwhals._daft.expr_dt import DaftExprDateTimeNamespace
+from narwhals._daft.expr_str import DaftExprStringNamespace
+from narwhals._daft.expr_struct import DaftExprStructNamespace
+from narwhals._daft.utils import narwhals_to_native_dtype
+from narwhals._expression_parsing import (
+    ExprKind,
+    combine_alias_output_names,
+    combine_evaluate_output_names,
+)
+from narwhals._utils import Implementation, not_implemented
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from daft import Expression
+    from typing_extensions import Self
+
+    from narwhals._compliant.typing import (
+        AliasNames,
+        EvalNames,
+        EvalSeries,
+        WindowFunction,
+    )
+    from narwhals._daft.dataframe import DaftLazyFrame
+    from narwhals._daft.namespace import DaftNamespace
+    from narwhals._expression_parsing import ExprMetadata
+    from narwhals._utils import Version, _LimitedContext
+    from narwhals.dtypes import DType
+
+    DaftWindowFunction = WindowFunction[DaftLazyFrame, Expression]
+    DaftWindowInputs = WindowInputs[Expression]
+
+
+class DaftExpr(LazyExpr["DaftLazyFrame", "Expression"]):
+    _implementation = Implementation.DAFT
+
+    def __init__(
+        self,
+        call: Callable[[DaftLazyFrame], Sequence[Expression]],
+        window_function: DaftWindowFunction | None = None,
+        *,
+        evaluate_output_names: EvalNames[DaftLazyFrame],
+        alias_output_names: AliasNames | None,
+        version: Version,
+    ) -> None:
+        self._call = call
+        self._evaluate_output_names = evaluate_output_names
+        self._alias_output_names = alias_output_names
+        self._version = version
+        self._metadata: ExprMetadata | None = None
+        self._window_function: DaftWindowFunction | None = window_function
+
+    @property
+    def window_function(self) -> DaftWindowFunction:
+        def default_window_func(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> list[Expression]:
+            assert not window_inputs.order_by  # noqa: S101
+            return [
+                expr.over(
+                    self.partition_by(*window_inputs.partition_by).order_by(
+                        *window_inputs.order_by
+                    )
+                )
+                for expr in self(df)
+            ]
+
+        return self._window_function or default_window_func
+
+    def __call__(self, df: DaftLazyFrame) -> Sequence[Expression]:
+        return self._call(df)
+
+    def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Self:
+        if kind is ExprKind.LITERAL:
+            return self
+        return self.over([lit(1)], [])
+
+    def partition_by(self, *cols: Expression | str) -> Window:
+        return Window().partition_by(*cols or [lit(1)])
+
+    def __narwhals_expr__(self) -> None: ...
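+    # (`__narwhals_expr__` is a marker method: narwhals duck-types compliant
+    # expressions by checking for its presence)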
+
+    def __narwhals_namespace__(self) -> DaftNamespace:  # pragma: no cover
+        # Unused, just for compatibility with PandasLikeExpr
+        from narwhals._daft.namespace import DaftNamespace
+
+        return DaftNamespace(version=self._version)
+
+    def _with_window_function(self, window_function: DaftWindowFunction) -> Self:
+        return self.__class__(
+            self._call,
+            window_function,
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
+
+    @classmethod
+    def _alias_native(cls, expr: Expression, name: str) -> Expression:
+        return expr.alias(name)
+
+    def _cum_window_func(
+        self,
+        *,
+        reverse: bool,
+        func_name: Literal["sum", "max", "min", "count", "product"],
+    ) -> DaftWindowFunction:
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            window = (
+                self.partition_by(*inputs.partition_by)
+                .order_by(*inputs.order_by, desc=reverse, nulls_first=not reverse)
+                .rows_between(Window.unbounded_preceding, 0)
+            )
+            return [getattr(expr, func_name)().over(window) for expr in self._call(df)]
+
+        return func
+
+    def _rolling_window_func(
+        self,
+        func_name: Literal["sum", "mean", "std", "var"],
+        window_size: int,
+        min_samples: int,
+        ddof: int | None = None,
+        *,
+        center: bool,
+    ) -> DaftWindowFunction:
+        supported_funcs = ["sum", "mean", "std", "var"]
+        if center:
+            half = (window_size - 1) // 2
+            remainder = (window_size - 1) % 2
+            start = -half - remainder
+            end = half
+        else:
+            start = -window_size + 1
+            end = 0
+
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            window = (
+                self.partition_by(*inputs.partition_by)
+                .order_by(*inputs.order_by, nulls_first=True)
+                .rows_between(start, end)
+            )
+            if func_name in {"sum", "mean"}:
+                func_: str = func_name
+            elif func_name == "std" and ddof == 0:  # pragma: no cover. todo(marco)
+                func_ = "stddev"
+            elif func_name in {"var", "std"}:  # pragma: no cover
+                msg = f"Only ddof=0 and ddof=1 are currently supported for rolling_{func_name}."
+                raise ValueError(msg)
+            else:  # pragma: no cover
+                msg = f"Only the following functions are supported: {supported_funcs}.\nGot: {func_name}."
+                raise ValueError(msg)
+            return [
+                (expr.count().over(window) >= lit(min_samples)).if_else(
+                    getattr(expr, func_)().over(window), lit(None)
+                )
+                for expr in self._call(df)
+            ]
+
+        return func
+
+    @classmethod
+    def from_column_names(
+        cls: type[Self],
+        evaluate_column_names: EvalNames[DaftLazyFrame],
+        /,
+        *,
+        context: _LimitedContext,
+    ) -> Self:
+        def func(df: DaftLazyFrame) -> list[Expression]:
+            return [col(col_name) for col_name in evaluate_column_names(df)]
+
+        return cls(
+            func,
+            evaluate_output_names=evaluate_column_names,
+            alias_output_names=None,
+            version=context._version,
+        )
+
+    @classmethod
+    def from_column_indices(
+        cls: type[Self], *column_indices: int, context: _LimitedContext
+    ) -> Self:
+        def func(df: DaftLazyFrame) -> list[Expression]:
+            columns = df.columns
+            return [col(columns[i]) for i in column_indices]
+
+        return cls(
+            func,
+            evaluate_output_names=lambda df: [df.columns[i] for i in column_indices],
+            alias_output_names=None,
+            version=context._version,
+        )
+
+    @classmethod
+    def _from_elementwise_horizontal_op(
+        cls, func: Callable[[Iterable[Expression]], Expression], *exprs: Self
+    ) -> Self:
+        def call(df: DaftLazyFrame) -> list[Expression]:
+            cols = (col for _expr in exprs for col in _expr(df))
+            return [func(cols)]
+
+        def window_function(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> list[Expression]:
+            cols = (
+                col for _expr in exprs for col in _expr.window_function(df, window_inputs)
+            )
+            return [func(cols)]
+
+        context = exprs[0]
+        return cls(
+            call=call,
+            window_function=window_function,
+            evaluate_output_names=combine_evaluate_output_names(*exprs),
+            alias_output_names=combine_alias_output_names(*exprs),
+            version=context._version,
+        )
+
+    def _callable_to_eval_series(
+        self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
+    ) -> EvalSeries[DaftLazyFrame, Expression]:
+        def func(df: DaftLazyFrame) -> list[Expression]:
+            native_series_list = self(df)
+            other_native_series = {
+                key: df._evaluate_expr(value) if self._is_expr(value) else lit(value)
+                for key, value in expressifiable_args.items()
+            }
+            return [
+                call(native_series, **other_native_series)
+                for native_series in native_series_list
+            ]
+
+        return func
+
+    def _push_down_window_function(
+        self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
+    ) -> DaftWindowFunction:
+        def window_f(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> Sequence[Expression]:
+            # If a function `f` is elementwise, and `g` is another function, then
+            # - `f(g) over (window)`
+            # - `f(g over (window))`
+            # are equivalent.
+            # Make sure to only use this if `call` is elementwise!
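+            # e.g. with elementwise `f = lambda e: e + 1` and `g = col("a").sum()`,
+            # `(g + 1).over(w)` and `g.over(w) + 1` produce the same column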
+            native_series_list = self.window_function(df, window_inputs)
+            other_native_series = {
+                key: df._evaluate_window_expr(value, window_inputs)
+                if self._is_expr(value)
+                else lit(value)
+                for key, value in expressifiable_args.items()
+            }
+            return [
+                call(native_series, **other_native_series)
+                for native_series in native_series_list
+            ]
+
+        return window_f
+
+    def _with_callable(
+        self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
+    ) -> Self:
+        return self.__class__(
+            self._callable_to_eval_series(call, **expressifiable_args),
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
+
+    def _with_elementwise(
+        self, call: Callable[..., Expression], /, **expressifiable_args: Self | Any
+    ) -> Self:
+        return self.__class__(
+            self._callable_to_eval_series(call, **expressifiable_args),
+            self._push_down_window_function(call, **expressifiable_args),
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
+
+    def _with_binary(self, op: Callable[..., Expression], other: Self | Any) -> Self:
+        return self.__class__(
+            self._callable_to_eval_series(op, other=other),
+            self._push_down_window_function(op, other=other),
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
+
+    def _with_alias_output_names(self, func: AliasNames | None, /) -> Self:
+        return type(self)(
+            self._call,
+            self._window_function,
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=func,
+            version=self._version,
+        )
+
+    def __invert__(self) -> Self:
+        invert = cast("Callable[..., Expression]", operator.invert)
+        return self._with_elementwise(invert)
+
+    def __floordiv__(self, other: Self) -> Self:
+        return self._with_binary(lambda expr, other: (expr / other).floor(), other)
+
+    def __rfloordiv__(self, other: Self) -> Self:
+        return self._with_binary(lambda expr, other: (other / expr).floor(), other).alias(
+            "literal"
+        )
+
+    def all(self) -> Self:
+        def f(expr: Expression) -> Expression:
+            return coalesce(expr.bool_and(), lit(True))  # noqa: FBT003
+
+        def window_f(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> Sequence[Expression]:
+            return [
+                coalesce(
+                    expr.bool_and().over(self.partition_by(*window_inputs.partition_by)),
+                    lit(True),  # noqa: FBT003
+                )
+                for expr in self(df)
+            ]
+
+        return self._with_callable(f)._with_window_function(window_f)
+
+    def any(self) -> Self:
+        def f(expr: Expression) -> Expression:
+            return coalesce(expr.bool_or(), lit(False))  # noqa: FBT003
+
+        def window_f(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> Sequence[Expression]:
+            return [
+                coalesce(
+                    expr.bool_or().over(self.partition_by(*window_inputs.partition_by)),
+                    lit(False),  # noqa: FBT003
+                )
+                for expr in self(df)
+            ]
+
+        return self._with_callable(f)._with_window_function(window_f)
+
+    def cast(self, dtype: DType | type[DType]) -> Self:
+        def func(expr: Expression) -> Expression:
+            native_dtype = narwhals_to_native_dtype(dtype, self._version)
+            return expr.cast(native_dtype)
+
+        def window_f(df: DaftLazyFrame, inputs: DaftWindowInputs) -> list[Expression]:
+            native_dtype = narwhals_to_native_dtype(dtype, self._version)
+            return [expr.cast(native_dtype) for expr in self.window_function(df, inputs)]
+
+        return self._with_elementwise(func)._with_window_function(window_f)
+
+    def count(self) -> Self:
+        return self._with_elementwise(lambda _input: _input.count("valid"))
+
+    def abs(self) -> Self:
+        return self._with_elementwise(lambda _input: _input.abs())
+
+    def mean(self) -> Self:
+        return self._with_callable(lambda _input: _input.mean())
+
+    def quantile(
+        self,
+        quantile: float,
+        interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
+    ) -> Self:
+        if interpolation != "lower":
+            msg = "Only `interpolation='lower'` is supported for `quantile` for Daft."
+            raise NotImplementedError(msg)
+        return self._with_callable(lambda _input: _input.approx_percentiles(quantile))
+
+    def clip(
+        self, lower_bound: Any | None = None, upper_bound: Any | None = None
+    ) -> Self:
+        def _clip_lower(_input: Expression, lower_bound: Expression) -> Expression:
+            return _input.clip(lower_bound)
+
+        def _clip_upper(_input: Expression, upper_bound: Expression) -> Expression:
+            return _input.clip(max=upper_bound)
+
+        def _clip_both(
+            _input: Expression, lower_bound: Expression, upper_bound: Expression
+        ) -> Expression:
+            return _input.clip(lower_bound, upper_bound)
+
+        if lower_bound is None:
+            return self._with_elementwise(_clip_upper, upper_bound=upper_bound)
+        if upper_bound is None:
+            return self._with_elementwise(_clip_lower, lower_bound=lower_bound)
+        return self._with_elementwise(
+            _clip_both, lower_bound=lower_bound, upper_bound=upper_bound
+        )
+
+    def sum(self) -> Self:
+        def f(expr: Expression) -> Expression:
+            return coalesce(expr.sum(), lit(0))
+
+        def window_f(
+            df: DaftLazyFrame, window_inputs: DaftWindowInputs
+        ) -> Sequence[Expression]:
+            return [
+                coalesce(
+                    expr.sum().over(self.partition_by(*window_inputs.partition_by)),
+                    lit(0),
+                )
+                for expr in self(df)
+            ]
+
+        return self._with_callable(f)._with_window_function(window_f)
+
+    def n_unique(self) -> Self:
+        return self._with_callable(
+            lambda _input: _input.count_distinct() + _input.is_null().bool_or()
+        )
+
+    def over(
+        self, partition_by: Sequence[str | Expression], order_by: Sequence[str]
+    ) -> Self:
+        def func(df: DaftLazyFrame) -> Sequence[Expression]:
+            return self.window_function(df, WindowInputs(partition_by, order_by))
+
+        return self.__class__(
+            func,
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
+
+    def len(self) -> Self:
+        return self._with_callable(lambda _input: _input.count("all"))
+
+    def std(self, ddof: int) -> Self:
+        def func(expr: Expression) -> Expression:
+            std_pop = expr.stddev()
+            if ddof == 0:
+                return std_pop
+            n_samples = expr.count(mode="valid")
+            return std_pop * n_samples.sqrt() / (n_samples - ddof).sqrt()
+
+        return self._with_callable(func)
+
+    def var(self, ddof: int) -> Self:
+        def func(expr: Expression) -> Expression:
+            std_pop = expr.stddev()
+            var_pop = std_pop * std_pop
+            if ddof == 0:
+                return var_pop
+            n_samples = expr.count(mode="valid")
+            return var_pop * n_samples / (n_samples - ddof)
+
+        return self._with_callable(func)
+
+    def max(self) -> Self:
+        return self._with_callable(lambda _input: _input.max())
+
+    def min(self) -> Self:
+        return self._with_callable(lambda _input: _input.min())
+
+    def null_count(self) -> Self:
+        return self._with_callable(lambda _input: _input.is_null().cast("uint32").sum())
+
+    def is_null(self) -> Self:
+        return self._with_elementwise(lambda _input: _input.is_null())
+
+    def is_nan(self) -> Self:
+        return self._with_elementwise(lambda _input: _input.float.is_nan())
+
+    def shift(self, n: int) -> Self:
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            window = self.partition_by(*inputs.partition_by).order_by(
+                *inputs.order_by, nulls_first=True
+            )
+            return [expr.lag(n).over(window) for expr in self(df)]
+
+        return self._with_window_function(func)
+
+    def is_first_distinct(self) -> Self:
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            return [
+                row_number().over(
+                    self.partition_by(*inputs.partition_by, expr).order_by(
+                        *inputs.order_by, nulls_first=True
+                    )
+                )
+                == lit(1)
+                for expr in self(df)
+            ]
+
+        return self._with_window_function(func)
+
+    def is_last_distinct(self) -> Self:
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            return [
+                row_number().over(
+                    self.partition_by(*inputs.partition_by, expr).order_by(
+                        *inputs.order_by, desc=True, nulls_first=False
+                    )
+                )
+                == lit(1)
+                for expr in self(df)
+            ]
+
+        return self._with_window_function(func)
+
+    def diff(self) -> Self:
+        def func(df: DaftLazyFrame, inputs: DaftWindowInputs) -> Sequence[Expression]:
+            window = self.partition_by(*inputs.partition_by).order_by(
+                *inputs.order_by, nulls_first=True
+            )
+            return [expr - expr.lag(1).over(window) for expr in self(df)]
+
+        return self._with_window_function(func)
+
+    def cum_sum(self, *, reverse: bool) -> Self:
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="sum")
+        )
+
+    def cum_min(self, *, reverse: bool) -> Self:
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="min")
+        )
+
+    def cum_max(self, *, reverse: bool) -> Self:
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="max")
+        )
+
+    def cum_count(self, *, reverse: bool) -> Self:
+        return self._with_window_function(
+            self._cum_window_func(reverse=reverse, func_name="count")
+        )
+
+    def rolling_sum(self, window_size: int, *, min_samples: int, center: bool) -> Self:
+        return self._with_window_function(
+            self._rolling_window_func("sum", window_size, min_samples, center=center)
+        )
+
+    def rolling_mean(self, window_size: int, *, min_samples: int, center: bool) -> Self:
+        return self._with_window_function(
+            self._rolling_window_func("mean", window_size, min_samples, center=center)
+        )
+
+    def is_finite(self) -> Self:
+        return self._with_elementwise(
+            lambda _input: (_input > float("-inf")) & (_input < float("inf"))
+        )
+
+    def is_in(self, other: Sequence[Any]) -> Self:
+        return self._with_elementwise(lambda _input: _input.is_in(other))
+
+    def round(self, decimals: int) -> Self:
+        return self._with_elementwise(lambda _input: _input.round(decimals))
+
+    def fill_null(self, value: Self | Any, strategy: Any, limit: int | None) -> Self:
+        if strategy is not None:
+            msg = "todo"
+            raise NotImplementedError(msg)
+
+        return self._with_elementwise(
+            lambda _input, value: _input.fill_null(value), value=value
+        )
+
+    def log(self, base: float) -> Self:
+        return self._with_elementwise(lambda expr: expr.log(base=base))
+
+    def skew(self) -> Self:
+        return self._with_callable(lambda expr: expr.skew())
+
+    @property
+    def str(self) -> DaftExprStringNamespace:
+        return DaftExprStringNamespace(self)
+
+    @property
+    def dt(self) -> DaftExprDateTimeNamespace:
+        return DaftExprDateTimeNamespace(self)
+
+    @property
+    def list(self) -> Any:
+        msg = "todo"
+        raise NotImplementedError(msg)
+
+    @property
+    def struct(self) -> DaftExprStructNamespace:
+        return DaftExprStructNamespace(self)
+
+    drop_nulls = not_implemented()
+    rank = not_implemented()  # https://github.com/Eventual-Inc/Daft/issues/4290
+    median = not_implemented()  # https://github.com/Eventual-Inc/Daft/issues/3491
+    unique = not_implemented()
+    is_unique = not_implemented()
+    kurtosis = not_implemented()
+    exp = not_implemented()
+    sqrt = not_implemented()
+    cum_prod = not_implemented()
+    rolling_std = not_implemented()  # https://github.com/Eventual-Inc/Daft/issues/4464
+    rolling_var = not_implemented()  # https://github.com/Eventual-Inc/Daft/issues/4705
diff --git a/narwhals/_daft/expr_dt.py b/narwhals/_daft/expr_dt.py
new file mode 100644
index 0000000000..12a3e06faa
--- /dev/null
+++ b/narwhals/_daft/expr_dt.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from narwhals._compliant import LazyExprNamespace
+from narwhals._compliant.any_namespace import DateTimeNamespace
+from narwhals._utils import not_implemented
+
+if TYPE_CHECKING:
+    from narwhals._daft.expr import DaftExpr
+
+
+class DaftExprDateTimeNamespace(
+    LazyExprNamespace["DaftExpr"], DateTimeNamespace["DaftExpr"]
+):
+    def date(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.date())
+
+    def year(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.year())
+
+    def month(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.month())
+
+    def day(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.day())
+
+    def hour(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.hour())
+
+    def minute(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.minute())
+
+    def second(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.second())
+
+    def millisecond(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.millisecond())
+
+    def microsecond(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.microsecond())
+
+    def nanosecond(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.nanosecond())
+
+    def weekday(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.day_of_week() + 1)
+
+    def to_string(self, format: str | None) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.strftime(format))
+
+    def ordinal_day(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.dt.day_of_year())
+
+    convert_time_zone = not_implemented()
+    replace_time_zone = not_implemented()
+    timestamp = not_implemented()
+    truncate = not_implemented()
+    total_hours = not_implemented()
+    total_minutes = not_implemented()
+    total_seconds = not_implemented()
+    total_milliseconds = not_implemented()
+    total_microseconds = not_implemented()
+    total_nanoseconds = not_implemented()
diff --git a/narwhals/_daft/expr_str.py b/narwhals/_daft/expr_str.py
new file mode 100644
index 0000000000..b9da6b9197
--- /dev/null
+++ b/narwhals/_daft/expr_str.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import daft
+
+from narwhals._compliant import LazyExprNamespace
+from narwhals._compliant.any_namespace import StringNamespace
+from narwhals._utils import not_implemented
+
+if TYPE_CHECKING:
+    from daft import Expression
+
+    from narwhals._daft.expr import DaftExpr
+
+
+class DaftExprStringNamespace(LazyExprNamespace["DaftExpr"], StringNamespace["DaftExpr"]):
+    def starts_with(self, prefix: str) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.startswith(prefix))
+
+    def ends_with(self, suffix: str) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.endswith(suffix))
+
+    def contains(self, pattern: str, *, literal: bool) -> DaftExpr:
+        if not literal:
+            return self.compliant._with_elementwise(lambda expr: expr.str.match(pattern))
+        return self.compliant._with_elementwise(lambda expr: expr.str.contains(pattern))
+
+    def split(self, by: str) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.split(by))
+
+    def len_chars(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.length())
+
+    def to_date(self, format: str | None) -> DaftExpr:
+        if format is None:
+            return self.compliant._with_elementwise(lambda expr: expr.cast("date"))
+        return self.compliant._with_elementwise(lambda expr: expr.str.to_date(format))
+
+    def to_datetime(self, format: str | None) -> DaftExpr:
+        if format is None:
+            msg = "`format` must be specified for Daft in `to_datetime`."
+            raise ValueError(msg)
+        return self.compliant._with_elementwise(lambda expr: expr.str.to_datetime(format))
+
+    def to_lowercase(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.lower())
+
+    def to_uppercase(self) -> DaftExpr:
+        return self.compliant._with_elementwise(lambda expr: expr.str.upper())
+
+    def strip_chars(self, characters: str | None) -> DaftExpr:
+        if characters is None:
+            return self.compliant._with_elementwise(
+                lambda expr: expr.str.lstrip().str.rstrip()
+            )
+        msg = "`strip_chars` with `characters` is currently not supported for Daft"
+        raise NotImplementedError(msg)
+
+    def replace(self, pattern: str, value: str, *, literal: bool, n: int) -> DaftExpr:
+        msg = "`replace` is currently not supported for Daft"
+        raise NotImplementedError(msg)
+
+    def replace_all(self, pattern: str, value: str, *, literal: bool) -> DaftExpr:
+        return self.compliant._with_elementwise(
+            lambda expr: expr.str.replace(pattern, value, regex=not literal)
+        )
+
+    def slice(self, offset: int, length: int | None) -> DaftExpr:
+        offset_lit = daft.lit(offset).cast("uint64")
+
+        def func(expr: Expression) -> Expression:
+            length_expr = expr.str.length() if length is None else daft.lit(length)
+            offset_expr = (
+                expr.str.length() + offset_lit
+                if offset < 0
+                else daft.lit(offset).cast("uint64")
+            )
+            return expr.str.substr(offset_expr, length_expr)
+
+        return self.compliant._with_elementwise(func)
+
+    zfill = not_implemented()
diff --git a/narwhals/_daft/expr_struct.py b/narwhals/_daft/expr_struct.py
new file mode 100644
index 0000000000..7d3dde1ddd
--- /dev/null
+++ b/narwhals/_daft/expr_struct.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from narwhals._daft.expr import DaftExpr
+
+
+class DaftExprStructNamespace:
+    def __init__(self, expr: DaftExpr) -> None:
+        self.compliant = expr
+
+    def field(self, name: str) -> DaftExpr:
+        return self.compliant._with_elementwise(
+            lambda _input: _input.struct.get(name)
+        ).alias(name)
diff --git a/narwhals/_daft/group_by.py b/narwhals/_daft/group_by.py
new file mode 100644
index 0000000000..850c80a064
--- /dev/null
+++ b/narwhals/_daft/group_by.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from narwhals._compliant import LazyGroupBy
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from daft import Expression  # noqa: F401
+
+    from narwhals._daft.dataframe import DaftLazyFrame
+    from narwhals._daft.expr import DaftExpr
+
+
+class DaftGroupBy(LazyGroupBy["DaftLazyFrame", "DaftExpr", "Expression"]):
+    def __init__(
+        self,
+        df: DaftLazyFrame,
+        keys: Sequence[DaftExpr] | Sequence[str],
+        /,
+        *,
+        drop_null_keys: bool,
+    ) -> None:
+        frame, self._keys, self._output_key_names = self._parse_keys(df, keys=keys)
+        self._compliant_frame = frame.drop_nulls(self._keys) if drop_null_keys else frame
+
+    def agg(self, *exprs: DaftExpr) -> DaftLazyFrame:
+        result = (
+            self.compliant.native.groupby(*self._keys).agg(*agg_columns)
+            if (agg_columns := list(self._evaluate_exprs(exprs)))
+            else self.compliant.native.select(*self._keys).unique()
+        )
+
+        return self.compliant._with_native(result).rename(
+            dict(zip(self._keys, self._output_key_names))
+        )
diff --git a/narwhals/_daft/namespace.py b/narwhals/_daft/namespace.py
new file mode 100644
index 0000000000..78e14a6bda
--- /dev/null
+++ b/narwhals/_daft/namespace.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import operator
+from functools import reduce
+from typing import TYPE_CHECKING, Any
+
+import daft
+import daft.functions
+from daft import Expression
+
+from narwhals._compliant import LazyThen, LazyWhen
+from narwhals._compliant.namespace import LazyNamespace
+from narwhals._daft.dataframe import DaftLazyFrame
+from narwhals._daft.expr import DaftExpr
+from narwhals._daft.selectors import DaftSelectorNamespace
+from narwhals._daft.utils import lit, narwhals_to_native_dtype
+from narwhals._utils import Implementation, not_implemented
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from narwhals._daft.expr import DaftWindowInputs
+    from narwhals._utils import Version
+    from narwhals.dtypes import DType
+    from narwhals.typing import ConcatMethod
+
+
+class DaftNamespace(LazyNamespace[DaftLazyFrame, DaftExpr, daft.DataFrame]):
+    _implementation: Implementation = Implementation.DAFT
+
+    def __init__(self, *, version: Version) -> None:
+        self._version = version
+
+    @property
+    def selectors(self) -> DaftSelectorNamespace:
+        return DaftSelectorNamespace.from_namespace(self)
+
+    @property
+    def _expr(self) -> type[DaftExpr]:
+        return DaftExpr
+
+    @property
+    def _lazyframe(self) -> type[DaftLazyFrame]:
+        return DaftLazyFrame
+
+    def lit(self, value: Any, dtype: DType | type[DType] | None) -> DaftExpr:
+        def func(_df: DaftLazyFrame) -> list[Expression]:
+            if dtype is not None:
+                return [lit(value).cast(narwhals_to_native_dtype(dtype, self._version))]
+            return [lit(value)]
+
+        return DaftExpr(
+            func,
+            evaluate_output_names=lambda _df: ["literal"],
+            alias_output_names=None,
+            version=self._version,
+        )
+
+    def concat(
+        self, items: Iterable[DaftLazyFrame], *, how: ConcatMethod
+    ) -> DaftLazyFrame:
+        list_items = list(items)
+        native_items = (item._native_frame for item in list_items)
+        if how == "diagonal":
+            return DaftLazyFrame(
+                reduce(lambda x, y: x.union_all_by_name(y), native_items),
+                version=self._version,
+            )
+        first = list_items[0]
+        schema = first.schema
+        if how == "vertical" and not all(x.schema == schema for x in list_items[1:]):
+            msg = "inputs should all have the same schema"
+            raise TypeError(msg)
+        res = reduce(lambda x, y: x.union(y), native_items)
+        return first._with_native(res)
+
+    concat_str = not_implemented()
+
+    def all_horizontal(self, *exprs: DaftExpr, ignore_nulls: bool) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            it = (
+                (daft.coalesce(col, lit(True)) for col in cols)  # noqa: FBT003
+                if ignore_nulls
+                else cols
+            )
+            return reduce(operator.and_, it)
+
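+        # `_from_elementwise_horizontal_op` wires up both the plain evaluation and
+        # the windowed variant of this reduction (see `DaftExpr`)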
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def any_horizontal(self, *exprs: DaftExpr, ignore_nulls: bool) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            it = (
+                (daft.coalesce(col, lit(False)) for col in cols)  # noqa: FBT003
+                if ignore_nulls
+                else cols
+            )
+            return reduce(operator.or_, it)
+
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def sum_horizontal(self, *exprs: DaftExpr) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            return daft.functions.columns_sum(*cols)
+
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def max_horizontal(self, *exprs: DaftExpr) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            return daft.functions.columns_max(*cols)
+
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def min_horizontal(self, *exprs: DaftExpr) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            return daft.functions.columns_min(*cols)
+
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def mean_horizontal(self, *exprs: DaftExpr) -> DaftExpr:
+        def func(cols: Iterable[Expression]) -> Expression:
+            return daft.functions.columns_mean(*cols)
+
+        return self._expr._from_elementwise_horizontal_op(func, *exprs)
+
+    def when(self, predicate: DaftExpr) -> DaftWhen:
+        return DaftWhen.from_expr(predicate, context=self)
+
+    def len(self) -> DaftExpr:
+        def func(_df: DaftLazyFrame) -> list[Expression]:
+            if not _df.columns:  # pragma: no cover
+                msg = "Cannot use `nw.len()` on Daft DataFrame with zero columns"
+                raise ValueError(msg)
+            return [daft.col(_df.columns[0]).count(mode="all")]
+
+        return DaftExpr(
+            call=func,
+            evaluate_output_names=lambda _df: ["len"],
+            alias_output_names=None,
+            version=self._version,
+        )
+
+
+class DaftWhen(LazyWhen[DaftLazyFrame, Expression, DaftExpr]):
+    @property
+    def _then(self) -> type[DaftThen]:
+        return DaftThen
+
+    def __call__(self, df: DaftLazyFrame) -> Sequence[Expression]:
+        is_expr = self._condition._is_expr
+        condition = df._evaluate_expr(self._condition)
+        then_ = self._then_value
+        then = df._evaluate_expr(then_) if is_expr(then_) else lit(then_)
+        other_ = self._otherwise_value
+        if other_ is None:
+            result = condition.if_else(then, None)
+        else:
+            otherwise = df._evaluate_expr(other_) if is_expr(other_) else lit(other_)
+            result = condition.if_else(then, otherwise)
+        return [result]
+
+    def _window_function(
+        self, df: DaftLazyFrame, window_inputs: DaftWindowInputs
+    ) -> Sequence[Expression]:
+        is_expr = self._condition._is_expr
+        condition = df._evaluate_window_expr(self._condition, window_inputs)
+        then_ = self._then_value
+        then = (
+            df._evaluate_window_expr(then_, window_inputs)
+            if is_expr(then_)
+            else lit(then_)
+        )
+        other_ = self._otherwise_value
+        if other_ is None:
+            result = condition.if_else(then, lit(None))
+        else:
+            otherwise = (
+                df._evaluate_window_expr(other_, window_inputs)
+                if is_expr(other_)
+                else lit(other_)
+            )
+            result = condition.if_else(then, otherwise)
+        return [result]
+
+
+class DaftThen(LazyThen[DaftLazyFrame, Expression, DaftExpr], DaftExpr): ...
diff --git a/narwhals/_daft/selectors.py b/narwhals/_daft/selectors.py
new file mode 100644
index 0000000000..e4d5b5dbc7
--- /dev/null
+++ b/narwhals/_daft/selectors.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import daft
+
+from narwhals._compliant import CompliantSelector, LazySelectorNamespace
+from narwhals._daft.expr import DaftExpr
+
+if TYPE_CHECKING:
+    from narwhals._daft.dataframe import DaftLazyFrame  # noqa: F401
+
+
+class DaftSelectorNamespace(LazySelectorNamespace["DaftLazyFrame", daft.Expression]):
+    @property
+    def _selector(self) -> type[DaftSelector]:
+        return DaftSelector
+
+
+class DaftSelector(CompliantSelector["DaftLazyFrame", daft.Expression], DaftExpr):  # type: ignore[misc]
+    def _to_expr(self) -> DaftExpr:
+        return DaftExpr(
+            self._call,
+            evaluate_output_names=self._evaluate_output_names,
+            alias_output_names=self._alias_output_names,
+            version=self._version,
+        )
diff --git a/narwhals/_daft/utils.py b/narwhals/_daft/utils.py
new file mode 100644
index 0000000000..db49161a97
--- /dev/null
+++ b/narwhals/_daft/utils.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import daft
+import daft.datatype
+from daft import DataType
+
+from narwhals._utils import isinstance_or_issubclass
+
+if TYPE_CHECKING:
+    from narwhals._daft.dataframe import DaftLazyFrame
+    from narwhals._daft.expr import DaftExpr
+    from narwhals._utils import Version
+    from narwhals.dtypes import DType
+
+lit = daft.lit
+"""Alias for `daft.lit`."""
+
+
+def evaluate_exprs(
+    df: DaftLazyFrame, /, *exprs: DaftExpr
+) -> list[tuple[str, daft.Expression]]:
+    native_results: list[tuple[str, daft.Expression]] = []
+    for expr in exprs:
+        native_series_list = expr._call(df)
+        output_names = expr._evaluate_output_names(df)
+        if expr._alias_output_names is not None:
+            output_names = expr._alias_output_names(output_names)
+        if len(output_names) != len(native_series_list):  # pragma: no cover
+            msg = f"Internal error: got output names {output_names}, but only got {len(native_series_list)} results"
+            raise AssertionError(msg)
+        native_results.extend(zip(output_names, native_series_list))
+    return native_results
+
+
+def native_to_narwhals_dtype(daft_dtype: DataType, version: Version) -> DType:  # noqa: PLR0912,C901
+    dtypes = version.dtypes
+
+    if daft_dtype == DataType.int64():
+        return dtypes.Int64()
+    if daft_dtype == DataType.int32():
+        return dtypes.Int32()
+    if daft_dtype == DataType.int16():
+        return dtypes.Int16()
+    if daft_dtype == DataType.int8():
+        return dtypes.Int8()
+    if daft_dtype == DataType.uint64():
+        return dtypes.UInt64()
+    if daft_dtype == DataType.uint32():
+        return dtypes.UInt32()
+    if daft_dtype == DataType.uint16():
+        return dtypes.UInt16()
+    if daft_dtype == DataType.uint8():
+        return dtypes.UInt8()
+    if daft_dtype == DataType.float64():
+        return dtypes.Float64()
+    if daft_dtype == DataType.float32():
+        return dtypes.Float32()
+    if daft_dtype == DataType.string():
+        return dtypes.String()
+    if daft_dtype == DataType.date():
+        return dtypes.Date()
+    if daft_dtype == DataType.timestamp("us", None):
+        return dtypes.Datetime("us", None)
+    if daft_dtype == DataType.bool():
+        return dtypes.Boolean()
+    if daft_dtype == DataType.duration("us"):
+        return dtypes.Duration("us")
+    if daft_dtype == DataType.decimal128(1, 1):  # pragma: no cover
+        return dtypes.Decimal()
+    if DataType.is_fixed_size_list(daft_dtype):
+        return dtypes.Array(
+            native_to_narwhals_dtype(daft_dtype.dtype, version), daft_dtype.size
+        )
+    return dtypes.Unknown()  # pragma: no cover
+
+
+def narwhals_to_native_dtype(  # noqa: PLR0912,C901
+    dtype: DType | type[DType], version: Version
+) -> daft.DataType:
+    dtypes = version.dtypes
+    if dtype == dtypes.Float64:
+        return DataType.float64()
+    if dtype == dtypes.Float32:
+        return DataType.float32()
+    if dtype in {dtypes.Int128, dtypes.UInt128}:  # pragma: no cover
+        msg = "Converting to Int128/UInt128 is not (yet) supported for Daft."
+        raise NotImplementedError(msg)
+    if dtype == dtypes.Int64:
+        return DataType.int64()
+    if dtype == dtypes.Int32:
+        return DataType.int32()
+    if dtype == dtypes.Int16:
+        return DataType.int16()
+    if dtype == dtypes.Int8:
+        return DataType.int8()
+    if dtype == dtypes.UInt64:
+        return DataType.uint64()
+    if dtype == dtypes.UInt32:
+        return DataType.uint32()
+    if dtype == dtypes.UInt16:
+        return DataType.uint16()
+    if dtype == dtypes.UInt8:
+        return DataType.uint8()
+    if dtype == dtypes.String:
+        return DataType.string()
+    if dtype == dtypes.Boolean:
+        return DataType.bool()
+    if dtype == dtypes.Object:  # pragma: no cover
+        msg = "Converting to Object is not (yet) supported for Daft"
+        raise NotImplementedError(msg)
+    if dtype == dtypes.Categorical:
+        msg = "Converting to Categorical is not (yet) supported for Daft"
+        raise NotImplementedError(msg)
+    if dtype == dtypes.Enum:
+        msg = "Converting to Enum is not (yet) supported for Daft"
+        raise NotImplementedError(msg)
+    if dtype == dtypes.Date:
+        return DataType.date()
+    if dtype == dtypes.Time:
+        return DataType.time("ns")
+    if dtype == dtypes.Binary:
+        return DataType.binary()
+    if dtype == dtypes.Decimal:  # pragma: no cover
+        msg = "Casting to Decimal is not supported yet."
+        raise NotImplementedError(msg)
+    if isinstance_or_issubclass(dtype, dtypes.Datetime):
+        return DataType.timestamp(dtype.time_unit, dtype.time_zone)
+    if isinstance_or_issubclass(dtype, dtypes.Duration):
+        return DataType.duration(dtype.time_unit)
+    if isinstance_or_issubclass(dtype, dtypes.List):
+        return DataType.list(narwhals_to_native_dtype(dtype.inner, version))
+    if isinstance_or_issubclass(dtype, dtypes.Struct):
+        return DataType.struct(
+            {
+                field.name: narwhals_to_native_dtype(field.dtype, version)
+                for field in dtype.fields
+            }
+        )
+    if isinstance_or_issubclass(dtype, dtypes.Array):  # pragma: no cover
+        return DataType.fixed_size_list(
+            narwhals_to_native_dtype(dtype.inner, version), dtype.size
+        )
+    msg = f"Unknown dtype: {dtype}"  # pragma: no cover
+    raise AssertionError(msg)
diff --git a/narwhals/_namespace.py b/narwhals/_namespace.py
index 56670a0c59..8a96452d09 100644
--- a/narwhals/_namespace.py
+++ b/narwhals/_namespace.py
@@ -21,6 +21,7 @@
     get_pandas,
     get_polars,
     get_pyarrow,
+    is_daft_dataframe,
     is_dask_dataframe,
     is_duckdb_relation,
     is_ibis_table,
@@ -43,6 +44,7 @@
     from typing_extensions import Self, TypeAlias, TypeIs

     from narwhals._arrow.namespace import ArrowNamespace
+    from narwhals._daft.namespace import DaftNamespace
     from narwhals._dask.namespace import DaskNamespace
     from narwhals._duckdb.namespace import DuckDBNamespace
     from narwhals._ibis.namespace import IbisNamespace
@@ -59,18 +61,20 @@
     _Polars: TypeAlias = Literal["polars"]
     _Arrow: TypeAlias = Literal["pyarrow"]
     _Dask: TypeAlias = Literal["dask"]
+    _Daft: TypeAlias = Literal["daft"]
     _DuckDB: TypeAlias = Literal["duckdb"]
     _PandasLike: TypeAlias = Literal["pandas", "cudf", "modin"]
     _Ibis: TypeAlias = Literal["ibis"]
     _SparkLike: TypeAlias = Literal["pyspark", "sqlframe", "pyspark[connect]"]

     _EagerOnly: TypeAlias = "_PandasLike | _Arrow"
     _EagerAllowed: TypeAlias = "_Polars | _EagerOnly"
-    _LazyOnly: TypeAlias = "_SparkLike | _Dask | _DuckDB | _Ibis"
+    _LazyOnly: TypeAlias = "_Daft | _Dask | _DuckDB | _Ibis | _SparkLike"
     _LazyAllowed: TypeAlias = "_Polars | _LazyOnly"

     Polars: TypeAlias = Literal[_Polars, Implementation.POLARS]
     Arrow: TypeAlias = Literal[_Arrow, Implementation.PYARROW]
     Dask: TypeAlias = Literal[_Dask, Implementation.DASK]
+    Daft: TypeAlias = Literal[_Daft, Implementation.DAFT]
     DuckDB: TypeAlias = Literal[_DuckDB, Implementation.DUCKDB]
     Ibis: TypeAlias = Literal[_Ibis, Implementation.IBIS]
     PandasLike: TypeAlias = Literal[
@@ -84,7 +88,7 @@
     ]
     EagerOnly: TypeAlias = "PandasLike | Arrow"
     EagerAllowed: TypeAlias = "EagerOnly | Polars"
-    LazyOnly: TypeAlias = "SparkLike | Dask | DuckDB | Ibis"
+    LazyOnly: TypeAlias = "Daft | Dask | DuckDB | Ibis | SparkLike"
     LazyAllowed: TypeAlias = "LazyOnly | Polars"

     BackendName: TypeAlias = "_EagerAllowed | _LazyAllowed"
@@ -124,6 +128,9 @@ def where(self, cond: Any, other: Any = ..., **kwds: Any) -> Any: ...
     class _NativeDask(Protocol):
         _partition_type: type[pd.DataFrame]

+    class _NativeDaft(Protocol):
+        def explain(self, *args: Any, **kwds: Any) -> Any: ...
+
     class _CuDFDataFrame(_BasePandasLikeFrame, Protocol):
         def to_pylibcudf(self, *args: Any, **kwds: Any) -> Any: ...

@@ -160,7 +167,7 @@ class _ModinSeries(_BasePandasLikeSeries, Protocol):
         "_NativeSQLFrame | _NativePySpark | _NativePySparkConnect"
     )

-    NativeKnown: TypeAlias = "_NativePolars | _NativeArrow | _NativePandasLike | _NativeSparkLike | _NativeDuckDB | _NativeDask | _NativeIbis"
+    NativeKnown: TypeAlias = "_NativePolars | _NativeArrow | _NativePandasLike | _NativeSparkLike | _NativeDuckDB | _NativeDask | _NativeIbis | _NativeDaft"
     NativeUnknown: TypeAlias = (
         "NativeFrame | NativeSeries | NativeLazyFrame | DataFrameLike"
     )
@@ -224,6 +231,10 @@ def from_backend(cls, backend: DuckDB, /) -> Namespace[DuckDBNamespace]: ...
     @classmethod
     def from_backend(cls, backend: Dask, /) -> Namespace[DaskNamespace]: ...

+    @overload
+    @classmethod
+    def from_backend(cls, backend: Daft, /) -> Namespace[DaftNamespace]: ...
+
     @overload
     @classmethod
     def from_backend(cls, backend: Ibis, /) -> Namespace[IbisNamespace]: ...
@@ -288,6 +299,10 @@ def from_backend(
             from narwhals._ibis.namespace import IbisNamespace

             ns = IbisNamespace(version=version)
+        elif impl.is_daft():
+            from narwhals._daft.namespace import DaftNamespace
+
+            ns = DaftNamespace(version=version)
         else:
             msg = "Not supported Implementation"  # pragma: no cover
             raise AssertionError(msg)
@@ -329,6 +344,10 @@ def from_native_object(cls, native: _NativeDask, /) -> Namespace[DaskNamespace]:
     @classmethod
     def from_native_object(cls, native: _NativeIbis, /) -> Namespace[IbisNamespace]: ...

+    @overload
+    @classmethod
+    def from_native_object(cls, native: _NativeDaft, /) -> Namespace[DaftNamespace]: ...
+
     @overload
     @classmethod
     def from_native_object(
@@ -354,7 +373,7 @@ def from_native_object(
     ) -> Namespace[CompliantNamespaceAny]: ...

     @classmethod
-    def from_native_object(  # noqa: PLR0911
+    def from_native_object(  # noqa: PLR0911,C901
         cls: type[Namespace[Any]], native: NativeAny, /
     ) -> Namespace[Any]:
         if is_native_polars(native):
@@ -373,6 +392,8 @@ def from_native_object(  # noqa: PLR0911
             )
         elif is_native_dask(native):
             return cls.from_backend(Implementation.DASK)  # pragma: no cover
+        elif is_native_daft(native):
+            return cls.from_backend(Implementation.DAFT)  # pragma: no cover
         elif is_native_duckdb(native):
             return cls.from_backend(Implementation.DUCKDB)
         elif is_native_cudf(native):  # pragma: no cover
@@ -402,6 +423,10 @@ def is_native_dask(obj: Any) -> TypeIs[_NativeDask]:
     return is_dask_dataframe(obj)


+def is_native_daft(obj: Any) -> TypeIs[_NativeDaft]:
+    return is_daft_dataframe(obj)
+
+
 is_native_duckdb: _Guard[_NativeDuckDB] = is_duckdb_relation
 is_native_sqlframe: _Guard[_NativeSQLFrame] = is_sqlframe_dataframe
 is_native_pyspark: _Guard[_NativePySpark] = is_pyspark_dataframe
diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py
index 9520ea1630..65083f22ea 100644
--- a/narwhals/_spark_like/dataframe.py
+++ b/narwhals/_spark_like/dataframe.py
@@ -353,8 +353,8 @@ def join(
         left_columns = self.columns
         right_columns = other.columns

-        right_on_: list[str] = list(right_on) if right_on is not None else []
-        left_on_: list[str] = list(left_on) if left_on is not None else []
+        right_on_ = list(right_on) if right_on is not None else []
+        left_on_ = list(left_on) if left_on is not None else []

         # create a mapping for columns on other
         # `right_on` columns will be renamed as `left_on`
diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py
index 37f65e50f6..390dc4be20 100644
--- a/narwhals/_spark_like/expr.py
+++ b/narwhals/_spark_like/expr.py
@@ -142,7 +142,6 @@ def _sort(
         yield from (sort(col) for col in cols)

     def partition_by(self, *cols: Column | str) -> WindowSpec:
-        """Wraps `Window().paritionBy`, with default and `WindowInputs` handling."""
         return self._Window.partitionBy(*cols or [self._F.lit(1)])

     def __narwhals_expr__(self) -> None: ...
diff --git a/narwhals/_utils.py b/narwhals/_utils.py
index b748b0d053..05d05fbcd6 100644
--- a/narwhals/_utils.py
+++ b/narwhals/_utils.py
@@ -27,6 +27,7 @@
 from narwhals._typing_compat import assert_never, deprecated
 from narwhals.dependencies import (
     get_cudf,
+    get_daft,
     get_dask_dataframe,
     get_duckdb,
     get_ibis,
@@ -290,6 +291,8 @@ class Implementation(NoAutoEnum):
     """SQLFrame implementation."""
     PYSPARK_CONNECT = "pyspark[connect]"
     """PySpark Connect implementation."""
+    DAFT = "daft"
+    """Daft implementation."""
     UNKNOWN = "unknown"
     """Unknown implementation."""

@@ -320,6 +323,7 @@ def from_native_namespace(
             get_ibis(): Implementation.IBIS,
             get_sqlframe(): Implementation.SQLFRAME,
             get_pyspark_connect(): Implementation.PYSPARK_CONNECT,
+            get_daft(): Implementation.DAFT,
         }
         return mapping.get(native_namespace, Implementation.UNKNOWN)

@@ -586,6 +590,14 @@ def is_sqlframe(self) -> bool:
         """
         return self is Implementation.SQLFRAME  # pragma: no cover

+    def is_daft(self) -> bool:
+        """Return whether implementation is Daft.
+
+        Returns:
+            Boolean.
+ """ + return self is Implementation.DAFT # pragma: no cover + def _backend_version(self) -> tuple[int, ...]: """Returns backend version.""" return backend_version(self) @@ -603,6 +615,7 @@ def _backend_version(self) -> tuple[int, ...]: Implementation.DUCKDB: (1,), Implementation.IBIS: (6,), Implementation.SQLFRAME: (3, 22, 0), + Implementation.DAFT: (0, 4, 7), } _IMPLEMENTATION_TO_MODULE_NAME: Mapping[Implementation, str] = { @@ -610,6 +623,7 @@ def _backend_version(self) -> tuple[int, ...]: Implementation.MODIN: "modin.pandas", Implementation.PYSPARK: "pyspark.sql", Implementation.PYSPARK_CONNECT: "pyspark.sql.connect", + Implementation.DAFT: "daft", } """Stores non default mapping from Implementation to module name""" diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 483605f4f1..2a6abc7681 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: import cudf + import daft import dask.dataframe as dd import duckdb import ibis @@ -125,6 +126,11 @@ def get_sqlframe() -> Any: return sys.modules.get("sqlframe", None) +def get_daft() -> Any: + """Get daft module (if already imported - else return None).""" + return sys.modules.get("daft", None) + + def _raise_if_narwhals_df_or_lf(df: Any) -> None: if is_narwhals_dataframe(df) or is_narwhals_lazyframe(df): msg = ( @@ -371,6 +377,11 @@ def is_sqlframe_dataframe(df: Any) -> TypeIs[SQLFrameDataFrame]: return False # pragma: no cover +def is_daft_dataframe(df: Any) -> TypeIs[daft.DataFrame]: + """Check whether `df` is a Daft DataFrame without importing Daft.""" + return bool((daft := get_daft()) is not None and isinstance(df, daft.DataFrame)) + + def is_numpy_array(arr: Any | _NDArray[_ShapeT]) -> TypeIs[_NDArray[_ShapeT]]: """Check whether `arr` is a NumPy Array without importing NumPy.""" return (np := get_numpy()) is not None and isinstance(arr, np.ndarray) diff --git a/narwhals/functions.py b/narwhals/functions.py index 12d43e4a40..96f0bb7c07 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -763,6 +763,7 @@ def scan_csv( Implementation.DASK, Implementation.DUCKDB, Implementation.IBIS, + Implementation.DAFT, }: native_frame = native_namespace.read_csv(source, **kwargs) elif implementation is Implementation.PYARROW: @@ -854,6 +855,7 @@ def read_parquet( Implementation.CUDF, Implementation.DUCKDB, Implementation.IBIS, + Implementation.DAFT, }: native_frame = native_namespace.read_parquet(source, **kwargs) elif implementation is Implementation.PYARROW: @@ -961,6 +963,7 @@ def scan_parquet( Implementation.DASK, Implementation.DUCKDB, Implementation.IBIS, + Implementation.DAFT, }: native_frame = native_namespace.read_parquet(source, **kwargs) elif implementation is Implementation.PYARROW: diff --git a/narwhals/translate.py b/narwhals/translate.py index 497874aa7e..85b9631e7c 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -18,6 +18,7 @@ get_numpy, get_pandas, is_cupy_scalar, + is_daft_dataframe, is_dask_dataframe, is_duckdb_relation, is_ibis_table, @@ -539,6 +540,22 @@ def _from_native_impl( # noqa: C901, PLR0911, PLR0912, PLR0915 return native_object return ns_spark.compliant.from_native(native_object).to_narwhals() + # Daft + elif is_daft_dataframe(native_object): # pragma: no cover + if series_only or eager_only or eager_or_interchange_only: + if not pass_through: + msg = ( + "Cannot only use `series_only`, `eager_only` or `eager_or_interchange_only` " + "with Daft DataFrame" + ) + raise TypeError(msg) + return native_object + return ( + 
version.namespace.from_native_object(native_object) + .compliant.from_native(native_object) + .to_narwhals() + ) + # Interchange protocol elif _supports_dataframe_interchange(native_object): from narwhals._interchange.dataframe import InterchangeFrame diff --git a/pyproject.toml b/pyproject.toml index ffa51e3daa..f173237e1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,10 +45,11 @@ dask = ["dask[dataframe]>=2024.8"] duckdb = ["duckdb>=1.0"] ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] sqlframe = ["sqlframe>=3.22.0"] +daft = ["daft>=0.4.7"] [dependency-groups] core = [ - "narwhals[duckdb,pandas,polars,pyarrow,sqlframe]" + "narwhals[duckdb,pandas,polars,pyarrow,sqlframe,daft]" ] tests = [ "covdefaults", @@ -245,6 +246,8 @@ filterwarnings = [ "ignore:.*__array__ implementation doesn't accept a copy keyword, so passing copy=False failed:DeprecationWarning:modin", # raised internally by pandas "ignore:.*np.find_common_type is deprecated:DeprecationWarning:pandas", + # raised internally by daft + "ignore:.*datetime.datetime.utcnow.. is deprecated and scheduled for removal:DeprecationWarning:daft", # Warning raised when calling PandasLikeNamespace.from_arrow with old pyarrow "ignore:.*is_sparse is deprecated and will be removed in a future version.*:DeprecationWarning:pyarrow", ] diff --git a/tests/conftest.py b/tests/conftest.py index 26c8f114a7..c620bd2d13 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,7 @@ if TYPE_CHECKING: from collections.abc import Sequence + import daft import duckdb import ibis import pandas as pd @@ -220,6 +221,12 @@ def sqlframe_pyspark_lazy_constructor(obj: Data) -> SQLFrameDataFrame: # pragma return session.createDataFrame([*zip(*obj.values())], schema=[*obj.keys()]) +def daft_lazy_constructor(obj: dict[str, Any]) -> daft.DataFrame: # pragma: no cover + import daft + + return daft.from_pydict(obj) + + @lru_cache(maxsize=1) def _ibis_backend() -> IbisDuckDBBackend: # pragma: no cover """Cached (singleton) in-memory backend to ensure all tables exist within the same in-memory database.""" @@ -252,6 +259,7 @@ def ibis_lazy_constructor(obj: Data) -> ibis.Table: # pragma: no cover "duckdb": duckdb_lazy_constructor, "pyspark": pyspark_lazy_constructor, # type: ignore[dict-item] "sqlframe": sqlframe_pyspark_lazy_constructor, + "daft": daft_lazy_constructor, "ibis": ibis_lazy_constructor, } GPU_CONSTRUCTORS: dict[str, ConstructorEager] = {"cudf": cudf_constructor} @@ -289,6 +297,23 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ): continue # pragma: no cover + if ( + any( + x in str(metafunc.function) + for x in ( + "concat_str", + "is_unique", + "rank", + "truncate", + "kurtosis_", + "sqrt_", + "exp_", + ) + ) + and constructor == "daft" + ): + continue + if constructor in EAGER_CONSTRUCTORS: eager_constructors.append(EAGER_CONSTRUCTORS[constructor]) eager_constructors_ids.append(constructor) diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index af0c464e5b..6416b29c93 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -39,6 +39,8 @@ def test_arithmetic_expr( ) -> None: if "duckdb" in str(constructor) and attr == "__floordiv__": request.applymarker(pytest.mark.xfail) + if "daft" in str(constructor) and attr in {"__pow__"}: + request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): @@ -71,6 +73,8 @@ def 
test_right_arithmetic_expr( ) -> None: if "dask" in str(constructor) and DASK_VERSION < (2024, 10): pytest.skip() + if "daft" in str(constructor) and attr in {"__rpow__"}: + request.applymarker(pytest.mark.xfail) if attr == "__rmod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): @@ -191,14 +195,14 @@ def test_mod(constructor_eager: ConstructorEager, *, left: int, right: int) -> N @pytest.mark.parametrize( ("attr", "lhs", "expected"), [ - ("__add__", nw.lit(1), [2, 3, 5]), - ("__sub__", nw.lit(1), [0, -1, -3]), - ("__mul__", nw.lit(2), [2, 4, 8]), - ("__truediv__", nw.lit(2.0), [2.0, 1.0, 0.5]), - ("__truediv__", nw.lit(1), [1, 0.5, 0.25]), - ("__floordiv__", nw.lit(2), [2, 1, 0]), - ("__mod__", nw.lit(3), [0, 1, 3]), - ("__pow__", nw.lit(2), [2, 4, 16]), + ("__add__", 1, [2, 3, 5]), + ("__sub__", 1, [0, -1, -3]), + ("__mul__", 2, [2, 4, 8]), + ("__truediv__", 2.0, [2.0, 1.0, 0.5]), + ("__truediv__", 1, [1, 0.5, 0.25]), + ("__floordiv__", 2, [2, 1, 0]), + ("__mod__", 3, [0, 1, 3]), + ("__pow__", 2, [2, 4, 16]), ], ) def test_arithmetic_expr_left_literal( @@ -216,10 +220,12 @@ def test_arithmetic_expr_left_literal( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) + if "daft" in str(constructor) and (attr == "__pow__"): + request.applymarker(pytest.mark.xfail) data = {"a": [1.0, 2.0, 4.0]} df = nw.from_native(constructor(data)) - result = df.select(getattr(lhs, attr)(nw.col("a"))) + result = df.select(getattr(nw.lit(lhs), attr)(nw.col("a"))) assert_equal_data(result, {"literal": expected}) @@ -254,10 +260,14 @@ def test_arithmetic_series_left_literal( assert_equal_data(result, {"literal": expected}) -def test_std_broadcating(constructor: Constructor) -> None: +def test_std_broadcating( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): # `std(ddof=2)` fails for duckdb here pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) result = df.with_columns(b=nw.col("a").std()).sort("a") expected = {"a": [1, 2, 3], "b": [1.0, 1.0, 1.0]} diff --git a/tests/expr_and_series/binary_test.py b/tests/expr_and_series/binary_test.py index 6140ead120..8983d05dad 100644 --- a/tests/expr_and_series/binary_test.py +++ b/tests/expr_and_series/binary_test.py @@ -6,9 +6,11 @@ from tests.utils import DASK_VERSION, Constructor, assert_equal_data -def test_expr_binary(constructor: Constructor) -> None: +def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -> None: if "dask" in str(constructor) and DASK_VERSION < (2024, 10): pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df_raw = constructor(data) result = nw.from_native(df_raw).with_columns( diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index 914f2ecd91..285ed0e28b 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -59,6 +59,7 @@ SPARK_LIKE_INCOMPATIBLE_COLUMNS = {"e", "f", "g", "h", "o", "p"} DUCKDB_INCOMPATIBLE_COLUMNS = {"o"} IBIS_INCOMPATIBLE_COLUMNS = {"o"} +DAFT_INCOMPATIBLE_COLUMNS = {"o"} @pytest.mark.filterwarnings("ignore:casting period[M] values to int64:FutureWarning") @@ -77,6 +78,8 @@ def test_cast(constructor: Constructor, request: pytest.FixtureRequest) -> None: incompatible_columns = 
DUCKDB_INCOMPATIBLE_COLUMNS # pragma: no cover elif "ibis" in str(constructor): incompatible_columns = IBIS_INCOMPATIBLE_COLUMNS # pragma: no cover + elif "daft" in str(constructor): + incompatible_columns = DAFT_INCOMPATIBLE_COLUMNS # pragma: no cover else: incompatible_columns = set() @@ -178,7 +181,7 @@ def test_cast_string() -> None: def test_cast_raises_for_unknown_dtype( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if any(x in str(constructor) for x in ("duckdb", "daft")): request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): # Unsupported cast from string to dictionary using function cast_dictionary @@ -330,7 +333,8 @@ def test_cast_time(request: pytest.FixtureRequest, constructor: Constructor) -> pytest.skip() if any( - backend in str(constructor) for backend in ("dask", "pyspark", "modin", "cudf") + backend in str(constructor) + for backend in ("dask", "pyspark", "modin", "cudf", "daft") ): request.applymarker(pytest.mark.xfail) @@ -344,7 +348,7 @@ def test_cast_binary(request: pytest.FixtureRequest, constructor: Constructor) - if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): pytest.skip() - if any(backend in str(constructor) for backend in ("cudf", "dask", "modin")): + if any(backend in str(constructor) for backend in ("cudf", "dask", "modin", "daft")): request.applymarker(pytest.mark.xfail) data = {"a": ["test1", "test2"]} diff --git a/tests/expr_and_series/cum_prod_test.py b/tests/expr_and_series/cum_prod_test.py index d626403a83..67c4b5852e 100644 --- a/tests/expr_and_series/cum_prod_test.py +++ b/tests/expr_and_series/cum_prod_test.py @@ -65,8 +65,9 @@ def test_lazy_cum_prod_grouped( reverse: bool, expected_a: list[int], ) -> None: - if "pyarrow_table" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "daft")): # grouped window functions not yet supported + # https://github.com/Eventual-Inc/Daft/issues/4703 request.applymarker(pytest.mark.xfail) if "modin" in str(constructor): pytest.skip(reason="probably bugged") diff --git a/tests/expr_and_series/drop_nulls_test.py b/tests/expr_and_series/drop_nulls_test.py index d067552eb6..075a680753 100644 --- a/tests/expr_and_series/drop_nulls_test.py +++ b/tests/expr_and_series/drop_nulls_test.py @@ -32,7 +32,7 @@ def test_drop_nulls(constructor_eager: ConstructorEager) -> None: def test_drop_nulls_agg(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker(pytest.mark.xfail) data = { "A": [1, 2, None, 4], diff --git a/tests/expr_and_series/dt/convert_time_zone_test.py b/tests/expr_and_series/dt/convert_time_zone_test.py index 70cdb14372..2581c1ed46 100644 --- a/tests/expr_and_series/dt/convert_time_zone_test.py +++ b/tests/expr_and_series/dt/convert_time_zone_test.py @@ -30,7 +30,7 @@ def test_convert_time_zone( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) ): pytest.skip() - if any(x in str(constructor) for x in ("cudf", "duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("cudf", "duckdb", "pyspark", "ibis", "daft")): request.applymarker(pytest.mark.xfail) data = { "a": [ @@ -89,7 +89,7 @@ def test_convert_time_zone_from_none( or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) ): pytest.skip() - if any(x in str(constructor) for x in ("cudf", 
"duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("cudf", "duckdb", "pyspark", "ibis", "daft")): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 7): # polars used to disallow this diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index 8e204905ba..146e9cf23a 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -42,6 +42,8 @@ def test_duration_attributes( request.applymarker(pytest.mark.xfail) if "duckdb" in str(constructor) and attribute == "total_nanoseconds": request.applymarker(pytest.mark.xfail) + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/dt/replace_time_zone_test.py b/tests/expr_and_series/dt/replace_time_zone_test.py index 748347ef23..3d7c9b028d 100644 --- a/tests/expr_and_series/dt/replace_time_zone_test.py +++ b/tests/expr_and_series/dt/replace_time_zone_test.py @@ -29,8 +29,7 @@ def test_replace_time_zone( or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) ): pytest.skip() - - if any(x in str(constructor) for x in ("cudf", "pyspark", "ibis", "duckdb")): + if any(x in str(constructor) for x in ("cudf", "duckdb", "pyspark", "ibis", "daft")): request.applymarker(pytest.mark.xfail) data = { "a": [ @@ -49,7 +48,9 @@ def test_replace_time_zone( assert_equal_data(result_str, expected) -def test_replace_time_zone_none(constructor: Constructor) -> None: +def test_replace_time_zone_none( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if ( ("pyarrow" in str(constructor) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) @@ -57,6 +58,9 @@ def test_replace_time_zone_none(constructor: Constructor) -> None: or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) ): pytest.skip() + if "daft" in str(constructor): + # https://github.com/Eventual-Inc/Daft/issues/4096 + request.applymarker(pytest.mark.xfail) data = { "a": [ datetime(2020, 1, 1, tzinfo=timezone.utc), @@ -84,7 +88,7 @@ def test_replace_time_zone_series( or ("pyarrow_table" in str(constructor_eager) and PYARROW_VERSION < (12,)) ): pytest.skip() - if any(x in str(constructor_eager) for x in ("cudf",)): + if any(x in str(constructor_eager) for x in ("cudf", "daft")): request.applymarker(pytest.mark.xfail) data = { "a": [ diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index d1ccfa1534..bb2951c190 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -52,7 +52,7 @@ def test_timestamp_datetimes( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker( pytest.mark.xfail(reason="Backend timestamp conversion not yet implemented") ) @@ -96,7 +96,7 @@ def test_timestamp_datetimes_tz_aware( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker( pytest.mark.xfail(reason="Backend timestamp conversion not yet implemented") ) @@ -155,7 +155,7 @@ def 
test_timestamp_dates( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker( pytest.mark.xfail(reason="Backend timestamp conversion not yet implemented") ) @@ -182,7 +182,7 @@ def test_timestamp_dates( def test_timestamp_invalid_date( request: pytest.FixtureRequest, constructor: Constructor ) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker( pytest.mark.xfail(reason="Backend timestamp conversion not yet implemented") ) diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index 4fd4c14c66..a08a7e62db 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -111,7 +111,7 @@ def test_dt_to_string_iso_local_datetime_expr( expected: str, request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor) or "ibis" in str(constructor): + if any(x in str(constructor) for x in ("duckdb", "ibis")): request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) diff --git a/tests/expr_and_series/dt/truncate_test.py b/tests/expr_and_series/dt/truncate_test.py index 79eba4704a..3955ff61fb 100644 --- a/tests/expr_and_series/dt/truncate_test.py +++ b/tests/expr_and_series/dt/truncate_test.py @@ -53,7 +53,7 @@ def test_truncate( every: str, expected: list[datetime], ) -> None: - if any(x in str(constructor) for x in ("sqlframe", "pyspark")): + if any(x in str(constructor) for x in ("sqlframe", "pyspark", "daft")): # TODO(marco): investigate pyspark, it also localizes to UTC here. 
request.applymarker( pytest.mark.xfail(reason="https://github.com/eakmanrq/sqlframe/issues/383") @@ -109,7 +109,9 @@ def test_truncate_multiples( every: str, expected: list[datetime], ) -> None: - if any(x in str(constructor) for x in ("sqlframe", "cudf", "pyspark", "duckdb")): + if any( + x in str(constructor) for x in ("sqlframe", "cudf", "pyspark", "duckdb", "daft") + ): # Reasons: # - sqlframe: https://github.com/eakmanrq/sqlframe/issues/383 # - cudf: https://github.com/rapidsai/cudf/issues/18654 diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 492fd79ea6..c9c5110988 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -85,7 +85,7 @@ def test_fill_null_strategies_with_limit_as_none( ): pytest.skip() - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("daft", "ibis")): request.applymarker(pytest.mark.xfail) data_limits = { @@ -172,7 +172,7 @@ def test_fill_null_limits( ): pytest.skip() - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("daft", "ibis")): request.applymarker(pytest.mark.xfail) context: Any = ( @@ -370,7 +370,7 @@ def test_fill_null_series_exceptions(constructor_eager: ConstructorEager) -> Non def test_fill_null_strategies_with_partition_by( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask", "ibis")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask", "ibis", "daft")): request.applymarker(pytest.mark.xfail) if ("duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)) or ( diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index 42f07a91e5..573cc7a003 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -6,9 +6,13 @@ from tests.utils import DUCKDB_VERSION, Constructor, ConstructorEager, assert_equal_data -def test_is_duplicated_expr(constructor: Constructor) -> None: +def test_is_duplicated_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) @@ -17,9 +21,13 @@ def test_is_duplicated_expr(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_is_duplicated_w_nulls_expr(constructor: Constructor) -> None: +def test_is_duplicated_w_nulls_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 69e3614e2b..fd93963040 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -12,7 +12,7 @@ def test_is_finite_expr(constructor: Constructor) -> None: if any( x in str(constructor) - for x in ("polars", "pyarrow_table", "duckdb", "pyspark", "ibis") + for x in ("polars", "pyarrow_table", "duckdb", "pyspark", "ibis", "daft") ): expected = {"a": [False, False, True, None]} elif any( diff --git a/tests/expr_and_series/is_nan_test.py 
b/tests/expr_and_series/is_nan_test.py index 8027065a35..6e4b4141f4 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -24,7 +24,10 @@ ] -def test_nan(constructor: Constructor) -> None: +def test_nan(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "daft" in str(constructor): + # missing pow https://github.com/Eventual-Inc/Daft/issues/3793 + request.applymarker(pytest.mark.xfail) data_na = {"int": [-1, 1, None]} df = nw.from_native(constructor(data_na)).with_columns( float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") ** 0.5 @@ -110,6 +113,9 @@ def test_nan_non_float(constructor: Constructor, request: pytest.FixtureRequest) exc = ( ArrowNotImplementedError if "pyarrow_table" in str(constructor) + # Daft raises its own error + else Exception + if "daft" in str(constructor) else InvalidOperationError ) diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index 2df0518360..b247cd91a1 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -6,9 +6,11 @@ from tests.utils import DUCKDB_VERSION, Constructor, ConstructorEager, assert_equal_data -def test_is_unique_expr(constructor: Constructor) -> None: +def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/list/len_test.py b/tests/expr_and_series/list/len_test.py index dbe31acab9..640baac5b0 100644 --- a/tests/expr_and_series/list/len_test.py +++ b/tests/expr_and_series/list/len_test.py @@ -11,7 +11,7 @@ def test_len_expr(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(backend in str(constructor) for backend in ("dask", "modin", "cudf")): + if any(backend in str(constructor) for backend in ("dask", "modin", "cudf", "daft")): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): diff --git a/tests/expr_and_series/median_test.py b/tests/expr_and_series/median_test.py index fff8aee7a5..6369062c6c 100644 --- a/tests/expr_and_series/median_test.py +++ b/tests/expr_and_series/median_test.py @@ -20,7 +20,7 @@ def test_median_expr( constructor: Constructor, expr: nw.Expr, request: pytest.FixtureRequest ) -> None: - if "dask_lazy_p2" in str(constructor): + if any(x in str(constructor) for x in ("dask", "daft")): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(expr) @@ -41,11 +41,7 @@ def test_median_series( def test_median_expr_raises_on_str( constructor: Constructor, expr: nw.Expr, request: pytest.FixtureRequest ) -> None: - if ( - ("pyspark" in str(constructor)) - or "duckdb" in str(constructor) - or "ibis" in str(constructor) - ): + if any(x in str(constructor) for x in ("pyspark", "duckdb", "ibis", "daft")): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index ff870fa050..69aa8afb7a 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -55,6 +55,9 @@ def test_over_std_var(request: pytest.FixtureRequest, constructor: Constructor) if "cudf" in str(constructor): # 
https://github.com/rapidsai/cudf/issues/18159
        request.applymarker(pytest.mark.xfail)
+    if "daft" in str(constructor):
+        # https://github.com/Eventual-Inc/Daft/issues/4464
+        request.applymarker(pytest.mark.xfail)
    if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3):
        pytest.skip()
@@ -435,7 +438,7 @@ def test_len_over_2369(constructor: Constructor, request: pytest.FixtureRequest)
def test_over_quantile(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if "pyarrow_table" in str(constructor) or "pyspark" in str(constructor):
+    if any(x in str(constructor) for x in ("pyarrow_table", "pyspark", "daft")):
        request.applymarker(pytest.mark.xfail)
    data = {"a": [1, 2, 3, 4, 5, 6], "b": ["x", "x", "x", "y", "y", "y"]}
diff --git a/tests/expr_and_series/pipe_test.py b/tests/expr_and_series/pipe_test.py
index d32743e1ea..71396bc0ec 100644
--- a/tests/expr_and_series/pipe_test.py
+++ b/tests/expr_and_series/pipe_test.py
@@ -9,7 +9,7 @@ def test_pipe_expr(constructor: Constructor) -> None:
    df = nw.from_native(constructor(input_list))
-    e = df.select(nw.col("a").pipe(lambda x: x**2))
+    e = df.select(nw.col("a").pipe(lambda x: x * x))
    assert_equal_data(e, {"a": expected})
diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py
index 7ba38a5ee5..8e4987a420 100644
--- a/tests/expr_and_series/quantile_test.py
+++ b/tests/expr_and_series/quantile_test.py
@@ -31,6 +31,8 @@ def test_quantile_expr(
        and interpolation != "linear"
    ) or "pyspark" in str(constructor):
        request.applymarker(pytest.mark.xfail)
+    if "daft" in str(constructor) and interpolation != "lower":
+        request.applymarker(pytest.mark.xfail)
    q = 0.3
    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
@@ -46,6 +48,10 @@ def test_quantile_expr(
        else does_not_raise()
    )
+    if "daft" in str(constructor):
+        # very approximate 'lower' :smile:
+        expected = {"a": [0.99], "b": [4.014835333028612], "z": [7.028793021534831]}
+
    with context:
        result = df.select(nw.all().quantile(quantile=q, interpolation=interpolation))
        assert_equal_data(result, expected)
diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py
index 51f98783a9..8dc181d237 100644
--- a/tests/expr_and_series/rank_test.py
+++ b/tests/expr_and_series/rank_test.py
@@ -223,6 +223,8 @@ def test_lazy_rank_expr_desc(
    if "dask" in str(constructor):
        # `rank` is not implemented in Dask
        request.applymarker(pytest.mark.xfail)
+    if "daft" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
    context = (
        pytest.raises(
diff --git a/tests/expr_and_series/replace_strict_test.py b/tests/expr_and_series/replace_strict_test.py
index d2072e8018..fe5b61178b 100644
--- a/tests/expr_and_series/replace_strict_test.py
+++ b/tests/expr_and_series/replace_strict_test.py
@@ -27,6 +27,8 @@ def test_replace_strict(
        or "ibis" in str(constructor)
    ):
        request.applymarker(pytest.mark.xfail)
+    if "daft" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
    df = nw.from_native(constructor({"a": [1, 2, 3]}))
    result = df.select(
        nw.col("a").replace_strict(
@@ -60,11 +62,7 @@ def test_replace_non_full(
) -> None:
    if "dask" in str(constructor):
        request.applymarker(pytest.mark.xfail)
-    if (
-        ("pyspark" in str(constructor))
-        or "duckdb" in str(constructor)
-        or "ibis" in str(constructor)
-    ):
+    if any(x in str(constructor) for x in ("pyspark", "duckdb", "ibis", "daft")):
        request.applymarker(pytest.mark.xfail)
    df = nw.from_native(constructor({"a": [1, 2, 3]}))
    if isinstance(df, nw.LazyFrame):
@@ -85,11 +83,7 @@ def 
test_replace_strict_mapping( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) - if ( - ("pyspark" in str(constructor)) - or "duckdb" in str(constructor) - or "ibis" in str(constructor) - ): + if any(x in str(constructor) for x in ("pyspark", "duckdb", "ibis", "daft")): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) diff --git a/tests/expr_and_series/rolling_std_test.py b/tests/expr_and_series/rolling_std_test.py index f027b450d9..d886bf9305 100644 --- a/tests/expr_and_series/rolling_std_test.py +++ b/tests/expr_and_series/rolling_std_test.py @@ -195,6 +195,7 @@ def test_rolling_std_expr_lazy_ungrouped( *, center: bool, ddof: int, + request: pytest.FixtureRequest, ) -> None: if ("polars" in str(constructor) and POLARS_VERSION < (1, 10)) or ( "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3) @@ -206,6 +207,9 @@ def test_rolling_std_expr_lazy_ungrouped( if "dask" in str(constructor) and ddof != 1: # Only `ddof=1` is supported pytest.skip() + if any(x in str(constructor) for x in ("daft",)): + # https://github.com/Eventual-Inc/Daft/issues/4464 + request.applymarker(pytest.mark.xfail) data = { "a": [1, None, 2, None, 4, 6, 11], "b": [1, None, 2, 3, 4, 5, 6], @@ -315,7 +319,7 @@ def test_rolling_std_expr_lazy_grouped( or ("pandas" in str(constructor) and PANDAS_VERSION < (1, 2)) ): pytest.skip() - if any(x in str(constructor) for x in ("dask", "pyarrow_table")): + if any(x in str(constructor) for x in ("dask", "pyarrow_table", "daft")): request.applymarker(pytest.mark.xfail) if "modin" in str(constructor): # unreliable diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 3c13663bdc..35c8fb3c82 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -201,6 +201,7 @@ def test_rolling_var_expr_lazy_ungrouped( *, center: bool, ddof: int, + request: pytest.FixtureRequest, ) -> None: if ("polars" in str(constructor) and POLARS_VERSION < (1, 10)) or ( "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3) @@ -211,7 +212,11 @@ def test_rolling_var_expr_lazy_ungrouped( pytest.skip() if "dask" in str(constructor) and ddof != 1: # Only `ddof=1` is supported - pytest.skip() + request.applymarker(pytest.mark.xfail) + if any(x in str(constructor) for x in ("daft",)): + # https://github.com/Eventual-Inc/Daft/issues/4705 + request.applymarker(pytest.mark.xfail) + data = { "a": [1, None, 2, None, 4, 6, 11], "b": [1, None, 2, 3, 4, 5, 6], @@ -273,7 +278,7 @@ def test_rolling_var_expr_lazy_grouped( or ("pandas" in str(constructor) and PANDAS_VERSION < (1, 2)) ): pytest.skip() - if any(x in str(constructor) for x in ("dask", "pyarrow_table")): + if any(x in str(constructor) for x in ("dask", "pyarrow_table", "daft")): request.applymarker(pytest.mark.xfail) if "modin" in str(constructor): # unreliable diff --git a/tests/expr_and_series/skew_test.py b/tests/expr_and_series/skew_test.py index e2495ee58f..a399f82d8d 100644 --- a/tests/expr_and_series/skew_test.py +++ b/tests/expr_and_series/skew_test.py @@ -32,7 +32,6 @@ def test_skew_series( ([0.0, 0.0, 0.0], None), ([1, 2, 3, 2, 1], 0.343622), ], - ids=range(5), ) @pytest.mark.filterwarnings("ignore:.*invalid value:RuntimeWarning:dask") def test_skew_expr( @@ -41,11 +40,11 @@ def test_skew_expr( expected: float | None, request: pytest.FixtureRequest, ) -> None: - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("ibis",)): # 
https://github.com/ibis-project/ibis/issues/11176 request.applymarker(pytest.mark.xfail) - if "pyspark" in str(constructor) and int(request.node.callspec.id[-1]) == 0: + if any(x in str(constructor) for x in ("pyspark", "ibis", "daft")) and not data: # Can not infer schema from empty dataset. pytest.skip() diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 61aa2e060f..84f705ef94 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -11,7 +11,7 @@ def test_contains_case_insensitive( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "cudf" in str(constructor): + if any(x in str(constructor) for x in ("cudf",)): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -22,12 +22,7 @@ def test_contains_case_insensitive( assert_equal_data(result, expected) -def test_contains_series_case_insensitive( - constructor_eager: ConstructorEager, request: pytest.FixtureRequest -) -> None: - if "cudf" in str(constructor_eager): - request.applymarker(pytest.mark.xfail) - +def test_contains_series_case_insensitive(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.select(case_insensitive_match=df["pets"].str.contains("(?i)parrot|Dove")) expected = {"case_insensitive_match": [False, False, True, True, True, None]} @@ -51,13 +46,16 @@ def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) -> def test_contains_literal(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select( - nw.col("pets").str.contains("Parrot|dove").alias("default_match"), - nw.col("pets").str.contains("Parrot|dove", literal=True).alias("literal_match"), + nw.col("pets").str.contains("Parrot|dove", literal=True).alias("literal_match") ) - expected = { - "default_match": [False, False, False, True, True, None], - "literal_match": [False, False, False, False, True, None], - } + expected = {"literal_match": [False, False, False, False, True, None]} + assert_equal_data(result, expected) + + +def test_contains_non_literal(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select(nw.col("pets").str.contains("Parrot|dove").alias("default_match")) + expected = {"default_match": [False, False, False, True, True, None]} assert_equal_data(result, expected) @@ -65,10 +63,12 @@ def test_contains_series_literal(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) result = df.select( default_match=df["pets"].str.contains("Parrot|dove"), - literal_match=df["pets"].str.contains("Parrot|dove", literal=True), + literal_match1=df["pets"].str.contains("Parrot|dove", literal=True), + literal_match2=df["pets"].str.contains("|dove", literal=True), ) expected = { "default_match": [False, False, False, True, True, None], - "literal_match": [False, False, False, False, True, None], + "literal_match1": [False, False, False, False, True, None], + "literal_match2": [False, False, False, False, True, None], } assert_equal_data(result, expected) diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index a82813b62c..4e3917838b 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -83,11 +83,7 @@ def test_str_replace_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: - if ( - 
("pyspark" in str(constructor)) - or "duckdb" in str(constructor) - or "ibis" in str(constructor) - ): + if any(x in str(constructor) for x in ("pyspark", "duckdb", "daft", "ibis")): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_df = df.select( diff --git a/tests/expr_and_series/str/strip_chars_test.py b/tests/expr_and_series/str/strip_chars_test.py index 8944bce4de..ecccd1caa8 100644 --- a/tests/expr_and_series/str/strip_chars_test.py +++ b/tests/expr_and_series/str/strip_chars_test.py @@ -20,7 +20,7 @@ def test_str_strip_chars( characters: str | None, expected: Any, ) -> None: - if "ibis" in str(constructor) and characters is not None: + if any(x in str(constructor) for x in ("daft", "ibis")) and characters: request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index f3f8ec7538..bc9957a88c 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -91,6 +91,7 @@ def test_to_datetime_infer_fmt( ("polars" in str(constructor) and str(data["a"][0]).isdigit()) or "duckdb" in str(constructor) or ("pyspark" in str(constructor) and data["a"][0] == "20240101123456") + or "daft" in str(constructor) or "ibis" in str(constructor) ): request.applymarker(pytest.mark.xfail) @@ -151,7 +152,7 @@ def test_to_datetime_series_infer_fmt( def test_to_datetime_infer_fmt_from_date( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor) or "ibis" in str(constructor): + if any(x in str(constructor) for x in ("daft", "duckdb", "ibis")): request.applymarker(pytest.mark.xfail) data = {"z": ["2020-01-01", "2020-01-02", None]} if "pyspark" in str(constructor): @@ -216,6 +217,9 @@ def test_to_datetime_tz_aware( if "cudf" in str(constructor): # cuDF does not yet support timezone-aware datetimes request.applymarker(pytest.mark.xfail) + if "daft" in str(constructor): + # https://github.com/Eventual-Inc/Daft/issues/4220 + request.applymarker(pytest.mark.xfail) context = ( pytest.raises(NotImplementedError) if any(x in str(constructor) for x in ("duckdb", "ibis")) and format is None diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index ba1bb7de6f..55a7c7beaf 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -36,6 +36,7 @@ def test_str_to_uppercase( "polars", "cudf", "pyspark", + "daft", ) ) and "ẞ" in expected["a"][0] diff --git a/tests/expr_and_series/str/zfill_test.py b/tests/expr_and_series/str/zfill_test.py index 7ecc3d36fd..4fcca92f14 100644 --- a/tests/expr_and_series/str/zfill_test.py +++ b/tests/expr_and_series/str/zfill_test.py @@ -34,6 +34,9 @@ def test_str_zfill(request: pytest.FixtureRequest, constructor: Constructor) -> "in `expr.str.slice(1, length)`" ) pytest.skip(reason=reason) + if "daft" in str(constructor): + # TODO(unassigned): implement zfill for daft + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.zfill(3)) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 5dfb610ed9..1bc762db27 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -9,7 +9,8 @@ def test_unary(constructor: Constructor, request: 
pytest.FixtureRequest) -> None: - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("ibis", "daft")): + # ibis misses skew, daft misses median request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8.0, None], "z": [7.0, 8.0, 9.0]} @@ -71,7 +72,8 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: def test_unary_two_elements( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("ibis",)): + # ibis misses skew request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} result = nw.from_native(constructor(data)).select( @@ -120,7 +122,8 @@ def test_unary_one_element( ) -> None: if "pyspark" in str(constructor) and "sqlframe" not in str(constructor): request.applymarker(pytest.mark.xfail) - if "ibis" in str(constructor): + if any(x in str(constructor) for x in ("ibis",)): + # ibis misses skew request.applymarker(pytest.mark.xfail) data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. @@ -156,7 +159,7 @@ def test_unary_one_element( def test_unary_one_element_series(constructor_eager: ConstructorEager) -> None: data = {"a": [1], "b": [2], "c": [None]} - df = nw.from_native(constructor_eager(data)) + df = nw.from_native(constructor_eager(data), eager_only=True) result = { "a_nunique": [df["a"].n_unique()], "a_skew": [df["a"].skew()], diff --git a/tests/expr_and_series/unique_test.py b/tests/expr_and_series/unique_test.py index ceb4cc874f..bdc30b8d0b 100644 --- a/tests/expr_and_series/unique_test.py +++ b/tests/expr_and_series/unique_test.py @@ -29,7 +29,7 @@ def test_unique_expr(constructor: Constructor) -> None: def test_unique_expr_agg( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis")): + if any(x in str(constructor) for x in ("duckdb", "pyspark", "ibis", "daft")): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").unique().sum()) diff --git a/tests/frame/add_test.py b/tests/frame/add_test.py index 166be83fd6..a19d15ed5f 100644 --- a/tests/frame/add_test.py +++ b/tests/frame/add_test.py @@ -6,9 +6,11 @@ from tests.utils import DUCKDB_VERSION, Constructor, assert_equal_data -def test_add(constructor: Constructor) -> None: +def test_add(constructor: Constructor, request: pytest.FixtureRequest) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): pytest.skip() + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) result = df.with_columns( diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 9b1ca72887..0106a6a88d 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -33,6 +33,8 @@ def test_concat_horizontal(constructor_eager: ConstructorEager) -> None: nw.concat([df_left.lazy()], how="horizontal") +# Warning raised internally by Modin. 
+@pytest.mark.filterwarnings("ignore:When grouping with a length-1:FutureWarning") def test_concat_vertical(constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df_left = ( @@ -42,8 +44,8 @@ def test_concat_vertical(constructor: Constructor) -> None: data_right = {"c": [6, 12, -1], "d": [0, -4, 2]} df_right = nw.from_native(constructor(data_right)).lazy() - result = nw.concat([df_left, df_right], how="vertical") - expected = {"c": [1, 3, 2, 6, 12, -1], "d": [4, 4, 6, 0, -4, 2]} + result = nw.concat([df_left, df_right], how="vertical").sort("c") + expected = {"c": [-1, 1, 2, 3, 6, 12], "d": [2, 4, 6, 4, 0, -4]} assert_equal_data(result, expected) with pytest.raises(ValueError, match="No items"): @@ -61,23 +63,30 @@ def test_concat_vertical(constructor: Constructor) -> None: nw.concat([df_left, df_left.select("d")], how="vertical").collect() +# Warning raised internally by Modin. +@pytest.mark.filterwarnings( + "ignore:The behavior of DataFrame concatenation with empty:FutureWarning" +) +@pytest.mark.filterwarnings( + "ignore:When grouping with a length-1 list-like:FutureWarning" +) def test_concat_diagonal( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor) or "ibis" in str(constructor): + if any(x in str(constructor) for x in ("duckdb", "ibis")): request.applymarker(pytest.mark.xfail) data_1 = {"a": [1, 3], "b": [4, 6]} - data_2 = {"a": [100, 200], "z": ["x", "y"]} + data_2 = {"a": [100, 200, 200], "z": ["x", "y", "y"]} expected = { - "a": [1, 3, 100, 200], - "b": [4, 6, None, None], - "z": [None, None, "x", "y"], + "a": [1, 3, 100, 200, 200], + "b": [4, 6, None, None, None], + "z": [None, None, "x", "y", "y"], } df_1 = nw.from_native(constructor(data_1)).lazy() df_2 = nw.from_native(constructor(data_2)).lazy() - result = nw.concat([df_1, df_2], how="diagonal") + result = nw.concat([df_1, df_2], how="diagonal").sort("a") assert_equal_data(result, expected) diff --git a/tests/frame/explode_test.py b/tests/frame/explode_test.py index d6663d68c8..e5dd0b24d9 100644 --- a/tests/frame/explode_test.py +++ b/tests/frame/explode_test.py @@ -37,7 +37,7 @@ def test_explode_single_col( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "daft") ): request.applymarker(pytest.mark.xfail) @@ -94,6 +94,7 @@ def test_explode_multiple_cols( "pyarrow_table", "duckdb", "pyspark", + "daft", "ibis", ) ): @@ -116,7 +117,7 @@ def test_explode_shape_error( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "daft") ): request.applymarker(pytest.mark.xfail) @@ -139,7 +140,7 @@ def test_explode_shape_error( def test_explode_invalid_operation_error( request: pytest.FixtureRequest, constructor: Constructor ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask", "daft")): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 6): diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index f870f323d6..c542b0cacf 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -141,7 +141,12 @@ def test_group_by_depth_1_std_var(constructor: Constructor, attr: str, ddof: int assert_equal_data(result, expected) -def 
test_group_by_median(constructor: Constructor) -> None: +def test_group_by_median( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "daft" in str(constructor): + # https://github.com/Eventual-Inc/Daft/issues/3491 + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 1, 2, 2, 2], "b": [5, 4, 6, 7, 3, 2]} result = ( nw.from_native(constructor(data)) @@ -283,13 +288,12 @@ def test_no_agg(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_group_by_categorical(constructor: Constructor) -> None: - if ( - ("pyspark" in str(constructor)) - or "duckdb" in str(constructor) - or "ibis" in str(constructor) - ): - pytest.skip(reason="DuckDB, PySpark, and Ibis do not support categorical types") +def test_group_by_categorical( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("pyspark", "duckdb", "ibis", "daft")): + # No support (yet?) for categorical + request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < ( 15, ): # pragma: no cover diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index e2c575b7b2..6d062d6a5b 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -483,7 +483,7 @@ def test_joinasof_numeric( strategy: Literal["backward", "forward", "nearest"], expected: dict[str, list[Any]], ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark")): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark", "daft")): request.applymarker(pytest.mark.xfail) if ( "duckdb" in str(constructor) or "ibis" in str(constructor) @@ -554,7 +554,7 @@ def test_joinasof_time( strategy: Literal["backward", "forward", "nearest"], expected: dict[str, list[Any]], ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark")): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark", "daft")): request.applymarker(pytest.mark.xfail) if ( "duckdb" in str(constructor) or "ibis" in str(constructor) @@ -597,7 +597,7 @@ def test_joinasof_time( def test_joinasof_by(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark")): + if any(x in str(constructor) for x in ("daft", "pyarrow_table", "cudf", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) @@ -632,7 +632,7 @@ def test_joinasof_by(constructor: Constructor, request: pytest.FixtureRequest) - def test_joinasof_suffix( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "pyspark")): + if any(x in str(constructor) for x in ("daft", "pyarrow_table", "cudf", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index e39acaab69..ca0fe303be 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -95,10 +95,7 @@ def test_missing_columns( data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) selected_columns = ["a", "e", "f"] - msg = ( - r"The following columns were not found: \[.*\]" - r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" 
-    )
+    msg = r"not found.*\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
    if "polars" in str(constructor):
        # In the lazy case, Polars only errors when we call `collect`,
        # and we have no way to recover exactly which columns the user
@@ -171,7 +168,7 @@ def test_select_duplicates(constructor: Constructor) -> None:
    df = nw.from_native(constructor({"a": [1, 2]})).lazy()
    with pytest.raises(
        ValueError,
-        match="Expected unique|[Dd]uplicate|more than one|Duplicate column name",
+        match="Expected unique|[Dd]uplicate|more than one|Duplicate column name|is ambiguous",
    ):
        df.select("a", nw.col("a") + 1).collect()
diff --git a/tests/frame/unique_test.py b/tests/frame/unique_test.py
index 4bf9698a6f..8ee842edaa 100644
--- a/tests/frame/unique_test.py
+++ b/tests/frame/unique_test.py
@@ -33,7 +33,12 @@ def test_unique_eager(
    assert_equal_data(result, expected)
-def test_unique_invalid_subset(constructor: Constructor) -> None:
+def test_unique_invalid_subset(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "daft" in str(constructor):
+        # https://github.com/Eventual-Inc/Daft/issues/4151
+        request.applymarker(pytest.mark.xfail)
    if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3):
        pytest.skip()
    df_raw = constructor(data)
@@ -55,7 +60,11 @@ def test_unique(
    subset: str | list[str] | None,
    keep: Literal["any", "none"],
    expected: dict[str, list[float]],
+    request: pytest.FixtureRequest,
) -> None:
+    if "daft" in str(constructor):
+        # https://github.com/Eventual-Inc/Daft/issues/4151
+        request.applymarker(pytest.mark.xfail)
    if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3):
        pytest.skip()
    df_raw = constructor(data)
@@ -74,7 +83,11 @@ def test_unique_full_subset(
    subset: list[str] | None,
    keep: Literal["any", "none"],
    expected: dict[str, list[float]],
+    request: pytest.FixtureRequest,
) -> None:
+    if "daft" in str(constructor) and keep == "none":
+        # https://github.com/Eventual-Inc/Daft/issues/4151
+        request.applymarker(pytest.mark.xfail)
    if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3):
        pytest.skip()
    data = {"a": [1, 1, 1, 2], "b": [3, 3, 4, 4]}
diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py
index 49f4ccd1b9..7b6cc3c4a1 100644
--- a/tests/frame/with_columns_test.py
+++ b/tests/frame/with_columns_test.py
@@ -54,17 +54,11 @@ def test_with_columns_order_single_row(constructor: Constructor) -> None:
    assert_equal_data(result, expected)
-def test_with_columns_dtypes_single_row(
-    constructor: Constructor, request: pytest.FixtureRequest
-) -> None:
+def test_with_columns_dtypes_single_row(constructor: Constructor) -> None:
    if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,):
        pytest.skip()
-    if (
-        ("pyspark" in str(constructor))
-        or "duckdb" in str(constructor)
-        or "ibis" in str(constructor)
-    ):
-        request.applymarker(pytest.mark.xfail)
+    if any(x in str(constructor) for x in ("pyspark", "duckdb", "daft", "ibis")):
+        pytest.skip("no categorical support")
    data = {"a": ["foo"]}
    df = nw.from_native(constructor(data)).with_columns(nw.col("a").cast(nw.Categorical))
    result = df.with_columns(nw.col("a"))
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index d393089e25..48207b5daf 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -73,6 +73,8 @@ def test_categorical(request: pytest.FixtureRequest, constructor: Constructor) -
        or "ibis" in str(constructor)
    ):
        request.applymarker(pytest.mark.xfail)
+    if "daft" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
expected = {"b": ["a", "b", "c"]} df = nw.from_native(constructor(data)).with_columns(nw.col("b").cast(nw.Categorical)) @@ -93,6 +95,8 @@ def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> N request.applymarker(pytest.mark.xfail) if "modin" in str(constructor): pytest.skip(reason="too slow") + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) @@ -213,7 +217,7 @@ def test_set_ops( request: pytest.FixtureRequest, ) -> None: if ( - any(x in str(constructor) for x in ("duckdb", "sqlframe", "ibis")) + any(x in str(constructor) for x in ("duckdb", "sqlframe", "ibis", "daft")) and not expected ): # https://github.com/narwhals-dev/narwhals/issues/2469 @@ -266,6 +270,8 @@ def test_tz_aware(constructor: Constructor, request: pytest.FixtureRequest) -> N ): # replace_time_zone not implemented request.applymarker(pytest.mark.xfail) + if "daft" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)], "c": [4, 5]} df = nw.from_native(constructor(data)).with_columns( diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 6311db96b9..864b1d9aaa 100644 --- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -115,7 +115,7 @@ def test_cast_to_enum_vmain( # Backends that do not (yet) support Enum dtype if any( backend in str(constructor) - for backend in ("pyarrow_table", "sqlframe", "pyspark", "ibis") + for backend in ("pyarrow_table", "sqlframe", "pyspark", "ibis", "daft") ): request.applymarker(pytest.mark.xfail) diff --git a/tests/translate/get_native_namespace_test.py b/tests/translate/get_native_namespace_test.py index f6b33670b2..79eb58c5af 100644 --- a/tests/translate/get_native_namespace_test.py +++ b/tests/translate/get_native_namespace_test.py @@ -52,6 +52,10 @@ def _get_expected_namespace(constructor_name: str) -> Any | None: # noqa: PLR09 import sqlframe return sqlframe + elif "daft" in constructor_name: + import daft + + return daft return None # pragma: no cover diff --git a/tests/v1_test.py b/tests/v1_test.py index 18c057f188..fcab289166 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -260,12 +260,9 @@ def test_cast_to_enum_v1( request: pytest.FixtureRequest, constructor: Constructor ) -> None: # Backends that do not (yet) support Enum dtype - if ( - any( - backend in str(constructor) - for backend in ["pyarrow_table", "sqlframe", "pyspark", "ibis"] - ) - or str(constructor) == "modin" + if any( + backend in str(constructor) + for backend in ["pyarrow_table", "sqlframe", "pyspark", "ibis", "daft"] ): request.applymarker(pytest.mark.xfail) diff --git a/tpch/execute.py b/tpch/execute.py index 0d1cdf4358..935c2fa10e 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -4,6 +4,7 @@ from importlib import import_module from pathlib import Path +import daft import dask.dataframe as dd import duckdb import pandas as pd @@ -34,11 +35,18 @@ "polars[lazy]": (pl, {}), "pyarrow": (pa, {}), "duckdb": (duckdb, {}), + "daft": (daft, {}), "dask": (dd, {"engine": "pyarrow", "dtype_backend": "pyarrow"}), "sqlframe": (sqlframe, {"session": DuckDBSession()}), } -DUCKDB_SKIPS = ["q15"] +DAFT_SKIPS = [ + "q4", # needs `unique` with `subset` + "q15", # needs `filter` which works with windows +] +DUCKDB_SKIPS = [ + "q15" # needs `filter` which works with windows +] QUERY_DATA_PATH_MAP = { "q1": (LINEITEM_PATH,), @@ -95,7 +103,9 @@ def 
execute_query(query_id: str) -> None: expected = pl.read_parquet(DATA_DIR / f"result_{query_id}.parquet") for backend, (native_namespace, kwargs) in BACKEND_NAMESPACE_KWARGS_MAP.items(): - if backend in {"duckdb", "sqlframe"} and query_id in DUCKDB_SKIPS: + if (backend in {"duckdb", "sqlframe"} and query_id in DUCKDB_SKIPS) or ( + backend == "daft" and query_id in DAFT_SKIPS + ): print(f"\nSkipping {query_id} for {backend}") # noqa: T201 continue @@ -108,7 +118,7 @@ def execute_query(query_id: str) -> None: ) ) .lazy() - .collect(backend=nw.Implementation.POLARS) + .collect(backend="polars") .to_native() ) diff --git a/tpch/queries/q14.py b/tpch/queries/q14.py index 24bf1f6f89..6579b8c373 100644 --- a/tpch/queries/q14.py +++ b/tpch/queries/q14.py @@ -19,7 +19,7 @@ def query(line_item_ds: FrameT, part_ds: FrameT) -> FrameT: .select( ( 100.00 - * nw.when(nw.col("p_type").str.contains("PROMO*")) + * nw.when(nw.col("p_type").str.starts_with("PROMO")) .then(nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) .otherwise(0) .sum() diff --git a/tpch/queries/q9.py b/tpch/queries/q9.py index 73c1305187..8b871b3198 100644 --- a/tpch/queries/q9.py +++ b/tpch/queries/q9.py @@ -26,7 +26,7 @@ def query( ) .join(orders_ds, left_on="l_orderkey", right_on="o_orderkey") .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") - .filter(nw.col("p_name").str.contains("green")) + .filter(nw.col("p_name").str.contains("green", literal=True)) .select( nw.col("n_name").alias("nation"), nw.col("o_orderdate").dt.year().alias("o_year"),
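
A minimal sketch (illustrative data and backend, not from the patch) of why q14 swaps str.contains("PROMO*") for str.starts_with("PROMO"), and why q9 passes literal=True: str.contains treats its pattern as a regular expression by default, and in regex "PROMO*" means "PROM followed by zero or more Os", so it matches any string containing "PROM". TPC-H q14 intends the literal prefix test p_type LIKE 'PROMO%', which starts_with expresses directly; q9 only needs a plain substring match, which literal=True makes explicit.

import narwhals as nw
import polars as pl  # assumption: any eager narwhals backend behaves the same way here

df = nw.from_native(pl.DataFrame({"p_type": ["PROMO BRUSHED", "STANDARD PROMPT"]}))
result = df.select(
    regex=nw.col("p_type").str.contains("PROMO*"),  # regex: [True, True], "PROMPT" contains "PROM"
    substring=nw.col("p_type").str.contains("PROMO", literal=True),  # [True, False]
    prefix=nw.col("p_type").str.starts_with("PROMO"),  # [True, False]
)
print(result.to_native())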
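
For the overall wiring the patch adds, a short end-to-end sketch, assuming daft is installed (daft.from_pydict is the same constructor the new tests use): nw.from_native now recognizes a Daft frame via is_daft_dataframe and returns a nw.LazyFrame backed by DaftLazyFrame.

import daft
import narwhals as nw

ldf = nw.from_native(daft.from_pydict({"a": [1, 2, 3]}))  # nw.LazyFrame wrapping a daft.DataFrame
out = ldf.select((nw.col("a") + 1).alias("b")).to_native()  # back to native Daft
print(out.collect())  # materializes: b = [2, 3, 4]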