Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion narwhals/_duckdb/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
import pandas as pd
import pyarrow as pa
from duckdb import Expression
from duckdb.typing import DuckDBPyType
from duckdb.sqltypes import DuckDBPyType
from typing_extensions import Self, TypeIs

from narwhals._compliant.typing import CompliantDataFrameAny
Expand Down
10 changes: 6 additions & 4 deletions narwhals/_duckdb/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import duckdb
from duckdb import CoalesceOperator, Expression
from duckdb.typing import BIGINT, VARCHAR

from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals._duckdb.expr import DuckDBExpr
Expand All @@ -16,6 +15,7 @@
DeferredTimeZone,
F,
concat_str,
duckdb_dtypes,
function,
lit,
narwhals_to_native_dtype,
Expand Down Expand Up @@ -108,9 +108,9 @@ def func(df: DuckDBLazyFrame) -> list[Expression]:
cols_separated = [
y
for x in [
(col.cast(VARCHAR),)
(col.cast(duckdb_dtypes.VARCHAR),)
if i == len(cols) - 1
else (col.cast(VARCHAR), lit(separator))
else (col.cast(duckdb_dtypes.VARCHAR), lit(separator))
for i, col in enumerate(cols)
]
for y in x
Expand All @@ -130,7 +130,9 @@ def func(cols: Iterable[Expression]) -> Expression:
cols = list(cols)
return reduce(
operator.add, (CoalesceOperator(col, lit(0)) for col in cols)
) / reduce(operator.add, (col.isnotnull().cast(BIGINT) for col in cols))
) / reduce(
operator.add, (col.isnotnull().cast(duckdb_dtypes.BIGINT) for col in cols)
)

return self._expr._from_elementwise_horizontal_op(func, *exprs)

Expand Down
25 changes: 15 additions & 10 deletions narwhals/_duckdb/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
from typing import TYPE_CHECKING

import duckdb
import duckdb.typing as duckdb_dtypes
from duckdb import Expression
from duckdb.typing import DuckDBPyType

try:
import duckdb.sqltypes as duckdb_dtypes
except ModuleNotFoundError:
import duckdb.typing as duckdb_dtypes

from narwhals._utils import Version, isinstance_or_issubclass, zip_strict
from narwhals.exceptions import ColumnNotFoundError
Expand Down Expand Up @@ -131,7 +134,9 @@ def time_zone(self) -> str:


def native_to_narwhals_dtype(
duckdb_dtype: DuckDBPyType, version: Version, deferred_time_zone: DeferredTimeZone
duckdb_dtype: duckdb_dtypes.DuckDBPyType,
version: Version,
deferred_time_zone: DeferredTimeZone,
) -> DType:
duckdb_dtype_id = duckdb_dtype.id
dtypes = version.dtypes
Expand Down Expand Up @@ -216,7 +221,7 @@ def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version)


dtypes = Version.MAIN.dtypes
NW_TO_DUCKDB_DTYPES: Mapping[type[DType], DuckDBPyType] = {
NW_TO_DUCKDB_DTYPES: Mapping[type[DType], duckdb_dtypes.DuckDBPyType] = {
dtypes.Float64: duckdb_dtypes.DOUBLE,
dtypes.Float32: duckdb_dtypes.FLOAT,
dtypes.Binary: duckdb_dtypes.BLOB,
Expand All @@ -228,14 +233,14 @@ def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version)
dtypes.Int16: duckdb_dtypes.SMALLINT,
dtypes.Int32: duckdb_dtypes.INTEGER,
dtypes.Int64: duckdb_dtypes.BIGINT,
dtypes.Int128: DuckDBPyType("INT128"),
dtypes.Int128: duckdb_dtypes.DuckDBPyType("INT128"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use the named type constant instead?

Suggested change
- dtypes.Int128: duckdb_dtypes.DuckDBPyType("INT128"),
+ dtypes.Int128: duckdb_dtypes.HUGEINT,

(And likewise dtypes.UInt128: duckdb_dtypes.UHUGEINT, below.)

dtypes.UInt8: duckdb_dtypes.UTINYINT,
dtypes.UInt16: duckdb_dtypes.USMALLINT,
dtypes.UInt32: duckdb_dtypes.UINTEGER,
dtypes.UInt64: duckdb_dtypes.UBIGINT,
dtypes.UInt128: DuckDBPyType("UINT128"),
dtypes.UInt128: duckdb_dtypes.DuckDBPyType("UINT128"),
}
TIME_UNIT_TO_TIMESTAMP: Mapping[TimeUnit, DuckDBPyType] = {
TIME_UNIT_TO_TIMESTAMP: Mapping[TimeUnit, duckdb_dtypes.DuckDBPyType] = {
"s": duckdb_dtypes.TIMESTAMP_S,
"ms": duckdb_dtypes.TIMESTAMP_MS,
"us": duckdb_dtypes.TIMESTAMP,
Expand All @@ -246,7 +251,7 @@ def _non_nested_native_to_narwhals_dtype(duckdb_dtype_id: str, version: Version)

def narwhals_to_native_dtype( # noqa: PLR0912, C901
dtype: IntoDType, version: Version, deferred_time_zone: DeferredTimeZone
) -> DuckDBPyType:
) -> duckdb_dtypes.DuckDBPyType:
dtypes = version.dtypes
base_type = dtype.base_type()
if duckdb_type := NW_TO_DUCKDB_DTYPES.get(base_type):
Expand All @@ -256,7 +261,7 @@ def narwhals_to_native_dtype( # noqa: PLR0912, C901
msg = "Converting to Enum is not supported in narwhals.stable.v1"
raise NotImplementedError(msg)
if isinstance(dtype, dtypes.Enum):
return DuckDBPyType(f"ENUM{dtype.categories!r}")
return duckdb_dtypes.DuckDBPyType(f"ENUM{dtype.categories!r}")
msg = "Can not cast / initialize Enum without categories present"
raise ValueError(msg)
if isinstance_or_issubclass(dtype, dtypes.Datetime):
Expand Down Expand Up @@ -291,7 +296,7 @@ def narwhals_to_native_dtype( # noqa: PLR0912, C901
nw_inner = nw_inner.inner
duckdb_inner = narwhals_to_native_dtype(nw_inner, version, deferred_time_zone)
duckdb_shape_fmt = "".join(f"[{item}]" for item in dtype.shape)
return DuckDBPyType(f"{duckdb_inner}{duckdb_shape_fmt}")
return duckdb_dtypes.DuckDBPyType(f"{duckdb_inner}{duckdb_shape_fmt}")
if issubclass(base_type, UNSUPPORTED_DTYPES):
msg = f"Converting to {base_type.__name__} dtype is not supported for DuckDB."
raise NotImplementedError(msg)
Expand Down
7 changes: 6 additions & 1 deletion tests/expr_and_series/fill_nan_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
modin_constructor,
pandas_constructor,
)
from tests.utils import Constructor, ConstructorEager, assert_equal_data
from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data

NON_NULLABLE_CONSTRUCTORS = [
pandas_constructor,
Expand All @@ -31,6 +31,9 @@ def test_fill_nan(constructor: Constructor) -> None:
# no nan vs null distinction
expected = {"float": [-1.0, 1.0, 3.0], "float_na": [3.0, 1.0, 3.0]}
assert result.lazy().collect()["float_na"].null_count() == 0
elif "pandas" in str(constructor) and PANDAS_VERSION >= (3,):
expected = {"float": [-1.0, 1.0, None], "float_na": [None, 1.0, None]}
assert result.lazy().collect()["float_na"].null_count() == 2
else:
expected = {"float": [-1.0, 1.0, None], "float_na": [3.0, 1.0, None]}
assert result.lazy().collect()["float_na"].null_count() == 1
Expand All @@ -46,5 +49,7 @@ def test_fill_nan_series(constructor_eager: ConstructorEager) -> None:
if any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS):
# no nan vs null distinction
assert_equal_data({"a": result}, {"a": [999.0, 1.0, 999.0]})
elif "pandas" in str(constructor_eager) and PANDAS_VERSION >= (3,):
assert_equal_data({"a": result}, {"a": [None, 1.0, None]})
else:
assert_equal_data({"a": result}, {"a": [999.0, 1.0, None]})
34 changes: 27 additions & 7 deletions tests/expr_and_series/is_close_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import pytest

Expand All @@ -18,7 +18,7 @@
modin_constructor,
pandas_constructor,
)
from tests.utils import Constructor, ConstructorEager, assert_equal_data
from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data

if TYPE_CHECKING:
from narwhals.typing import NumericLiteral
Expand All @@ -32,7 +32,7 @@
NULL_PLACEHOLDER, NAN_PLACEHOLDER = 9999.0, -1.0
INF_POS, INF_NEG = float("inf"), float("-inf")

data = {
data: dict[str, Any] = {
"x": [1.001, NULL_PLACEHOLDER, NAN_PLACEHOLDER, INF_POS, INF_NEG, INF_POS],
"y": [1.005, NULL_PLACEHOLDER, NAN_PLACEHOLDER, INF_POS, 3.0, INF_NEG],
"non_numeric": list("number"),
Expand Down Expand Up @@ -109,7 +109,7 @@ def test_is_close_series_with_series(
rel_tol: float,
*,
nans_equal: bool,
expected: list[float],
expected: list[Any],
) -> None:
df = nw.from_native(constructor_eager(data), eager_only=True)
x, y = df["x"], df["y"]
Expand All @@ -122,6 +122,11 @@ def test_is_close_series_with_series(

if constructor_eager in NON_NULLABLE_CONSTRUCTORS:
expected = [v if v is not None else nans_equal for v in expected]
elif "pandas" in str(constructor_eager) and PANDAS_VERSION >= (3,):
expected = [
v if data["y"][i] not in {NULL_PLACEHOLDER, NAN_PLACEHOLDER} else None
for i, v in enumerate(expected)
]
assert_equal_data({"result": result}, {"result": expected})


Expand All @@ -133,7 +138,7 @@ def test_is_close_series_with_scalar(
rel_tol: float,
*,
nans_equal: bool,
expected: list[float],
expected: list[Any],
) -> None:
df = nw.from_native(constructor_eager(data), eager_only=True)
y = df["y"]
Expand All @@ -145,6 +150,11 @@ def test_is_close_series_with_scalar(

if constructor_eager in NON_NULLABLE_CONSTRUCTORS:
expected = [v if v is not None else False for v in expected]
elif "pandas" in str(constructor_eager) and PANDAS_VERSION >= (3,):
expected = [
v if data["y"][i] not in {NULL_PLACEHOLDER, NAN_PLACEHOLDER} else None
for i, v in enumerate(expected)
]
assert_equal_data({"result": result}, {"result": expected})


Expand All @@ -157,7 +167,7 @@ def test_is_close_expr_with_expr(
rel_tol: float,
*,
nans_equal: bool,
expected: list[float],
expected: list[Any],
) -> None:
if "sqlframe" in str(constructor):
# TODO(FBruzzesi): Figure out a MRE and report upstream
Expand Down Expand Up @@ -185,6 +195,11 @@ def test_is_close_expr_with_expr(
)
if constructor in NON_NULLABLE_CONSTRUCTORS:
expected = [v if v is not None else nans_equal for v in expected]
elif "pandas" in str(constructor) and PANDAS_VERSION >= (3,):
expected = [
v if data["y"][i] not in {NULL_PLACEHOLDER, NAN_PLACEHOLDER} else None
for i, v in enumerate(expected)
]
assert_equal_data(result, {"idx": data["idx"], "result": expected})


Expand All @@ -197,7 +212,7 @@ def test_is_close_expr_with_scalar(
rel_tol: float,
*,
nans_equal: bool,
expected: list[float],
expected: list[Any],
) -> None:
if "sqlframe" in str(constructor):
# TODO(FBruzzesi): Figure out a MRE and report upstream
Expand All @@ -221,4 +236,9 @@ def test_is_close_expr_with_scalar(
)
if constructor in NON_NULLABLE_CONSTRUCTORS:
expected = [v if v is not None else False for v in expected]
elif "pandas" in str(constructor) and PANDAS_VERSION >= (3,):
expected = [
v if data["y"][i] not in {NULL_PLACEHOLDER, NAN_PLACEHOLDER} else None
for i, v in enumerate(expected)
]
assert_equal_data(result, {"idx": data["idx"], "result": expected})
16 changes: 15 additions & 1 deletion tests/expr_and_series/is_nan_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
modin_constructor,
pandas_constructor,
)
from tests.utils import Constructor, ConstructorEager, assert_equal_data
from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data

NON_NULLABLE_CONSTRUCTORS = [
pandas_constructor,
Expand Down Expand Up @@ -43,6 +43,13 @@ def test_nan(constructor: Constructor) -> None:
"float": [False, False, True],
"float_na": [True, False, True],
}
elif "pandas" in str(constructor) and PANDAS_VERSION >= (3,):
# NaN values are coerced into NA for nullable datatypes by default
expected = {
"int": [False, False, None],
"float": [False, False, None],
"float_na": [None, False, None],
}
else:
# Nulls are preserved and should be differentiated from NaN for nullable datatypes
expected = {
Expand Down Expand Up @@ -82,6 +89,13 @@ def test_nan_series(constructor_eager: ConstructorEager) -> None:
"float": [False, False, True],
"float_na": [True, False, True],
}
elif "pandas" in str(constructor_eager) and PANDAS_VERSION >= (3,):
# NaN values are coerced into NA for nullable datatypes by default
expected = {
"int": [False, False, None],
"float": [False, False, None],
"float_na": [None, False, None],
}
else:
# Nulls are preserved and should be differentiated from NaN for nullable datatypes
expected = {
Expand Down
4 changes: 2 additions & 2 deletions tests/expr_and_series/str/zfill_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


def test_str_zfill(request: pytest.FixtureRequest, constructor: Constructor) -> None:
if uses_pyarrow_backend(constructor):
if uses_pyarrow_backend(constructor) and PANDAS_VERSION < (3,):
reason = (
"pandas with pyarrow backend doesn't support str.zfill, see "
"https://github.com/pandas-dev/pandas/issues/61485"
Expand All @@ -43,7 +43,7 @@ def test_str_zfill(request: pytest.FixtureRequest, constructor: Constructor) ->
def test_str_zfill_series(
request: pytest.FixtureRequest, constructor_eager: ConstructorEager
) -> None:
if uses_pyarrow_backend(constructor_eager):
if uses_pyarrow_backend(constructor_eager) and PANDAS_VERSION < (3,):
reason = (
"pandas with pyarrow backend doesn't support str.zfill, see "
"https://github.com/pandas-dev/pandas/issues/61485"
Expand Down
2 changes: 1 addition & 1 deletion tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None:
elif isinstance(lhs, float) and math.isnan(lhs):
are_equivalent_values = rhs is None or math.isnan(rhs)
elif isinstance(rhs, float) and math.isnan(rhs):
are_equivalent_values = lhs is None or math.isnan(lhs)
are_equivalent_values = lhs is None or pd.isna(lhs) or math.isnan(lhs)
elif lhs is None:
are_equivalent_values = rhs is None
elif isinstance(lhs, list) and isinstance(rhs, list):
Expand Down
Loading