Skip to content

Commit 320deed

Browse files
Dharin-shah and evertlammerts
authored and committed
minor fixes
1 parent c42e3f8 commit 320deed

File tree

4 files changed

+44
-11
lines changed

4 files changed

+44
-11
lines changed

duckdb/experimental/spark/sql/column.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
if TYPE_CHECKING:
88
from ._typing import DateTimeLiteral, DecimalLiteral, LiteralType
99

10-
from duckdb import ColumnExpression, ConstantExpression, Expression, FunctionExpression
10+
from duckdb import ConstantExpression, Expression, FunctionExpression
1111
from duckdb.sqltypes import DuckDBPyType
1212

1313
__all__ = ["Column"]
@@ -173,9 +173,11 @@ def __getitem__(self, k: Any) -> "Column": # noqa: ANN401
173173
# raise ValueError("Using a slice with a step value is not supported")
174174
# return self.substr(k.start, k.stop)
175175
else:
176-
# TODO: this is super hacky # noqa: TD002, TD003
177-
expr_str = str(self.expr) + "." + str(k)
178-
return Column(ColumnExpression(expr_str))
176+
# Use struct_extract for proper struct field access
177+
from duckdb import ConstantExpression, FunctionExpression
178+
179+
field_name_expr = ConstantExpression(str(k))
180+
return Column(FunctionExpression("struct_extract", self.expr, field_name_expr))
179181

180182
def __getattr__(self, item: Any) -> "Column": # noqa: ANN401
181183
"""An expression that gets an item at position ``ordinal`` out of a list,

duckdb/experimental/spark/sql/functions.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,29 @@ def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column:
3030
return _invoke_function(name, *cols)
3131

3232

def _nan_constant() -> Expression:
    """Build an expression that evaluates to a genuine NaN (not NULL).

    ``ConstantExpression(float("nan"))`` produces NULL instead of NaN:
    ``TransformPythonValue()`` in the C++ layer defaults to
    ``nan_as_null=true``, which is the intended behavior for data-import
    paths (CSV, Pandas, etc.) where NaN marks missing data.

    Mathematical functions, by contrast, must yield NaN — never NULL — for
    out-of-range inputs under PySpark/IEEE 754 semantics, so this helper
    routes through ``SQLExpression`` as a workaround.

    Returns:
    -------
    Expression
        An expression that evaluates to NaN (not NULL).

    See Also:
    --------
    NAN_ROOT_CAUSE_ANALYSIS.md for the full explanation.
    """
    nan_expr = SQLExpression("'NaN'::DOUBLE")
    return nan_expr
3356
def col(column: str) -> Column: # noqa: D103
3457
return Column(ColumnExpression(column))
3558

@@ -617,11 +640,9 @@ def asin(col: "ColumnOrName") -> Column:
617640
+--------+
618641
"""
619642
col = _to_column_expr(col)
620-
# TODO: ConstantExpression(float("nan")) gives NULL and not NaN # noqa: TD002, TD003
643+
# asin domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
621644
return Column(
622-
CaseExpression((col < -1.0) | (col > 1.0), ConstantExpression(float("nan"))).otherwise(
623-
FunctionExpression("asin", col)
624-
)
645+
CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("asin", col))
625646
)
626647

627648

@@ -4177,7 +4198,11 @@ def acos(col: "ColumnOrName") -> Column:
41774198
| NaN|
41784199
+--------+
41794200
"""
4180-
return _invoke_function_over_columns("acos", col)
4201+
col = _to_column_expr(col)
4202+
# acos domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
4203+
return Column(
4204+
CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("acos", col))
4205+
)
41814206

41824207

41834208
def call_function(funcName: str, *cols: "ColumnOrName") -> Column:

duckdb/experimental/spark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def load( # noqa: D102
125125
types, names = schema.extract_types_and_names()
126126
df = df._cast_types(types)
127127
df = df.toDF(names)
128-
raise NotImplementedError
128+
return df
129129

130130
def csv( # noqa: D102
131131
self,

duckdb/experimental/spark/sql/type_utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from duckdb.sqltypes import DuckDBPyType
44

5+
from ..exception import ContributionsAcceptedError
56
from .types import (
67
ArrayType,
78
BinaryType,
@@ -79,7 +80,12 @@ def convert_nested_type(dtype: DuckDBPyType) -> DataType: # noqa: D103
7980
if id == "list" or id == "array":
8081
children = dtype.children
8182
return ArrayType(convert_type(children[0][1]))
82-
# TODO: add support for 'union' # noqa: TD002, TD003
83+
if id == "union":
84+
msg = (
85+
"Union types are not supported in the PySpark interface. "
86+
"DuckDB union types cannot be directly mapped to PySpark types."
87+
)
88+
raise ContributionsAcceptedError(msg)
8389
if id == "struct":
8490
children: list[tuple[str, DuckDBPyType]] = dtype.children
8591
fields = [StructField(x[0], convert_type(x[1])) for x in children]

0 commit comments

Comments
 (0)