Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
106555e
feat: add fill_null method to DataFrame for handling null values
kosiew Feb 12, 2025
cff9b7c
test: add coalesce function tests for handling default values
kosiew Feb 12, 2025
4cf7496
Resolve test cases for fill_null
kosiew Feb 12, 2025
df6208e
feat: add fill_nan method to DataFrame for handling NaN values
kosiew Feb 12, 2025
23ba1bd
move imports out of functions
kosiew Feb 12, 2025
d6ca465
docs: add documentation for fill_null and fill_nan methods in DataFrame
kosiew Feb 12, 2025
8582104
Add more tests
kosiew Feb 12, 2025
73b692f
fix ruff errors
kosiew Feb 12, 2025
07d4f4b
Merge branch 'main' into fill-null
kosiew Feb 25, 2025
8b51ee9
Merge branch 'main' into fill-null
kosiew Apr 3, 2025
5a3cd8c
amend def fill_null to invoke PyDataFrame's fill_null
kosiew Apr 3, 2025
924de28
Merge branch 'main' into fill-null
kosiew Apr 29, 2025
4499e45
refactor: remove fill_nan method documentation from functions.rst
kosiew Apr 29, 2025
bf9d7da
refactor: remove unused import of Enum from dataframe.py
kosiew Apr 29, 2025
dc86e77
refactor: improve error handling and type extraction in python_value_…
kosiew Apr 29, 2025
6fbafcd
refactor: enhance datetime and date conversion logic in python_value_…
kosiew Apr 29, 2025
681b2e5
refactor: streamline type extraction in python_value_to_scalar_value …
kosiew Apr 29, 2025
aa87a8e
fix try_convert_to_string
kosiew Apr 29, 2025
0dfbdfa
refactor: improve type handling in python_value_to_scalar_value function
kosiew Apr 30, 2025
ecc4376
refactor: move py_obj_to_scalar_value function to utils module
kosiew Apr 30, 2025
412029c
refactor: update fill_null to use py_obj_to_scalar_value from utils
kosiew Apr 30, 2025
4c40b85
Remove python_object_to_scalar_value code
kosiew Apr 30, 2025
82bf6f4
refactor: enhance py_obj_to_scalar_value to utilize PyArrow for compl…
kosiew Apr 30, 2025
b5d87b0
refactor: update py_obj_to_scalar_value to handle errors and use extr…
kosiew Apr 30, 2025
d546f7a
refactor: modify py_obj_to_scalar_value to return ScalarValue directl…
kosiew Apr 30, 2025
b89c695
refactor: update py_obj_to_scalar_value to return a Result for better…
kosiew Apr 30, 2025
b140523
test: add tests for fill_null functionality in DataFrame with null va…
kosiew Apr 30, 2025
3065773
test: enhance null DataFrame tests to include date32 and date64 columns
kosiew Apr 30, 2025
d7cf099
refactor: simplify py_obj_to_scalar_value by removing direct extracti…
kosiew Apr 30, 2025
0aebd74
refactor: remove unnecessary documentation from py_obj_to_scalar_valu…
kosiew Apr 30, 2025
e3d643b
Fix ruff errors
kosiew Apr 30, 2025
68b520e
test: update datetime handling in coalesce tests to include timezone …
kosiew Apr 30, 2025
22519aa
Fix ruff errors
kosiew Apr 30, 2025
799b67c
trigger ci
kosiew Apr 30, 2025
4681420
Merge branch 'main' into fill-null
kosiew May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions docs/source/user-guide/common-operations/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,39 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
.limit(20)
.to_pandas()
)


Handling Missing Values
=======================

DataFusion provides methods to handle missing values in DataFrames:

fill_null
---------

The ``fill_null()`` method replaces NULL values in specified columns with a provided value:

.. code-block:: python

# Fill all NULL values with 0 where possible
df = df.fill_null(0)

# Fill NULL values only in specific string columns
df = df.fill_null("missing", subset=["name", "category"])

The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged.

fill_nan
--------

The ``fill_nan()`` method replaces NaN values in floating-point columns with a provided numeric value:

.. code-block:: python

# Fill all NaN values with 0 in numeric columns
df = df.fill_nan(0)

# Fill NaN values in specific numeric columns
df = df.fill_nan(99.9, subset=["price", "score"])

This only works on floating-point columns (float32, float64). The fill value must be numeric (int or float).
121 changes: 115 additions & 6 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from __future__ import annotations

import warnings
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -33,8 +34,12 @@
overload,
)

import pyarrow as pa
from typing_extensions import deprecated

from datafusion import functions as f
from datafusion._internal import DataFrame as DataFrameInternal
from datafusion.expr import Expr, SortExpr, sort_or_default
from datafusion.plan import ExecutionPlan, LogicalPlan
from datafusion.record_batch import RecordBatchStream

Expand All @@ -44,12 +49,6 @@

import pandas as pd
import polars as pl
import pyarrow as pa

from enum import Enum

from datafusion._internal import DataFrame as DataFrameInternal
from datafusion.expr import Expr, SortExpr, sort_or_default


# excerpt from deltalake
Expand Down Expand Up @@ -853,3 +852,113 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
DataFrame: After applying func to the original dataframe.
"""
return func(self, *args)

def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
    """Fill null values in specified columns with a value.

    Args:
        value: Value to replace nulls with. Will be cast to match column type.
        subset: Optional list of column names to fill. If None, fills all columns.

    Returns:
        DataFrame with null values replaced where type casting is possible

    Examples:
        >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
        >>> # Fill nulls in specific string columns
        >>> df = df.fill_null("missing", subset=["name", "category"])

    Notes:
        - Only fills nulls in columns where the value can be cast to the column type
        - For columns where casting fails, the original column is kept unchanged
        - For columns not in subset, the original column is kept unchanged
    """
    # Fetch the schema once: the original re-invoked self.schema() for the
    # column list, the validation pass, and every field lookup in the loop.
    schema = self.schema()
    col_names = schema.names

    if subset is None:
        subset = col_names
    else:
        # Validate the requested columns before building any expressions.
        for col in subset:
            if col not in col_names:
                raise ValueError(f"Column '{col}' not found in DataFrame")

    # Set for O(1) membership tests in the per-column loop below.
    fill_cols = set(subset)

    # Build one projection expression per column of the DataFrame.
    exprs = []
    for col_name in col_names:
        if col_name in fill_cols:
            col_type = schema.field(col_name).type

            try:
                # Cast the fill value to this column's type; pyarrow raises
                # for incompatible values (e.g. "abc" into an int64 column).
                typed_value = pa.scalar(value, type=col_type)
                literal_expr = f.Expr.literal(typed_value)

                # COALESCE(col, literal) keeps col where non-null and
                # substitutes the literal only where col IS NULL.
                expr = f.coalesce(f.col(col_name), literal_expr)
                exprs.append(expr.alias(col_name))

            except (pa.ArrowTypeError, pa.ArrowInvalid):
                # Cast failed: keep the original column unchanged rather
                # than failing the whole operation.
                exprs.append(f.col(col_name))
        else:
            # Columns outside the subset pass through untouched.
            exprs.append(f.col(col_name))

    return self.select(*exprs)

def fill_nan(
    self, value: float | int, subset: list[str] | None = None
) -> "DataFrame":
    """Fill NaN values in specified numeric columns with a value.

    Args:
        value: Numeric value to replace NaN values with.
        subset: Optional list of column names to fill. If None, fills all numeric
            columns.

    Returns:
        DataFrame with NaN values replaced in numeric columns.

    Examples:
        >>> df = df.fill_nan(0)  # Fill all NaNs with 0 in numeric columns
        >>> # Fill NaNs in specific numeric columns
        >>> df = df.fill_nan(99.9, subset=["price", "score"])

    Notes:
        - Only fills NaN values in numeric columns (float32, float64)
        - Non-numeric columns are kept unchanged
        - For columns not in subset, the original column is kept unchanged
        - Value must be numeric (int or float)
    """
    # Reject non-numeric fill values up front.
    # NOTE(review): bool is a subclass of int, so fill_nan(True) passes this
    # check and fills with 1.0 — confirm whether that is intended.
    if not isinstance(value, (int, float)):
        raise ValueError("Value must be numeric (int or float)")

    # Fetch the schema once instead of re-invoking self.schema() for each
    # of the validation pass, the default-subset scan, and the build loop.
    schema = self.schema()
    col_names = schema.names

    if subset is None:
        # Default: every floating-point column (float32/float64).
        subset = [
            field.name for field in schema if pa.types.is_floating(field.type)
        ]
    else:
        for col in subset:
            if col not in col_names:
                raise ValueError(f"Column '{col}' not found in DataFrame")
            # nanvl only makes sense on floating-point columns.
            if not pa.types.is_floating(schema.field(col).type):
                raise ValueError(f"Column '{col}' is not a numeric column")

    # Set for O(1) membership tests in the loop below.
    fill_cols = set(subset)

    # Build one projection expression per column of the DataFrame.
    exprs = []
    for col_name in col_names:
        if col_name in fill_cols:
            # NANVL(col, value) yields value where col is NaN, col otherwise.
            expr = f.nanvl(f.col(col_name), f.lit(value))
            exprs.append(expr.alias(col_name))
        else:
            # Columns outside the subset pass through untouched.
            exprs.append(f.col(col_name))

    return self.select(*exprs)
126 changes: 126 additions & 0 deletions python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,3 +1196,129 @@ def test_dataframe_repr_html(df) -> None:

# Ignore whitespace just to make this test look cleaner
assert output.replace(" ", "") == ref_html.replace(" ", "")


def test_fill_null(df):
    """Exercise DataFrame.fill_null across types, subsets, and failed casts."""
    # Integer fill value into an all-null int64 column.
    nulls_df = df.with_column("d", literal(None).cast(pa.int64()))
    out = nulls_df.fill_null(0).to_pydict()
    assert out["d"] == [0, 0, 0]

    # String fill value into an all-null string column.
    nulls_df = df.with_column("d", literal(None).cast(pa.string()))
    out = nulls_df.fill_null("missing").to_pydict()
    assert out["d"] == ["missing", "missing", "missing"]

    # Restricting the fill to a subset leaves the other column untouched.
    nulls_df = df.with_columns(
        literal(None).cast(pa.int64()).alias("d"),
        literal(None).cast(pa.string()).alias("e"),
    )
    out = nulls_df.fill_null("missing", subset=["e"]).to_pydict()
    assert out["d"] == [None, None, None]
    assert out["e"] == ["missing", "missing", "missing"]

    # A value that cannot be cast leaves the column unchanged.
    nulls_df = df.with_column("d", literal(None))
    out = nulls_df.fill_null("invalid").to_pydict()
    assert out["d"] == [None, None, None]

    # Mixed outcome: the int column is filled, the string column is not.
    nulls_df = df.with_columns(
        literal(None).alias("d").cast(pa.int64()),
        literal(None).alias("e").cast(pa.string()),
    )
    out = nulls_df.fill_null(0).to_pydict()
    assert out["d"] == [0, 0, 0]
    assert out["e"] == [None, None, None]

    # Same mixed outcome when the subset names both columns explicitly.
    nulls_df = df.with_columns(
        literal(None).alias("d").cast(pa.int64()),
        literal(None).alias("e").cast(pa.string()),
    )
    out = nulls_df.fill_null(0, subset=["d", "e"]).to_pydict()
    assert out["d"] == [0, 0, 0]
    assert out["e"] == [None, None, None]

    # Subset where every cast succeeds: only the named column is filled.
    nulls_df = df.with_columns(
        literal(None).alias("d").cast(pa.int64()),
        literal(None).alias("e").cast(pa.string()),
    )
    out = nulls_df.fill_null("missing", subset=["e"]).to_pydict()
    assert out["d"] == [None, None, None]
    assert out["e"] == ["missing", "missing", "missing"]

    # Naming a non-existent column in the subset raises.
    nulls_df = df.with_columns(
        literal(None).alias("d").cast(pa.int64()),
        literal(None).alias("e").cast(pa.string()),
    )
    with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
        nulls_df.fill_null("missing", subset=["e", "f"])

def test_fill_nan(df):
    """Exercise DataFrame.fill_nan on float columns, subsets, and bad input."""
    # Integer fill value into an all-NaN float64 column.
    nan_df = df.with_column("d", literal(float("nan")).cast(pa.float64()))
    out = nan_df.fill_nan(0).to_pydict()
    assert out["d"] == [0, 0, 0]

    # Float fill value.
    nan_df = df.with_column("d", literal(float("nan")).cast(pa.float64()))
    out = nan_df.fill_nan(99.9).to_pydict()
    assert out["d"] == [99.9, 99.9, 99.9]

    # Restricting the fill to a subset leaves the other column as NaN.
    nan_df = df.with_columns(
        literal(float("nan")).cast(pa.float64()).alias("d"),
        literal(float("nan")).cast(pa.float64()).alias("e"),
    )
    out = nan_df.fill_nan(99.9, subset=["e"]).to_pydict()
    assert out["d"] == [float("nan"), float("nan"), float("nan")]
    assert out["e"] == [99.9, 99.9, 99.9]

    # A non-numeric fill value is rejected.
    nan_df = df.with_column("d", literal(float("nan")).cast(pa.float64()))
    with pytest.raises(ValueError, match="Value must be numeric"):
        nan_df.fill_nan("invalid")

    # With no subset given, non-numeric columns are skipped automatically.
    nan_df = df.with_columns(
        literal(float("nan")).alias("d").cast(pa.float64()),
        literal(float("nan")).alias("e").cast(pa.float64()),
        literal("abc").alias("f").cast(pa.string()),  # non-numeric column
    )
    out = nan_df.fill_nan(0, subset=["d", "e", "f"]).to_pydict()
    assert out["d"] == [0, 0, 0]  # succeeds
    assert out["e"] == [0, 0, 0]  # succeeds
    assert out["f"] == ["abc", "abc", "abc"]  # skipped because not numeric

    # Explicitly naming a non-numeric column in the subset raises.
    mixed_df = df.with_columns(
        literal(float("nan")).alias("d").cast(pa.float64()),
        literal("abc").alias("e").cast(pa.string()),
    )
    with pytest.raises(ValueError, match="Column 'e' is not a numeric column"):
        mixed_df.fill_nan(0, subset=["d", "e"])

    # All-float subset: only the named column is filled.
    nan_df = df.with_columns(
        literal(float("nan")).alias("d").cast(pa.float64()),
        literal(float("nan")).alias("e").cast(pa.float64()),
    )
    out = nan_df.fill_nan(99.9, subset=["e"]).to_pydict()
    assert out["d"] == [float("nan"), float("nan"), float("nan")]
    assert out["e"] == [99.9, 99.9, 99.9]
54 changes: 54 additions & 0 deletions python/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,3 +1173,57 @@ def test_between_default(df):

actual = df.collect()[0].to_pydict()
assert actual == expected


def test_coalesce(df):
    """Verify f.coalesce substitutes defaults for nulls across data types."""
    # (Scraped review-UI text that was interleaved with this function has
    # been removed; the code below is the actual test body.)
    # Create a DataFrame with null values
    ctx = SessionContext()
    batch = pa.RecordBatch.from_arrays(
        [
            pa.array(["Hello", None, "!"]),  # string column with null
            pa.array([4, None, 6]),  # integer column with null
            pa.array(["hello ", None, " !"]),  # string column with null
            pa.array(
                [datetime(2022, 12, 31), None, datetime(2020, 7, 2)]
            ),  # datetime with null
            pa.array([False, None, True]),  # boolean column with null
        ],
        names=["a", "b", "c", "d", "e"],
    )
    df_with_nulls = ctx.create_dataframe([[batch]])

    # Test coalesce with different data types
    result_df = df_with_nulls.select(
        f.coalesce(column("a"), literal("default")).alias("a_coalesced"),
        f.coalesce(column("b"), literal(0)).alias("b_coalesced"),
        f.coalesce(column("c"), literal("default")).alias("c_coalesced"),
        f.coalesce(column("d"), literal(datetime(2000, 1, 1))).alias("d_coalesced"),
        f.coalesce(column("e"), literal(False)).alias("e_coalesced"),
    )

    result = result_df.collect()[0]

    # Verify results
    assert result.column(0) == pa.array(
        ["Hello", "default", "!"], type=pa.string_view()
    )
    assert result.column(1) == pa.array([4, 0, 6], type=pa.int64())
    assert result.column(2) == pa.array(
        ["hello ", "default", " !"], type=pa.string_view()
    )
    assert result.column(3) == pa.array(
        [datetime(2022, 12, 31), datetime(2000, 1, 1), datetime(2020, 7, 2)],
        type=pa.timestamp("us"),
    )
    assert result.column(4) == pa.array([False, False, True], type=pa.bool_())

    # Test multiple arguments
    result_df = df_with_nulls.select(
        f.coalesce(column("a"), literal(None), literal("fallback")).alias(
            "multi_coalesce"
        )
    )
    result = result_df.collect()[0]
    assert result.column(0) == pa.array(
        ["Hello", "fallback", "!"], type=pa.string_view()
    )