amend def fill_null to invoke PyDataFrame's fill_null

kosiew · kosiew · commit 5a3cd8c097ac · 2025-04-03T18:42:07.000+08:00
- Implemented `fill_null` method in `dataframe.rs` to allow filling null values with a specified value for specific columns or all columns.
- Added a helper function `python_value_to_scalar_value` to convert Python values to DataFusion ScalarValues, supporting various types including integers, floats, booleans, strings, and timestamps.
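- Replaced the Python-side `fill_null` body in `dataframe.py` with a call to `PyDataFrame.fill_null`, and removed the Python `fill_nan` method; the `count` method in `PyDataFrame` is unchanged (it appears only as diff context).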
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -894,92 +894,5 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
             - For columns where casting fails, the original column is kept unchanged
             - For columns not in subset, the original column is kept unchanged
         """
-        # Get columns to process
-        if subset is None:
-            subset = self.schema().names
-        else:
-            schema_cols = self.schema().names
-            for col in subset:
-                if col not in schema_cols:
-                    raise ValueError(f"Column '{col}' not found in DataFrame")
-
-        # Build expressions for select
-        exprs = []
-        for col_name in self.schema().names:
-            if col_name in subset:
-                # Get column type
-                col_type = self.schema().field(col_name).type
-
-                try:
-                    # Try casting value to column type
-                    typed_value = pa.scalar(value, type=col_type)
-                    literal_expr = f.Expr.literal(typed_value)
-
-                    # Build coalesce expression
-                    expr = f.coalesce(f.col(col_name), literal_expr)
-                    exprs.append(expr.alias(col_name))
-
-                except (pa.ArrowTypeError, pa.ArrowInvalid):
-                    # If cast fails, keep original column
-                    exprs.append(f.col(col_name))
-            else:
-                # Keep columns not in subset unchanged
-                exprs.append(f.col(col_name))
-
-        return self.select(*exprs)
-
-    def fill_nan(
-        self, value: float | int, subset: list[str] | None = None
-    ) -> "DataFrame":
-        """Fill NaN values in specified numeric columns with a value.
-
-        Args:
-            value: Numeric value to replace NaN values with.
-            subset: Optional list of column names to fill. If None, fills all numeric
-                columns.
-
-        Returns:
-            DataFrame with NaN values replaced in numeric columns.
 
-        Examples:
-            >>> df = df.fill_nan(0)  # Fill all NaNs with 0 in numeric columns
-            >>> # Fill NaNs in specific numeric columns
-            >>> df = df.fill_nan(99.9, subset=["price", "score"])
-
-        Notes:
-            - Only fills NaN values in numeric columns (float32, float64)
-            - Non-numeric columns are kept unchanged
-            - For columns not in subset, the original column is kept unchanged
-            - Value must be numeric (int or float)
-        """
-        if not isinstance(value, (int, float)):
-            raise ValueError("Value must be numeric (int or float)")
-
-        # Get columns to process
-        if subset is None:
-            # Only get numeric columns if no subset specified
-            subset = [
-                field.name
-                for field in self.schema()
-                if pa.types.is_floating(field.type)
-            ]
-        else:
-            schema_cols = self.schema().names
-            for col in subset:
-                if col not in schema_cols:
-                    raise ValueError(f"Column '{col}' not found in DataFrame")
-                if not pa.types.is_floating(self.schema().field(col).type):
-                    raise ValueError(f"Column '{col}' is not a numeric column")
-
-        # Build expressions for select
-        exprs = []
-        for col_name in self.schema().names:
-            if col_name in subset:
-                # Use nanvl function to replace NaN values
-                expr = f.nanvl(f.col(col_name), f.lit(value))
-                exprs.append(expr.alias(col_name))
-            else:
-                # Keep columns not in subset unchanged
-                exprs.append(f.col(col_name))
-
-        return self.select(*exprs)
+        return DataFrame(self.df.fill_null(value, subset))
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -797,6 +797,25 @@ impl PyDataFrame {
     fn count(&self, py: Python) -> PyDataFusionResult<usize> {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
+
+    /// Replace nulls with `value` in the named `columns`, or in all columns when omitted.
+    ///
+    /// Errors from converting `value` or from the underlying `fill_null` are returned.
+    #[pyo3(signature = (value, columns=None))]
+    fn fill_null(
+        &self,
+        value: PyObject,
+        columns: Option<Vec<PyBackedStr>>,
+        py: Python,
+    ) -> PyDataFusionResult<Self> {
+        let fill_value = python_value_to_scalar_value(&value, py)?;
+        // No `columns` argument becomes an empty list, which means "all columns".
+        let targets: Vec<String> = columns
+            .map(|names| names.iter().map(|name| name.to_string()).collect())
+            .unwrap_or_default();
+        let filled = self.df.as_ref().clone().fill_null(fill_value, targets)?;
+        Ok(Self::new(filled))
+    }
 }
 
 /// Print DataFrame
@@ -951,3 +970,47 @@ async fn collect_record_batches_to_display(
 
     Ok((record_batches, has_more))
 }
+
+/// Convert a Python fill value into a DataFusion [`ScalarValue`].
+///
+/// Tried in order: `bool` → `Boolean`, `int` → `Int64`, `float` → `Float64`,
+/// `str` → `Utf8`, naive `datetime.datetime` → `TimestampNanosecond` without a
+/// timezone. Any other object falls back to its `str()` text as `Utf8`.
+///
+/// # Errors
+/// Fails for `None`, for a datetime that cannot be extracted or does not fit in
+/// `i64` nanoseconds, and when calling `str()` on the object raises.
+fn python_value_to_scalar_value(value: &PyObject, py: Python) -> PyDataFusionResult<ScalarValue> {
+    let value = value.bind(py);
+    if value.is_none() {
+        let msg = "Cannot use None as fill value".to_string();
+        return Err(PyDataFusionError::Common(msg));
+    }
+    // Python's `bool` subclasses `int`, so it must be tested before the numeric
+    // extractions; otherwise `True`/`False` would be stored as `Int64(1)`/`Int64(0)`.
+    if let Ok(val) = value.extract::<bool>() {
+        return Ok(ScalarValue::Boolean(Some(val)));
+    }
+    if let Ok(val) = value.extract::<i64>() {
+        return Ok(ScalarValue::Int64(Some(val)));
+    }
+    if let Ok(val) = value.extract::<f64>() {
+        return Ok(ScalarValue::Float64(Some(val)));
+    }
+    if let Ok(val) = value.extract::<String>() {
+        return Ok(ScalarValue::Utf8(Some(val)));
+    }
+    // Branch on the isinstance result itself so non-datetime objects reach the fallback.
+    let datetime_cls = py.import("datetime")?.getattr("datetime")?;
+    if value.is_instance(&datetime_cls)? {
+        let naive_dt = value.extract::<chrono::NaiveDateTime>()?;
+        let nanos = naive_dt.and_utc().timestamp_nanos_opt().ok_or_else(|| {
+            PyDataFusionError::Common("Datetime out of nanosecond timestamp range".to_string())
+        })?;
+        return Ok(ScalarValue::TimestampNanosecond(Some(nanos), None));
+    }
+    let py_str = value.str().map_err(|_| {
+        PyDataFusionError::Common("Unsupported Python type for fill_null".to_string())
+    })?;
+    Ok(ScalarValue::Utf8(Some(py_str.to_string_lossy().into_owned())))
+}