feat: Allow log with non-integer base on decimals

Yuvraj-cyborg · Yuvraj-cyborg · commit f0b96f728095 · 2025-12-20T21:06:12.000+05:30
diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs
@@ -21,9 +21,7 @@ use std::any::Any;
 
 use super::power::PowerFunc;
 
-use crate::utils::{
-    calculate_binary_math, decimal32_to_i32, decimal64_to_i64, decimal128_to_i128,
-};
+use crate::utils::calculate_binary_math;
 use arrow::array::{Array, ArrayRef};
 use arrow::datatypes::{
     DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, Float16Type,
@@ -44,7 +42,7 @@ use datafusion_expr::{
 };
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
-use num_traits::Float;
+use num_traits::{Float, ToPrimitive};
 
 #[user_doc(
     doc_section(label = "Math Functions"),
@@ -104,91 +102,70 @@ impl LogFunc {
     }
 }
 
-/// Binary function to calculate logarithm of Decimal32 `value` using `base` base
-/// Returns error if base is invalid
-fn log_decimal32(value: i32, scale: i8, base: f64) -> Result<f64, ArrowError> {
-    if !base.is_finite() || base.trunc() != base {
-        return Err(ArrowError::ComputeError(format!(
-            "Log cannot use non-integer base: {base}"
-        )));
-    }
-    if (base as u32) < 2 {
-        return Err(ArrowError::ComputeError(format!(
-            "Log base must be greater than 1: {base}"
-        )));
-    }
-
-    let unscaled_value = decimal32_to_i32(value, scale)?;
-    if unscaled_value > 0 {
-        let log_value: u32 = unscaled_value.ilog(base as i32);
-        Ok(log_value as f64)
-    } else {
-        // Reflect f64::log behaviour
-        Ok(f64::NAN)
-    }
+/// Checks if the base is valid for the efficient integer logarithm algorithm.
+#[inline]
+fn is_valid_integer_base(base: f64) -> bool {
+    base.trunc() == base && base >= 2.0 && base <= u32::MAX as f64
 }
 
-/// Binary function to calculate logarithm of Decimal64 `value` using `base` base
-/// Returns error if base is invalid
-fn log_decimal64(value: i64, scale: i8, base: f64) -> Result<f64, ArrowError> {
-    if !base.is_finite() || base.trunc() != base {
-        return Err(ArrowError::ComputeError(format!(
-            "Log cannot use non-integer base: {base}"
-        )));
-    }
-    if (base as u32) < 2 {
-        return Err(ArrowError::ComputeError(format!(
-            "Log base must be greater than 1: {base}"
-        )));
+/// Generic function to calculate logarithm of a decimal value using the given base.
+///
+/// For integer bases >= 2 with non-negative scale, uses the integer `ilog` algorithm (floored result).
+/// For all other cases (non-integer bases, negative bases, non-finite bases),
+/// falls back to f64 computation which naturally returns NaN for invalid inputs,
+/// matching the behavior of `f64::log`.
+fn log_decimal<T>(value: T, scale: i8, base: f64) -> Result<f64, ArrowError>
+where
+    T: ToPrimitive + Copy,
+{
+    // Integer base >= 2 and scale >= 0: use `ilog`. NOTE(review): 0 < value < 1 unscales to 0 => NaN
+    if is_valid_integer_base(base)
+        && scale >= 0
+        && let Some(unscaled) = unscale_decimal_value(value, scale)
+    {
+        return if unscaled > 0 {
+            Ok(unscaled.ilog(base as u128) as f64)
+        } else {
+            Ok(f64::NAN)
+        };
     }
 
-    let unscaled_value = decimal64_to_i64(value, scale)?;
-    if unscaled_value > 0 {
-        let log_value: u32 = unscaled_value.ilog(base as i64);
-        Ok(log_value as f64)
-    } else {
-        // Reflect f64::log behaviour
-        Ok(f64::NAN)
-    }
+    // Fallback to f64 computation for non-integer bases, negative scale, etc.
+    // `f64::log` yields NaN for a negative value or negative base, and an infinity for a zero value
+    decimal_to_f64(value, scale).map(|v| v.log(base))
 }
 
-/// Binary function to calculate an integer logarithm of Decimal128 `value` using `base` base
-/// Returns error if base is invalid
-fn log_decimal128(value: i128, scale: i8, base: f64) -> Result<f64, ArrowError> {
-    if !base.is_finite() || base.trunc() != base {
-        return Err(ArrowError::ComputeError(format!(
-            "Log cannot use non-integer base: {base}"
-        )));
-    }
-    if (base as u32) < 2 {
-        return Err(ArrowError::ComputeError(format!(
-            "Log base must be greater than 1: {base}"
-        )));
-    }
-
-    if value <= 0 {
-        // Reflect f64::log behaviour
-        return Ok(f64::NAN);
-    }
+/// Unscale a decimal value by dividing by 10^scale, returning the result as u128.
+/// Returns None if the value is negative or does not fit in u128, or if 10^scale overflows u128.
+#[inline]
+fn unscale_decimal_value<T: ToPrimitive>(value: T, scale: i8) -> Option<u128> {
+    let value_u128 = value.to_u128()?;
+    let divisor = 10u128.checked_pow(scale as u32)?;
+    Some(value_u128 / divisor)
+}
 
-    if scale < 0 {
-        let actual_value = (value as f64) * 10.0_f64.powi(-(scale as i32));
-        Ok(actual_value.log(base))
-    } else {
-        let unscaled_value = decimal128_to_i128(value, scale)?;
-        let log_value: u32 = unscaled_value.ilog(base as i128);
-        Ok(log_value as f64)
-    }
+/// Convert a scaled decimal value to f64.
+#[inline]
+fn decimal_to_f64<T: ToPrimitive>(value: T, scale: i8) -> Result<f64, ArrowError> {
+    let value_f64 = value
+        .to_f64()
+        .ok_or_else(|| ArrowError::ComputeError("Cannot convert value to f64".to_string()))?;
+    let scale_factor = 10f64.powi(scale as i32);
+    Ok(value_f64 / scale_factor)
 }
 
-/// Binary function to calculate an integer logarithm of Decimal128 `value` using `base` base
-/// Returns error if base is invalid or if value is out of bounds of Decimal128
 fn log_decimal256(value: i256, scale: i8, base: f64) -> Result<f64, ArrowError> {
+    // Try to convert to i128 for the optimized path
     match value.to_i128() {
-        Some(value) => log_decimal128(value, scale, base),
-        None => Err(ArrowError::NotYetImplemented(format!(
-            "Log of Decimal256 larger than Decimal128 is not yet supported: {value}"
-        ))),
+        Some(v) => log_decimal(v, scale, base),
+        None => {
+            // For very large Decimal256 values, use f64 computation
+            let value_f64 = value.to_f64().ok_or_else(|| {
+                ArrowError::ComputeError(format!("Cannot convert {value} to f64"))
+            })?;
+            let scale_factor = 10f64.powi(scale as i32);
+            Ok((value_f64 / scale_factor).log(base))
+        }
     }
 }
 
@@ -282,21 +259,21 @@ impl ScalarUDFImpl for LogFunc {
                 calculate_binary_math::<Decimal32Type, Float64Type, Float64Type, _>(
                     &value,
                     &base,
-                    |value, base| log_decimal32(value, *scale, base),
+                    |value, base| log_decimal(value, *scale, base),
                 )?
             }
             DataType::Decimal64(_, scale) => {
                 calculate_binary_math::<Decimal64Type, Float64Type, Float64Type, _>(
                     &value,
                     &base,
-                    |value, base| log_decimal64(value, *scale, base),
+                    |value, base| log_decimal(value, *scale, base),
                 )?
             }
             DataType::Decimal128(_, scale) => {
                 calculate_binary_math::<Decimal128Type, Float64Type, Float64Type, _>(
                     &value,
                     &base,
-                    |value, base| log_decimal128(value, *scale, base),
+                    |value, base| log_decimal(value, *scale, base),
                 )?
             }
             DataType::Decimal256(_, scale) => {
@@ -433,7 +410,7 @@ mod tests {
         let value = 10_i128.pow(35);
         assert_eq!((value as f64).log2(), 116.26748332105768);
         assert_eq!(
-            log_decimal128(value, 0, 2.0).unwrap(),
+            log_decimal(value, 0, 2.0).unwrap(),
             // TODO: see we're losing our decimal points compared to above
             //       https://github.com/apache/datafusion/issues/18524
             116.0
@@ -1151,7 +1128,8 @@ mod tests {
     }
 
     #[test]
-    fn test_log_decimal128_wrong_base() {
+    fn test_log_decimal128_invalid_base() {
+        // Invalid base (-2.0) should return NaN, matching f64::log behavior
         let arg_fields = vec![
             Field::new("b", DataType::Float64, false).into(),
             Field::new("x", DataType::Decimal128(38, 0), false).into(),
@@ -1166,16 +1144,26 @@ mod tests {
             return_field: Field::new("f", DataType::Float64, true).into(),
             config_options: Arc::new(ConfigOptions::default()),
         };
-        let result = LogFunc::new().invoke_with_args(args);
-        assert!(result.is_err());
-        assert_eq!(
-            "Arrow error: Compute error: Log base must be greater than 1: -2",
-            result.unwrap_err().to_string().lines().next().unwrap()
-        );
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("should not error on invalid base");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+                assert_eq!(floats.len(), 1);
+                assert!(floats.value(0).is_nan());
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
     }
 
     #[test]
-    fn test_log_decimal256_error() {
+    fn test_log_decimal256_large() {
+        // Large Decimal256 values that don't fit in i128 now use f64 fallback
         let arg_field = Field::new("a", DataType::Decimal256(38, 0), false).into();
         let args = ScalarFunctionArgs {
             args: vec![
@@ -1189,11 +1177,26 @@ mod tests {
             return_field: Field::new("f", DataType::Float64, true).into(),
             config_options: Arc::new(ConfigOptions::default()),
         };
-        let result = LogFunc::new().invoke_with_args(args);
-        assert!(result.is_err());
-        assert_eq!(
-            result.unwrap_err().to_string().lines().next().unwrap(),
-            "Arrow error: Not yet implemented: Log of Decimal256 larger than Decimal128 is not yet supported: 170141183460469231731687303715884106727"
-        );
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("should handle large Decimal256 via f64 fallback");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+                assert_eq!(floats.len(), 1);
+                // The f64 fallback may lose some precision for very large numbers,
+                // but we verify we get a reasonable positive result (not NaN/infinity)
+                let log_result = floats.value(0);
+                assert!(
+                    log_result.is_finite() && log_result > 0.0,
+                    "Expected positive finite log result, got {log_result}"
+                );
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
     }
 }
diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt
@@ -889,14 +889,20 @@ select log(2, 100000000000000000000000000000000000::decimal(38,0));
 ----
 116
 
-# log(10^35) for decimal128 with another base
+# log(10^35) for decimal128 with another base (float base)
 # TODO: this should be 116.267483321058, error with native decimal log impl
 #       https://github.com/apache/datafusion/issues/18524
 query R
 select log(2.0, 100000000000000000000000000000000000::decimal(38,0));
 ----
 116
 
+# log with non-integer base now works (fallback to f64)
+query R
+select log(2.5, 100::decimal(38,0));
+----
+5.025883189464
+
 # null cases
 query R
 select log(null, 100);