Improving hypot/rhypot.

vaijira · vaijira · commit 2d49684430da · 2025-11-29T12:50:32.000+01:00
Removing conditional branches when possible.
diff --git a/crates/cubecl-cpu/src/compiler/visitor/operation/arithmetic.rs b/crates/cubecl-cpu/src/compiler/visitor/operation/arithmetic.rs
@@ -450,93 +450,76 @@ impl<'a> Visitor<'a> {
                 let (a, b) = self.get_binary_op_variable(hypot.lhs, hypot.rhs);
                 let abs_a = self.get_absolute_val(hypot.lhs.ty, a);
                 let abs_b = self.get_absolute_val(hypot.rhs.ty, b);
-
-                let max =
-                    self.append_operation_with_result(arith::maxnumf(abs_a, abs_b, self.location));
                 let zero = self.create_float_constant_from_item(hypot.lhs.ty, 0.0);
                 let one = self.create_float_constant_from_item(hypot.lhs.ty, 1.0);
-                let is_zero = self.append_operation_with_result(arith::cmpf(
+                let max =
+                    self.append_operation_with_result(arith::maxnumf(abs_a, abs_b, self.location));
+                let is_max_zero = self.append_operation_with_result(arith::cmpf(
                     self.context,
                     arith::CmpfPredicate::Oeq,
                     max,
                     zero,
                     self.location,
                 ));
-                let scale = self.append_operation_with_result(arith::select(
-                    is_zero,
+                let max_safe = self.append_operation_with_result(arith::select(
+                    is_max_zero,
                     one,
                     max,
                     self.location,
                 ));
-                let a_scale =
-                    self.append_operation_with_result(arith::divf(abs_a, scale, self.location));
-                let b_scale =
-                    self.append_operation_with_result(arith::divf(abs_b, scale, self.location));
-                let a_scale_squared =
-                    self.append_operation_with_result(arith::mulf(a_scale, a_scale, self.location));
-                let b_scale_squared =
-                    self.append_operation_with_result(arith::mulf(b_scale, b_scale, self.location));
-                let sum = self.append_operation_with_result(arith::addf(
-                    a_scale_squared,
-                    b_scale_squared,
-                    self.location,
-                ));
+                let min =
+                    self.append_operation_with_result(arith::minimumf(abs_a, abs_b, self.location));
+                let t =
+                    self.append_operation_with_result(arith::divf(min, max_safe, self.location));
+                let t_square = self.append_operation_with_result(arith::mulf(t, t, self.location));
+                let t_square_plus_one =
+                    self.append_operation_with_result(arith::addf(t_square, one, self.location));
                 let square_root = self.append_operation_with_result(llvm_ods::intr_sqrt(
                     self.context,
-                    sum,
-                    self.location,
-                ));
-                let result = self.append_operation_with_result(arith::mulf(
-                    square_root,
-                    scale,
+                    t_square_plus_one,
                     self.location,
                 ));
+                let result =
+                    self.append_operation_with_result(arith::mulf(max, square_root, self.location));
 
                 self.insert_variable(out, result);
             }
             Arithmetic::Rhypot(hypot) => {
                 let (a, b) = self.get_binary_op_variable(hypot.lhs, hypot.rhs);
                 let abs_a = self.get_absolute_val(hypot.lhs.ty, a);
                 let abs_b = self.get_absolute_val(hypot.rhs.ty, b);
-
-                let max =
-                    self.append_operation_with_result(arith::maxnumf(abs_a, abs_b, self.location));
                 let zero = self.create_float_constant_from_item(hypot.lhs.ty, 0.0);
                 let one = self.create_float_constant_from_item(hypot.lhs.ty, 1.0);
-                let is_zero = self.append_operation_with_result(arith::cmpf(
+                let max =
+                    self.append_operation_with_result(arith::maxnumf(abs_a, abs_b, self.location));
+                let is_max_zero = self.append_operation_with_result(arith::cmpf(
                     self.context,
                     arith::CmpfPredicate::Oeq,
                     max,
                     zero,
                     self.location,
                 ));
-                let scale = self.append_operation_with_result(arith::select(
-                    is_zero,
+                let max_safe = self.append_operation_with_result(arith::select(
+                    is_max_zero,
                     one,
                     max,
                     self.location,
                 ));
-                let a_scale =
-                    self.append_operation_with_result(arith::divf(abs_a, scale, self.location));
-                let b_scale =
-                    self.append_operation_with_result(arith::divf(abs_b, scale, self.location));
-                let a_scale_squared =
-                    self.append_operation_with_result(arith::mulf(a_scale, a_scale, self.location));
-                let b_scale_squared =
-                    self.append_operation_with_result(arith::mulf(b_scale, b_scale, self.location));
-                let sum = self.append_operation_with_result(arith::addf(
-                    a_scale_squared,
-                    b_scale_squared,
-                    self.location,
-                ));
-                let rsquare_root = self.append_operation_with_result(math_ods::rsqrt(
+                let min =
+                    self.append_operation_with_result(arith::minimumf(abs_a, abs_b, self.location));
+                let t =
+                    self.append_operation_with_result(arith::divf(min, max_safe, self.location));
+                let t_square = self.append_operation_with_result(arith::mulf(t, t, self.location));
+                let t_square_plus_one =
+                    self.append_operation_with_result(arith::addf(t_square, one, self.location));
+                let inverse_square_root = self.append_operation_with_result(math_ods::rsqrt(
                     self.context,
-                    sum,
+                    t_square_plus_one,
                     self.location,
                 ));
                 let result = self.append_operation_with_result(arith::divf(
-                    rsquare_root,
-                    scale,
+                    inverse_square_root,
+                    max,
                     self.location,
                 ));
 
diff --git a/crates/cubecl-cuda/src/compute/server.rs b/crates/cubecl-cuda/src/compute/server.rs
@@ -368,7 +368,8 @@ impl ComputeServer for CudaServer {
                     pixels_per_column: _,
                 } => {
                     return Err(LaunchError::Unknown {
-                        context: "CUDA version 12.8 required for tensor map format Im2colWide".into,
+                        context: "CUDA version 12.8 required for tensor map format Im2colWide"
+                            .into(),
                     });
                 }
             };
diff --git a/crates/cubecl-spirv/src/arithmetic.rs b/crates/cubecl-spirv/src/arithmetic.rs
@@ -538,44 +538,35 @@ impl<T: SpirvTarget> SpirvCompiler<T> {
             Arithmetic::Hypot(op) => {
                 self.compile_binary_op(op, out, uniform, |b, out_ty, ty, lhs, rhs, out| {
                     let relaxed = matches!(out_ty.elem(), Elem::Relaxed);
+                    let zero = b.static_cast(ConstVal::Bit32(0), &Elem::Int(32, false), &out_ty);
+                    let one = b.static_cast(ConstVal::Bit32(1), &Elem::Int(32, false), &out_ty);
                     let abs_a = b.id();
                     T::f_abs(b, ty, lhs, abs_a);
                     let abs_b = b.id();
                     T::f_abs(b, ty, rhs, abs_b);
                     let max = b.id();
                     T::f_max(b, ty, abs_a, abs_b, max);
-                    let zero = b.static_cast(ConstVal::Bit32(0), &Elem::Int(32, false), &out_ty);
-                    let one = b.static_cast(ConstVal::Bit32(1), &Elem::Int(32, false), &out_ty);
+                    let min = b.id();
+                    T::f_min(b, ty, abs_a, abs_b, min);
                     let bool = Elem::Bool.id(b);
-                    let is_zero = b.f_ord_equal(bool, None, max, zero).unwrap();
-                    let scale = b.id();
-                    b.select(ty, Some(scale), is_zero, one, max).unwrap();
-                    let a_scaled = b.id();
-                    b.f_div(ty, Some(a_scaled), abs_a, scale).unwrap();
-                    let b_scaled = b.id();
-                    b.f_div(ty, Some(b_scaled), abs_b, scale).unwrap();
-                    let a_scale_squared = b.id();
-                    b.f_mul(ty, Some(a_scale_squared), a_scaled, a_scaled)
-                        .unwrap();
-                    let b_scale_squared = b.id();
-                    b.f_mul(ty, Some(b_scale_squared), b_scaled, b_scaled)
-                        .unwrap();
-                    let sum = b.id();
-                    b.f_add(ty, Some(sum), a_scale_squared, b_scale_squared)
-                        .unwrap();
+                    let is_max_zero = b.f_ord_equal(bool, None, max, zero).unwrap();
+                    let max_safe = b.id();
+                    b.select(ty, Some(max_safe), is_max_zero, one, max).unwrap();
+                    let t = b.id();
+                    b.f_div(ty, Some(t), min, max_safe).unwrap();
+                    let t_fma = b.gl_fma(ty, t, t, one).unwrap();
                     let square_root = b.id();
-                    T::sqrt(b, ty, sum, square_root);
+                    T::sqrt(b, ty, t_fma, square_root);
+                    // Every intermediate, `min` and `t` included, is decorated when relaxed.
                     let ids = [
                         abs_a,
                         abs_b,
                         max,
-                        is_zero,
-                        scale,
-                        a_scaled,
-                        b_scaled,
-                        a_scale_squared,
-                        b_scale_squared,
-                        sum,
+                        min,
+                        is_max_zero,
+                        max_safe,
+                        t,
+                        t_fma,
                         square_root,
                         out,
                     ];
@@ -585,51 +576,42 @@ impl<T: SpirvTarget> SpirvCompiler<T> {
                             b.decorate(id, Decoration::RelaxedPrecision, []);
                         }
                     }
-                    b.f_mul(ty, Some(out), square_root, scale).unwrap();
+                    b.f_mul(ty, Some(out), square_root, max).unwrap();
                 })
             }
             Arithmetic::Rhypot(op) => {
                 self.compile_binary_op(op, out, uniform, |b, out_ty, ty, lhs, rhs, out| {
                     let relaxed = matches!(out_ty.elem(), Elem::Relaxed);
+                    let zero = b.static_cast(ConstVal::Bit32(0), &Elem::Int(32, false), &out_ty);
+                    let one = b.static_cast(ConstVal::Bit32(1), &Elem::Int(32, false), &out_ty);
                     let abs_a = b.id();
                     T::f_abs(b, ty, lhs, abs_a);
                     let abs_b = b.id();
                     T::f_abs(b, ty, rhs, abs_b);
                     let max = b.id();
                     T::f_max(b, ty, abs_a, abs_b, max);
-                    let zero = b.static_cast(ConstVal::Bit32(0), &Elem::Int(32, false), &out_ty);
-                    let one = b.static_cast(ConstVal::Bit32(1), &Elem::Int(32, false), &out_ty);
+                    let min = b.id();
+                    T::f_min(b, ty, abs_a, abs_b, min);
                     let bool = Elem::Bool.id(b);
-                    let is_zero = b.f_ord_equal(bool, None, max, zero).unwrap();
-                    let scale = b.id();
-                    b.select(ty, Some(scale), is_zero, one, max).unwrap();
-                    let a_scaled = b.id();
-                    b.f_div(ty, Some(a_scaled), abs_a, scale).unwrap();
-                    let b_scaled = b.id();
-                    b.f_div(ty, Some(b_scaled), abs_b, scale).unwrap();
-                    let a_scale_squared = b.id();
-                    b.f_mul(ty, Some(a_scale_squared), a_scaled, a_scaled)
-                        .unwrap();
-                    let b_scale_squared = b.id();
-                    b.f_mul(ty, Some(b_scale_squared), b_scaled, b_scaled)
-                        .unwrap();
-                    let sum = b.id();
-                    b.f_add(ty, Some(sum), a_scale_squared, b_scale_squared)
-                        .unwrap();
-                    let rsquare_root = b.id();
-                    T::inverse_sqrt(b, ty, sum, rsquare_root);
+                    let is_max_zero = b.f_ord_equal(bool, None, max, zero).unwrap();
+                    let max_safe = b.id();
+                    b.select(ty, Some(max_safe), is_max_zero, one, max).unwrap();
+                    let t = b.id();
+                    b.f_div(ty, Some(t), min, max_safe).unwrap();
+                    let t_fma = b.gl_fma(ty, t, t, one).unwrap();
+                    let inverse_square_root = b.id();
+                    T::inverse_sqrt(b, ty, t_fma, inverse_square_root);
+                    // Every intermediate, `min` and `t` included, is decorated when relaxed.
                     let ids = [
                         abs_a,
                         abs_b,
                         max,
-                        is_zero,
-                        scale,
-                        a_scaled,
-                        b_scaled,
-                        a_scale_squared,
-                        b_scale_squared,
-                        sum,
-                        rsquare_root,
+                        min,
+                        is_max_zero,
+                        max_safe,
+                        t,
+                        t_fma,
+                        inverse_square_root,
                         out,
                     ];
                     for id in ids {
@@ -638,7 +620,7 @@ impl<T: SpirvTarget> SpirvCompiler<T> {
                             b.decorate(id, Decoration::RelaxedPrecision, []);
                         }
                     }
-                    b.f_div(ty, Some(out), rsquare_root, scale).unwrap();
+                    b.f_div(ty, Some(out), inverse_square_root, max).unwrap();
                 })
             }
             Arithmetic::Sqrt(op) => {
diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs b/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs
@@ -352,13 +352,13 @@ fn format_hypot_primitive(
         f,
         "
 fn {function_name}(lhs: {elem}, rhs: {elem}) -> {elem} {{
-    if (lhs == 0.0) {{ return abs(rhs); }}
-    if (rhs == 0.0) {{ return abs(lhs); }}
     let a = abs(lhs);
     let b = abs(rhs);
     let max_val = max(a, b);
+    // Divide by 1 when both inputs are zero so `t` stays finite; `select` avoids a branch.
+    let max_val_safe = select(max_val, 1.0, max_val == 0.0);
     let min_val = min(a, b);
-    let t = min_val / max_val;
+    let t = min_val / max_val_safe;
 
     return max_val * sqrt(fma(t, t, 1.0));
 }}
@@ -378,10 +378,11 @@ fn format_rhypot_primitive(
 fn {function_name}(lhs: {elem}, rhs: {elem}) -> {elem} {{
     let a = abs(lhs);
     let b = abs(rhs);
-    if (a == 0.0 && b == 0.0) {{ return bitcast<f32>(0x7F800000u); }}
     let max_val = max(a, b);
+    // Keep `t` finite for zero inputs; the final division by `max_val` is left unguarded.
+    let max_val_safe = select(max_val, 1.0, max_val == 0.0);
     let min_val = min(a, b);
-    let t = min_val / max_val;
+    let t = min_val / max_val_safe;
 
     return inverseSqrt(fma(t, t, 1.0)) / max_val;
 }}