Skip to content

Commit 1d2dc17

Browse files
cperivol and Google-ML-Automation
authored and committed
[mgpu] Pointwise op can handle LHS splats.
PiperOrigin-RevId: 698818035
1 parent b1b1ad6 commit 1d2dc17

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

jax/experimental/mosaic/gpu/fragmented_array.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,38 @@ def to_layout(self, new_layout: FragmentedLayout):
623623
)
624624

625625
def _pointwise(self, op, *other, output_is_signed: bool | None = None):
626+
if isinstance(self.layout, WGSplatFragLayout):
627+
# Find either the largest operand or an operand that has a
# concrete layout, and base the layout computation on that.
629+
widest_idx = None
630+
for i, o in enumerate(other):
631+
if not isinstance(o, FragmentedArray):
632+
continue
633+
elif not isinstance(o.layout, WGSplatFragLayout):
634+
widest_idx = i
635+
break
636+
elif not o.layout.can_broadcast_to(self.layout.shape):
637+
# Note: equal shapes can be broadcast to each other. Using
638+
# the negation we make sure to only consider strictly larger
639+
# shapes so that we don't end up ping ponging between equal
640+
# shapes.
641+
widest_idx = i
642+
643+
if widest_idx is not None:
644+
# We need to retain the order of arguments that the op
645+
# expects.
646+
def _op(wide_o, self_o, *args):
  # NOTE(review): the original diff sliced with `widest_idx - 1`, which
  # misplaces the wide operand whenever `op` takes three or more operands
  # (e.g. widest_idx=1 in a ternary op would produce
  # op(self, wide, other_0) instead of op(self, other_0, wide)).
  # `args` holds `other` with the widest element removed, so the wide
  # operand must be re-inserted at position `widest_idx` exactly.
  pre_wide = args[:widest_idx]
  post_wide = args[widest_idx:]
  return op(self_o, *pre_wide, wide_o, *post_wide)
650+
return other[widest_idx]._pointwise(
651+
_op,
652+
self,
653+
*other[:widest_idx],
654+
*other[widest_idx + 1:],
655+
output_is_signed=output_is_signed,
656+
)
657+
626658
other_arrs = []
627659
for o in other:
628660
if not isinstance(o, FragmentedArray):
@@ -642,7 +674,7 @@ def _pointwise(self, op, *other, output_is_signed: bool | None = None):
642674
o.registers.flat[0],
643675
shape=self.shape,
644676
layout=self.layout,
645-
is_signed=self.is_signed,
677+
is_signed=o.is_signed,
646678
)
647679
else:
648680
if self.layout != o.layout:

tests/mosaic/gpu_test.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,6 +1489,29 @@ def kernel(ctx, dst, _):
14891489
)()
14901490
np.testing.assert_array_equal(result, np.full((128, 32), 3.14, np.float32))
14911491

1492+
1493+
def test_splat_binary_ops(self):
  """Checks that binary ops accept a splat-layout operand on either side.

  Exercises the `_pointwise` splat handling: `pi_arr_sq` puts the splat on
  the RHS, while `pi_arr_cube` puts the splat on the LHS — the case this
  commit enables. Both results must come out in the strided layout of the
  non-splat operand.
  """
  def kernel(ctx, src, dst, _):
    f32 = ir.F32Type.get()
    # Load the input in a concrete (strided) register layout.
    pi_arr = mgpu.FragmentedArray.load_strided(src)
    assert isinstance(pi_arr.layout, mgpu.WGStridedFragLayout)
    # Build a rank-0 splat array holding the scalar 3.14.
    pi_scalar = arith.constant(f32, ir.FloatAttr.get(f32, 3.14))
    pi_splat = mgpu.FragmentedArray.splat(pi_scalar, ())
    assert isinstance(pi_splat.layout, mgpu.WGSplatFragLayout)
    # Splat on the RHS: result must adopt the strided layout.
    pi_arr_sq = pi_arr * pi_splat.broadcast(pi_arr.shape)
    assert isinstance(pi_arr_sq.layout, mgpu.WGStridedFragLayout)
    # Splat on the LHS: previously unsupported; must also come out strided.
    pi_arr_cube = pi_splat.broadcast(pi_arr.shape) * pi_arr_sq
    assert isinstance(pi_arr_cube.layout, mgpu.WGStridedFragLayout)
    (pi_arr_sq + pi_arr_cube).store_untiled(dst)

  out_shape = jax.ShapeDtypeStruct((128, 32), jnp.float32)
  # Input is uniformly 3.14, so sq + cube is uniformly 3.14**2 + 3.14**3.
  inp = jnp.ones_like(out_shape) * 3.14
  result = mgpu.as_gpu_kernel(
      kernel, (1, 1, 1), (128, 1, 1), inp, out_shape, ()
  )(inp)
  np.testing.assert_allclose(result, np.full((128, 32), 3.14 ** 2 + 3.14 ** 3, np.float32))
1513+
1514+
14921515
@parameterized.product(in_shape=((128, 128), (128, 64), (64, 128)))
14931516
def test_strided_load_store(self, in_shape):
14941517
def kernel(ctx, *args):

0 commit comments

Comments
 (0)