Commit 184f6db

Merge OpenAI Triton commit d57cbee (#4453)

This PR changes the Triton base from 4dfdc32 to d57cbee (Jun 6). Pass rate: 97.23% -> 97.2%.

2 parents 9d7bc59 + dee2d72

58 files changed: +4613 −3996 lines

.github/workflows/integration-tests-nvidia.yml

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ jobs:
         if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
           source /venv/bin/activate
         fi
-        make test-unit
+        make NUM_PROCS=24 test-unit
     - name: Run interpreter tests
       if: ${{ matrix.runner[0] == 'nvidia-h100' }}
       run: make test-interpret

Makefile

Lines changed: 7 additions & 6 deletions
@@ -7,6 +7,7 @@ BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmak
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
 LLVM_BUILD_PATH ?= ".llvm-project/build"
+NUM_PROCS ?= 8

 # Incremental builds

@@ -30,25 +31,25 @@ test-cpp:

 .PHONY: test-unit
 test-unit: all
-	cd python/test/unit && $(PYTEST) -s -n 8 --ignore=language/test_line_info.py \
+	cd python/test/unit && $(PYTEST) -s -n $(NUM_PROCS) --ignore=language/test_line_info.py \
 		--ignore=language/test_subprocess.py --ignore=test_debug.py
-	$(PYTEST) -s -n 8 python/test/unit/language/test_subprocess.py
-	$(PYTEST) -s -n 8 python/test/unit/test_debug.py --forked
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
 	$(PYTEST) -s -n 8 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
 	$(PYTEST) -vs python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
-	$(PYTEST) -s -n 8 python/test/gluon
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon

 .PHONY: test-gluon
 test-gluon: all
-	$(PYTEST) -s -n 8 python/test/gluon
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/gluon

 .PHONY: test-regression
 test-regression: all
-	$(PYTEST) -s -n 8 python/test/regression
+	$(PYTEST) -s -n $(NUM_PROCS) python/test/regression

 .PHONY: test-interpret
 test-interpret: all
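
Test parallelism is now a Makefile variable (NUM_PROCS, defaulting to the previous hard-coded 8), so runners can scale it to their core count; the GB200 workflow above, for example, invokes make NUM_PROCS=24 test-unit. Note that the python/triton_kernels suite keeps its hard-coded -n 8.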

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 7 additions & 0 deletions
@@ -263,6 +263,13 @@ void replaceUsesWithLocalLoad(
     OpBuilder &builder, OpResult old,
     TypedValue<triton::gpu::MemDescType> alloc,
     TypedValue<triton::gpu::AsyncTokenType> token = {});
+
+// Return true if the value comes from a load or a block argument.
+// This will skip convert layouts and memdesc views.
+// This is a helper useful to know if value is likely to come from shared memory
+// after converting loads into async loads.
+bool comesFromLoadOrBlockArg(Value v);
+
 } // namespace mlir::triton

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
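
This declares, as a shared utility, the predicate that AccelerateMatmul.cpp previously kept as a local lambda (removed below); the implementation lands in Utility.cpp further down.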

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 2 additions & 2 deletions
@@ -368,8 +368,8 @@ OpFoldResult MakeRangeOp::fold(FoldAdaptor adaptor) {
 LogicalResult MakeRangeOp::verify() {
   int64_t start = getStartAttr().getInt();
   int64_t end = getEndAttr().getInt();
-  if (start > end) {
-    return this->emitOpError() << "start must be less than or equal to end";
+  if (start >= end) {
+    return this->emitOpError() << "start must be less than end";
   }
   auto ty = getType();
   if (ty.getShape().size() != 1) {
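
The verifier previously accepted start == end, which yields a zero-element tensor; it now requires start to be strictly smaller. As a minimal sketch of the user-visible effect (assuming the frontend forwards its bounds to tt.make_range unchanged; empty_range_kernel is a hypothetical name, not part of this commit):

import triton
import triton.language as tl


@triton.jit
def empty_range_kernel():
    # start == end: formerly allowed (only start > end was rejected)
    # and produced an empty tensor; the tightened verifier now reports
    # "start must be less than end".
    idx = tl.arange(4, 4)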

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 0 additions & 23 deletions
@@ -322,29 +322,6 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
     if (!(versionMajor >= 1 && versionMajor <= 3))
       return failure();

-    // If both of the operands are not loads, we fallback to MMAv2
-    // otherwise the reg-smem roundtrip will tank the MMAv3 performance
-    auto comesFromLoadOrBlockArg = [](Value v) -> bool {
-      // Peel out the original cvt dot_op<..., #blocked>
-      // and any other potential cvt/trans ops
-      while (true) {
-        if (auto cvtOp = v.getDefiningOp<ConvertLayoutOp>()) {
-          v = cvtOp.getSrc();
-          continue;
-        }
-        if (auto transOp = v.getDefiningOp<TransOp>()) {
-          v = transOp.getSrc();
-          continue;
-        }
-        break;
-      }
-      // We also accept block arguments as they appear in many MLIR tests
-      // If this is problematic we can totally drop them
-      return isa<BlockArgument>(v) ||
-             (v.getDefiningOp() &&
-              isa<LoadOp, DescriptorLoadOp>(v.getDefiningOp()));
-    };
-
     bool aFromLoad = comesFromLoadOrBlockArg(dotOp.getA());
     bool bFromLoad = comesFromLoadOrBlockArg(dotOp.getB());
     auto origDotOp = dotOp;
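
The MMAv3 heuristic itself is unchanged: if neither operand comes from a load, the pattern still falls back to MMAv2 to avoid the reg-smem roundtrip; only the lambda has moved to the shared comesFromLoadOrBlockArg utility declared above.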

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 25 additions & 0 deletions
@@ -1554,4 +1554,29 @@ void replaceUsesWithLocalLoad(OpBuilder &builder, OpResult old,
     alloc.erase();
   }
 }
+
+bool comesFromLoadOrBlockArg(Value v) {
+  // Peel out the original cvt dot_op<..., #blocked>
+  // and any other potential cvt/trans ops
+  while (true) {
+    Operation *def = v.getDefiningOp();
+    if (!def)
+      break;
+    if (auto cvtOp = dyn_cast<ttg::ConvertLayoutOp>(def)) {
+      v = cvtOp.getSrc();
+      continue;
+    }
+    if (def->hasTrait<OpTrait::MemDescViewTrait>()) {
+      v = def->getOperand(0);
+      continue;
+    }
+    break;
+  }
+  // We also accept block arguments as they appear in many MLIR tests
+  // If this is problematic we can totally drop them
+  return isa<BlockArgument>(v) ||
+         (v.getDefiningOp() &&
+          isa<LoadOp, DescriptorLoadOp, DescriptorGatherOp>(v.getDefiningOp()));
+}
+
 } // namespace mlir::triton
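
Compared with the removed lambda, the shared implementation peels ops carrying MemDescViewTrait rather than TransOp specifically, and additionally counts DescriptorGatherOp as a load source.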

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp

Lines changed: 13 additions & 6 deletions
@@ -226,11 +226,17 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
     return std::nullopt;

   // Propagate defs of exp.
-  for (auto expOp : loop.getOps<math::Exp2Op>()) {
-    auto tensorTy = dyn_cast<RankedTensorType>(expOp.getType());
-    if (tensorTy && tensorTy.getNumElements() > 256) {
-      schedule.trySchedule(defaultPartition, expOp);
-      scheduleDependencies(loop, schedule, defaultPartition, expOp);
+  for (Operation &op : loop.getOps()) {
+    if (!isa<math::Exp2Op, ElementwiseInlineAsmOp>(op))
+      continue;
+    int elementCount = 0;
+    for (Type type : op.getResultTypes()) {
+      if (auto tensorTy = dyn_cast<RankedTensorType>(type))
+        elementCount += tensorTy.getNumElements();
+    }
+    if (elementCount > 256) {
+      schedule.trySchedule(defaultPartition, &op);
+      scheduleDependencies(loop, schedule, defaultPartition, &op);
     }
   }

@@ -242,7 +248,8 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
   while (userPartitions.size() < mmas.size()) {
     userPartitions.push_back(schedule.addPartition(userPartitions.size()));
   }
-  for (auto [mmaOp, userPartition] : llvm::zip(mmas, userPartitions)) {
+  for (auto [mmaOp, userPartition] :
+       llvm::reverse(llvm::zip(mmas, userPartitions))) {
     scheduleUsers(loop, schedule, userPartition, mmaOp);
   }
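
Two behavioral changes here: the >256-element propagation now applies to ElementwiseInlineAsmOp as well as math.exp2, summing elements across all tensor results instead of inspecting a single result type, and MMA user partitions are now assigned in reverse order.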

lib/Dialect/TritonNvidiaGPU/Transforms/PromoteLHSToTMem.cpp

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
       isDistributedLayoutTMemCompatible(tcGen5MMAOp, srcType, lhsMemDescType);
   Attribute newLayout = srcLayout;
   if (!layoutTmemCompatible) {
-    if (triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
+    if (!comesFromLoadOrBlockArg(src) ||
+        triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
      newLayout = getLHSTMemLayout(tcGen5MMAOp, srcType);
    } else {
      return failure();
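
With the shared helper available, the LHS is now promoted to a TMem-compatible layout whenever it does not come from a load or block argument; the ALLOW_LHS_TMEM_LAYOUT_CONVERSION environment variable is still honored to force the conversion for load-fed operands.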

python/src/gluon_ir.cc

Lines changed: 10 additions & 1 deletion
@@ -11,6 +11,7 @@

 using namespace mlir;
 namespace py = pybind11;
+namespace tt = triton;
 namespace ttg = triton::gpu;
 namespace ttng = triton::nvidia_gpu;

@@ -298,7 +299,15 @@ void init_gluon_ir(py::module &&m) {
             self.create<ttng::AsyncTMAScatterOp>(descPtr, xOffsets, yOffset,
                                                  src);
           })
-
+      .def("create_broadcast",
+           [](TritonOpBuilder &self, Value &arg, Type retTy) -> Value {
+             return self.create<tt::BroadcastOp>(retTy, arg);
+           })
+      .def(
+          "create_expand_dims",
+          [](TritonOpBuilder &self, Value &arg, int axis, Type retTy) -> Value {
+            return self.create<tt::ExpandDimsOp>(retTy, arg, axis);
+          })
       .def("create_warp_return",
           [](GluonOpBuilder &self) -> Operation * {
             return self.create<ttg::WarpReturnOp>();
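
These two bindings back gluon's expand_dims ([None, :]-style indexing) and implicit broadcasting, exercised by the new broadcast_kernel test below; the tt alias gives the builder access to the core Triton dialect ops.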

python/test/gluon/test_frontend.py

Lines changed: 181 additions & 0 deletions
@@ -1,4 +1,5 @@
 import expecttest
+from triton.runtime.jit import MockTensor
 import torch
 import pytest
 import re

@@ -600,3 +601,183 @@ def kernel():
 }
 }
 """)
+
+
+@gluon.jit
+def broadcast_kernel():
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [2, 16], [4, 1], [1, 0])
+    a = ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, layout))[None, :]
+    b = ttgl.arange(0, 16, layout=ttgl.SliceLayout(1, layout))[:, None]
+    0 + a + b
+
+
+def test_broadcast(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    h = broadcast_kernel.warmup(sanitize_overflow=False, grid=(1, ))
+    expecttest.assert_expected_inline(
+        anonymize_ir(h.asm["source"]), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @broadcast_kernel() attributes {noinline = false} {
+    %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc)
+    %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc)
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc)
+    %c0_i32_0 = arith.constant 0 : i32 loc(#loc)
+    %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc)
+    %4 = arith.addi %cst, %1 : tensor<1x16xi32, #blocked> loc(#loc)
+    %5 = tt.broadcast %4 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc)
+    %6 = tt.broadcast %3 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc)
+    %7 = arith.addi %5, %6 : tensor<16x16xi32, #blocked> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc = loc(unknown)
+""")
+
+
+@gluon.jit
+def math_kernel():
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0])
+    a = ttgl.full([16, 16], 1, ttgl.float32, layout)
+    b = ttgl.full([16, 16], 2, ttgl.float32, layout)
+    c = ttgl.full([16, 16], 4, ttgl.float32, layout)
+    d = ttgl.full([16, 16], 1, ttgl.int32, layout)
+    e = ttgl.full([16, 16], 1, ttgl.int32, layout)
+    ttgl.umulhi(d, e)
+    ttgl.exp(a)
+    ttgl.exp2(a)
+    ttgl.log(a)
+    ttgl.log2(a)
+    ttgl.cos(a)
+    ttgl.sin(a)
+    ttgl.sqrt(a)
+    ttgl.sqrt_rn(a)
+    ttgl.rsqrt(a)
+    ttgl.abs(a)
+    ttgl.fdiv(a, b)
+    ttgl.div_rn(a, b)
+    ttgl.erf(a)
+    ttgl.floor(a)
+    ttgl.ceil(a)
+    ttgl.fma(a, b, c)
+
+
+def test_math(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    h = math_kernel.warmup(sanitize_overflow=False, grid=(1, ))
+    expecttest.assert_expected_inline(
+        anonymize_ir(h.asm["source"]), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @math_kernel() attributes {noinline = false} {
+    %cst = arith.constant 1.000000e+00 : f32 loc(#loc)
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %cst_1 = arith.constant 2.000000e+00 : f32 loc(#loc)
+    %cst_2 = arith.constant dense<2.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %cst_3 = arith.constant 4.000000e+00 : f32 loc(#loc)
+    %cst_4 = arith.constant dense<4.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc)
+    %cst_5 = arith.constant dense<1> : tensor<16x16xi32, #blocked> loc(#loc)
+    %c1_i32_6 = arith.constant 1 : i32 loc(#loc)
+    %cst_7 = arith.constant dense<1> : tensor<16x16xi32, #blocked> loc(#loc)
+    %0 = tt.mulhiui %cst_5, %cst_7 : tensor<16x16xi32, #blocked> loc(#loc)
+    %1 = math.exp %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %2 = math.exp2 %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %3 = math.log %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %4 = math.log2 %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %5 = math.cos %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %6 = math.sin %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %7 = math.sqrt %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %8 = tt.precise_sqrt %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %9 = math.rsqrt %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %10 = math.absf %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %11 = arith.divf %cst_0, %cst_2 : tensor<16x16xf32, #blocked> loc(#loc)
+    %12 = tt.precise_divf %cst_0, %cst_2 : tensor<16x16xf32, #blocked> loc(#loc)
+    %13 = math.erf %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %14 = math.floor %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %15 = math.ceil %cst_0 : tensor<16x16xf32, #blocked> loc(#loc)
+    %16 = math.fma %cst_0, %cst_2, %cst_4 : tensor<16x16xf32, #blocked> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc = loc(unknown)
+""")
+
+
+@gluon.jit
+def pair_add(a0, a1, b0, b1):
+    return a0 + b0, a1 + b1
+
+
+@gluon.jit
+def reduce_kernel(out):
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0])
+    a = ttgl.full([16, 16], 1, ttgl.float32, layout)
+    b = ttgl.full([16, 16], 2, ttgl.float32, layout)
+    s0 = ttgl.sum(a, 0)
+    ttgl.static_assert(s0.type.layout == ttgl.SliceLayout(0, layout))
+    s1 = ttgl.sum(a, 1)
+    ttgl.static_assert(s1.type.layout == ttgl.SliceLayout(1, layout))
+
+    scalar = ttgl.max(s0, 0)
+    ttgl.static_assert(scalar.type == ttgl.float32)
+
+    s1 = ttgl.convert_layout(s1, s0.type.layout)
+
+    pairs = ttgl.reduce((a, b), 0, pair_add)
+    ttgl.static_assert(pairs[0].type.layout == ttgl.SliceLayout(0, layout))
+    ttgl.static_assert(pairs[1].type.layout == ttgl.SliceLayout(0, layout))
+    result = scalar + s1 + pairs[0] + pairs[1]
+    tl.store(out + ttgl.arange(0, 16, s0.type.layout), result)
+
+
+def test_reduce(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    h = reduce_kernel.warmup(MockTensor(ttgl.float32), sanitize_overflow=False, grid=(1, ))
+    expecttest.assert_expected_inline(
+        anonymize_ir(h.asm["ttgir"]), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc(unknown)
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @reduce_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
+    %cst = arith.constant dense<2.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %0 = "tt.reduce"(%cst_0) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %1 = "tt.reduce"(%cst_0) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc)
+    %2 = "tt.reduce"(%0) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.maxnumf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>) -> f32 loc(#loc)
+    %3 = ttg.convert_layout %1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %4:2 = "tt.reduce"(%cst_0, %cst) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown), %arg3: f32 loc(unknown), %arg4: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg3 : f32 loc(#loc)
+      %13 = arith.addf %arg2, %arg4 : f32 loc(#loc)
+      tt.reduce.return %12, %13 : f32, f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>, tensor<16x16xf32, #blocked>) -> (tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>) loc(#loc)
+    %5 = tt.splat %2 : f32 -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %6 = arith.addf %5, %3 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %7 = arith.addf %6, %4#0 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %8 = arith.addf %7, %4#1 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %9 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %10 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %11 = tt.addptr %10, %9 : tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    tt.store %11, %8 : tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+""")
