
Commit 363ee9f

Merge OpenAI Triton commit 6fce184 (#5321)
This PR changes the Triton base from b5fea1e to 6fce184 (Oct 14). Pass rate: 94.11%.
2 parents 16e2a59 + f8b0466 commit 363ee9f

File tree

35 files changed: +2947 -492 lines


lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 31 additions & 0 deletions
@@ -444,6 +444,37 @@ LogicalResult Fp4ToFpOp::verifyFp4ToFp(mlir::Operation *op,
              << ", dst=" << resShape[i] << ", axis=" << axis << ")";
     }
   }
+  if (bool(resTy.getEncoding()) != bool(srcTy.getEncoding()))
+    return op->emitError()
+           << "source and result must both have an encoding, or neither";
+  if (!resTy.getEncoding()) {
+    return success();
+  }
+  auto srcLl = toLinearLayout(srcTy);
+  auto resLl = toLinearLayout(resTy);
+  auto *ctx = srcTy.getContext();
+  auto regDim = StringAttr::get(ctx, "register");
+  auto outDims = standardOutDimNames(ctx, rank);
+
+  // We use backward inference here as it is strictly more general.
+  Attribute inferSrc;
+  auto dialect =
+      resTy.getEncoding()
+          .getDialect()
+          .getRegisteredInterface<triton::DialectInferLayoutInterface>();
+  assert(dialect);
+  if (failed(dialect->inferFp4ToFpOpEncoding(
+          resTy.getShape(), axis, resTy.getEncoding(), inferSrc,
+          /*fwdInference*/ false, std::nullopt))) {
+    return op->emitError() << "failed to infer encoding";
+  }
+  if (!areLayoutsEquivalent(srcTy.getShape(),
+                            cast<LayoutEncodingTrait>(inferSrc),
+                            cast<LayoutEncodingTrait>(srcTy.getEncoding())))
+    return op->emitError()
+           << "Src and Dst encodings are not compatible:\n"
+           << toLinearLayout(srcTy.getShape(), inferSrc).toString() << "\n"
+           << srcLl.toString();
   return success();
 }

lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp

Lines changed: 32 additions & 0 deletions
@@ -40,6 +40,36 @@ static bool willIncreaseRegisterPressure(Operation *op) {
   return false;
 }
 
+// Return true if it has side effects that are either unknown or writes.
+static bool hasWriteSideEffect(Operation *op) {
+  auto effects = getEffectsRecursively(op);
+  if (!effects)
+    return false;
+  return llvm::any_of(*effects, [](MemoryEffects::EffectInstance effect) {
+    return !isa<MemoryEffects::Read, MemoryEffects::Allocate,
+                MemoryEffects::Free>(effect.getEffect());
+  });
+}
+
+// Return true if there is a write side effect on any path between start and
+// end ops. This assumes start dominates end.
+static bool crossWriteSideEffectingOp(Operation *start, Operation *end) {
+  auto ancestor = start->getBlock()->findAncestorOpInBlock(*end);
+  // Couldn't find an ancestor in the same block, conservatively assume true.
+  if (!ancestor)
+    return true;
+  Operation *nextOp = start->getNextNode();
+  while (nextOp) {
+    if (hasWriteSideEffect(nextOp))
+      return true;
+    if (nextOp == ancestor)
+      return false;
+    nextOp = nextOp->getNextNode();
+  }
+  assert(false && "op doesn't dominate other");
+  return true;
+}
+
 class TritonGPUReorderInstructionsPass
     : public impl::TritonGPUReorderInstructionsBase<
           TritonGPUReorderInstructionsPass> {
@@ -135,6 +165,8 @@ class TritonGPUReorderInstructionsPass
       // after the conversion to OpIdx=0.
       if (!dom.dominates(op.getOperation(), AOp.getOperation()))
         return;
+      if (crossWriteSideEffectingOp(op, AOp))
+        return;
       moveAfter(op, AOp);
     });
     return;

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 1 addition & 1 deletion
@@ -431,7 +431,7 @@ static Attribute inferDstEncoding(triton::gpu::Fp4ToFpOp op, Attribute srcEnc) {
 
 static Attribute inferSrcEncoding(triton::gpu::Fp4ToFpOp op, Attribute dstEnc) {
   Attribute srcEnc;
-  auto shape = op.getSrc().getType().getShape();
+  auto shape = op.getType().getShape();
   if (succeeded(
           dstEnc.getDialect()
               .getRegisteredInterface<triton::DialectInferLayoutInterface>()

python/src/llvm.cc

Lines changed: 6 additions & 5 deletions
@@ -47,8 +47,8 @@ std::unique_ptr<TargetMachine>
 createTargetMachine(llvm::Module *module, std::string proc,
                     bool enable_fp_fusion, const std::string &features) {
   std::string error;
-  auto target = llvm::TargetRegistry::lookupTarget(
-      module->getTargetTriple().str(), error);
+  auto target =
+      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
   llvm::TargetOptions opt;
   bool disableLLVMOpt = mlir::triton::tools::getBoolEnv("DISABLE_LLVM_OPT");
   if (enable_fp_fusion)
@@ -278,15 +278,16 @@ void init_triton_llvm(py::module &&m) {
          const std::string proc,
          const std::string features) {
         std::string error;
-        auto target = llvm::TargetRegistry::lookupTarget(triple, error);
+        llvm::Triple targetTriple(triple);
+        auto target = llvm::TargetRegistry::lookupTarget(targetTriple, error);
         if (!target) {
           throw std::runtime_error("target lookup error: " + error);
         }
         llvm::TargetOptions opt;
         // Target machine is only used to create the data layout.
         std::unique_ptr<llvm::TargetMachine> machine{target->createTargetMachine(
-            llvm::Triple(triple), proc, features, opt, llvm::Reloc::PIC_,
-            std::nullopt, llvm::CodeGenOptLevel::None)};
+            targetTriple, proc, features, opt, llvm::Reloc::PIC_, std::nullopt,
+            llvm::CodeGenOptLevel::None)};
         // set data layout
         mod->setDataLayout(machine->createDataLayout());
       });

python/test/gluon/test_core.py

Lines changed: 1 addition & 1 deletion
@@ -1346,7 +1346,7 @@ def fp8e8m0_to_float32(scale):
     return scale
 
 
-@pytest.mark.skipif(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tcgen05_mma_scaled_minimal():
     M = 128
     N = 128
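
The marker change above swaps skipif for xfail(..., run=False): off Blackwell the test is still never executed, but it is reported as an expected failure ("x") rather than a skip ("s"), and it can still be forced to run with pytest's --runxfail. A minimal sketch of the two markers side by side, with a hypothetical has_hw() guard standing in for is_blackwell():

# Sketch of the marker semantics; has_hw() is a hypothetical stand-in for the
# repository's is_blackwell() check.
import pytest

def has_hw() -> bool:
    return False

# Reported as skipped ("s") when the condition holds; the body never runs.
@pytest.mark.skipif(not has_hw(), reason="Requires Blackwell")
def test_skipif_variant():
    assert True

# Also not executed when the condition holds (run=False), but reported as an
# expected failure ("x"); `pytest --runxfail` ignores the mark and runs it.
@pytest.mark.xfail(not has_hw(), reason="Requires Blackwell", run=False)
def test_xfail_variant():
    assert True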

python/test/gluon/test_frontend.py

Lines changed: 49 additions & 0 deletions
@@ -2445,6 +2445,29 @@ def kernel():
 """)
 
 
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
+def test_amd_mfma_scaled_none(target):
+
+    @gluon.jit
+    def kernel():
+        mfma_layout: ttgl.constexpr = ttgl.amd.AMDMFMALayout(4, [16, 16, 128], True, [1, 1])
+        scale_layout: ttgl.constexpr = ttgl.DistributedLinearLayout([],
+                                                                    [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]],
+                                                                    [], [], [16, 4])
+
+        a = ttgl.full([16, 64], 0x11, ttgl.uint8, ttgl.DotOperandLayout(0, mfma_layout, 16))
+        b = ttgl.full([64, 16], 0x22, ttgl.uint8, ttgl.DotOperandLayout(1, mfma_layout, 16))
+
+        b_scale = ttgl.full([16, 4], 0x01, ttgl.uint8, scale_layout)
+        acc = ttgl.full([16, 16], 0, ttgl.float32, mfma_layout)
+        ttgl.amd.cdna4.mfma_scaled(a, None, 'e2m1', b, b_scale, 'e2m1', acc)
+
+    with pytest.raises(CompilationError) as e:
+        run_parser(kernel, target=target)
+
+    assert "Scales must not be None" in str(e.value)
+
+
 @pytest.mark.parametrize("target", [HIP_TARGET_GFX1250])
 def test_amd_wmma_scaled(target):
@@ -2497,6 +2520,32 @@ def kernel():
 """)
 
 
+@pytest.mark.parametrize("target", [HIP_TARGET_GFX1250])
+def test_amd_wmma_scaled_none(target):
+
+    @gluon.jit
+    def kernel():
+        wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(3, True, [1, 1], [16, 16, 128])
+        wmma_layout_packed: ttgl.constexpr = ttgl.amd.AMDWMMALayout(3, True, [1, 1], [16, 16, 64])
+        scale_layout: ttgl.constexpr = ttgl.DistributedLinearLayout([[0, 1], [0, 2]],
+                                                                    [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], [], [],
+                                                                    [16, 4])
+        a_layout: ttgl.constexpr = ttgl.DotOperandLayout(0, wmma_layout_packed, 16)
+        b_layout: ttgl.constexpr = ttgl.DotOperandLayout(1, wmma_layout_packed, 16)
+
+        a = ttgl.full([16, 64], 0x11, ttgl.uint8, a_layout)
+        b = ttgl.full([64, 16], 0x22, ttgl.uint8, b_layout)
+        b_scale = ttgl.full([16, 4], 0x01, ttgl.uint8, scale_layout)
+        acc = ttgl.full([16, 16], 0, ttgl.float32, wmma_layout)
+
+        ttgl.amd.gfx1250.wmma_scaled(a, None, 'e2m1', b, b_scale, 'e2m1', acc)
+
+    with pytest.raises(CompilationError) as e:
+        run_parser(kernel, target=target)
+
+    assert "Scales must not be None" in str(e.value)
+
+
 @gluon.jit
 def padded_shared_layout_kernel():
     shape: ttgl.constexpr = [64, 64]
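
Both new tests build their scale operand with a DistributedLinearLayout whose positional arguments are easiest to read when named. The sketch below re-states the CDNA4 scale layout from the first test with hypothetical variable names; the argument order (register, lane, warp, block bases, then shape) is inferred from the test code above, not confirmed by this diff. Each lane-basis vector toggles one bit of the lane index, so six bases span a 64-lane CDNA4 wavefront over the [16, 4] scale tensor.

# A minimal sketch, assuming the Gluon import path used by these tests and the
# positional argument order seen above; the variable names are illustrative only.
from triton.experimental.gluon import language as ttgl

reg_bases = []                       # no register bases: one scale element per lane
lane_bases = [[1, 0], [2, 0], [4, 0],
              [8, 0], [0, 1], [0, 2]]  # 6 bases -> 2**6 = 64 lanes (CDNA4 wavefront)
warp_bases = []                      # a single warp
block_bases = []                     # a single CTA
shape = [16, 4]                      # shape of the scale tensor being distributed

scale_layout = ttgl.DistributedLinearLayout(reg_bases, lane_bases, warp_bases,
                                            block_bases, shape)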
