Commit 0a73d54

Merge commit '6294db5a12443a49d1f0604a8de08d2b4b921497'
2 parents: 8dc24ec + 6294db5

File tree

19 files changed: +192 / -148 lines

lib/Analysis/AxisInfo.cpp

Lines changed: 15 additions & 19 deletions
@@ -1079,11 +1079,10 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver,
 LogicalResult AxisInfoAnalysis::visitOperation(
     Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
     ArrayRef<dataflow::Lattice<AxisInfo> *> results) {
-  // TODO: For sure not the right way to do this
-  // but why is scf.if not initialized otherwise?
+  // If any operands are not yet ready, skip this operation for now.
   for (auto op : operands)
     if (op->getValue().getRank() == 0)
-      setToEntryState((dataflow::Lattice<AxisInfo> *)op);
+      return success();
   AxisInfo curr = visitors.apply(op, operands);
   if (curr.getRank() == 0) {
     setAllToEntryStates(results);
@@ -1112,9 +1111,11 @@ void AxisInfoAnalysis::visitForOpInductionVar(
   ProgramPoint *programPoint = getProgramPointAfter(op);
   auto *lbLattice = getLatticeElementFor(programPoint, op.getLowerBound());
   auto *stepLattice = getLatticeElementFor(programPoint, op.getStep());
-  for (auto op_iter : {lbLattice, stepLattice})
-    if (op_iter->getValue().getRank() == 0)
-      setToEntryState((dataflow::Lattice<AxisInfo> *)op_iter);
+  // If lb or step is not yet ready, skip this operation for now.
+  if (lbLattice->getValue().getRank() == 0 ||
+      stepLattice->getValue().getRank() == 0) {
+    return;
+  }

   AxisInfo::DimVectorT knownContiguity(1, 1);
   AxisInfo::DimVectorT knownDivisibility(1, 1);
@@ -1188,24 +1189,15 @@ void AxisInfo::initDimVectorFromHint(Attribute attr, DimVectorT *vec) {
       initPessimisticStateFromFunc(blockArg.getArgNumber(), fun,
                                    &knownContiguity, &knownDivisibility,
                                    &knownConstancy);
-    } else if (isa<RegionBranchOpInterface, gpu::WarpSpecializePartitionsOp>(
-                   op)) {
-      // scf::ForOp, scf::IfOp, scf::WhileOp, gpu::WarpSpecializePartitionsOp
-      // Control flow operations are initialized with "unknown" state:
-      // the maximum possible divisibility, contiguity, and constancy.
+    } else if (isa<gpu::WarpSpecializePartitionsOp>(op)) {
+      // Initialize the arguments to gpu::WarpSpecializePartitionsOp with
+      // "unknown" state: the maximum possible divisibility, contiguity, and
+      // constancy.
       knownDivisibility = DimVectorT(rank, kMaxDivisor);
       knownConstancy = DimVectorT(rank, kMaxDivisor);
       knownContiguity = DimVectorT(rank, kMaxDivisor);
     }
   } else if (Operation *op = value.getDefiningOp()) {
-    if (isa<RegionBranchOpInterface>(op)) {
-      // scf::ForOp, scf::IfOp, scf::WhileOp
-      // Control flow operations are initialized with "unknown" state:
-      // the maximum possible divisibility, contiguity, and constancy.
-      knownDivisibility = DimVectorT(rank, kMaxDivisor);
-      knownConstancy = DimVectorT(rank, kMaxDivisor);
-      knownContiguity = DimVectorT(rank, kMaxDivisor);
-    }
     // Other operations are conservatively initialized with the lowest possible
     // divisibility, contiguity, and constancy unless they have specified.
     AxisInfo::initDimVectorFromHint(op->getDiscardableAttr("tt.divisibility"),
@@ -1358,6 +1350,10 @@ void ModuleAxisInfoAnalysis::initialize(FunctionOpInterface funcOp,
   auto *axisInfoMap = getFuncData(funcOp);
   auto updateAxisInfoMap = [&](Value value) {
     auto axisInfo = analysis->getLatticeElement(value)->getValue();
+    // If we could not determine the AxisInfo for this value, assume the
+    // pessimistic state.
+    if (axisInfo.getRank() == 0)
+      axisInfo = AxisInfo::getPessimisticValueState(value);
     AxisInfo curAxisInfo;
     if (axisInfoMap->count(value)) {
       curAxisInfo = AxisInfo::join(axisInfo, axisInfoMap->lookup(value));
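
Taken together, the two halves of this change work as follows: visitOperation now defers whenever an operand lattice is still uninitialized (rank 0) instead of forcing it to the entry state, and ModuleAxisInfoAnalysis::initialize later fills in the pessimistic state for any value the solver never resolved (for example, dead code). Below is a minimal standalone C++ sketch of that defer-then-fall-back pattern, using hypothetical toy types rather than the real MLIR dataflow classes.

// Illustrative sketch only: toy stand-ins for the dataflow lattice used by
// AxisInfoAnalysis; not the real MLIR API.
#include <iostream>
#include <optional>
#include <vector>

struct ToyInfo {
  int rank = 0;          // rank == 0 means "not computed yet"
  int divisibility = 1;
};

// Visit an op: if any operand is not ready, defer (return no result) instead
// of pinning it to the pessimistic entry state, so a later revisit can refine it.
std::optional<ToyInfo> visitOp(const std::vector<ToyInfo> &operands) {
  for (const ToyInfo &in : operands)
    if (in.rank == 0)
      return std::nullopt;            // skip this op for now
  if (operands.empty())
    return ToyInfo{1, 1};
  return ToyInfo{1, operands.front().divisibility};
}

int main() {
  std::vector<ToyInfo> operands = {{0, 1}}; // operand never resolved (dead code)
  std::optional<ToyInfo> result = visitOp(operands);
  // After the fixpoint, anything still unresolved falls back to pessimistic.
  ToyInfo resolved = result.value_or(ToyInfo{1, 1});
  std::cout << "divisibility = " << resolved.divisibility << "\n";
}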

lib/Conversion/TritonInstrumentToLLVM/InstrumentationToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ Value createMemDescToI64(RewriterBase &rewriter, Location loc,
                          const LLVMTypeConverter *typeConverter,
                          ttg::MemDescType memDescTy, Value sharedMemStruct) {
   TritonLLVMOpBuilder b(loc, rewriter);
-  if (isa<ttng::TensorMemoryEncodingAttr>(memDescTy.getEncoding())) {
+  if (isa<ttng::TensorMemorySpaceAttr>(memDescTy.getMemorySpace())) {
     return b.ptrtoint(rewriter.getIntegerType(64), sharedMemStruct);
   }
   assert(isa<ttg::SharedEncodingTrait>(memDescTy.getEncoding()) &&

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 2 deletions
@@ -2505,9 +2505,9 @@ LogicalResult DotOperandEncodingAttr::verify(
     return emitError()
            << "ttg.dot_op kWidth parameter must be 4/8/16 for WMMA v2 "
               "(including packed cases for `scaled_dot`)";
-  if (parentAttr.getVersion() == 3 && !llvm::is_contained({2, 8, 16}, kWidth))
+  if (parentAttr.getVersion() == 3 && kWidth == 0)
     return emitError()
-           << "ttg.dot_op kWidth parameter must be 2/8/16 for WMMA v3";
+           << "ttg.dot_op kWidth parameter is mandatory for WMMA v3 ";
   return success();
 }

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 25 additions & 16 deletions
@@ -127,7 +127,7 @@ class LayoutRematerialization {
   }

   void cleanup();
-  void backwardRematerialization();
+  bool backwardRematerialization();
   void backwardRematerialization(ConvertLayoutOp convertOp);
   // TODO: Merge the three hoistConvert*(); functions as they are duplicate code
   void hoistConvertDotOperand();
@@ -1019,7 +1019,8 @@ LogicalResult LayoutRematerialization::getRematerializableSlice(
   return success();
 }

-void LayoutRematerialization::backwardRematerialization() {
+bool LayoutRematerialization::backwardRematerialization() {
+  bool changed = false;
   // Go through each ConvertLayoutOp.
   SmallVector<ConvertLayoutOp> convertOps;
   funcOp.walk(
@@ -1031,8 +1032,11 @@ void LayoutRematerialization::backwardRematerialization() {
       // backward slices.
       addRematValue(convertOp.getSrc(), convertOp.getType().getEncoding(),
                     convertOp.getResult());
+    } else {
+      changed = true;
     }
   }
+  return changed;
 }

 void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {
@@ -1593,12 +1597,14 @@ void LayoutRematerialization::hoistConvertIntoConditionals(
   rewriteSlice(slice, layout, convertOp, mapping);
 }

-void backwardRematerialization(ModuleOp module) {
-  module.walk([](FuncOp funcOp) {
+bool backwardRematerialization(ModuleOp module) {
+  bool changed = false;
+  module.walk([&](FuncOp funcOp) {
     LayoutRematerialization layoutRemat(funcOp);
-    layoutRemat.backwardRematerialization();
+    changed |= layoutRemat.backwardRematerialization();
     layoutRemat.cleanup();
   });
+  return changed;
 }

 void hoistConvert(ModuleOp module) {
@@ -1659,17 +1665,20 @@ class TritonGPURemoveLayoutConversionsPass

     cleanupConvertOps();

-    // 2. For remaining convert ops, try to rematerialize the slice of producer
-    // operation to avoid having to convert.
-    backwardRematerialization(m);
-    LLVM_DEBUG({
-      DBGS() << "Module after backward remat:\n";
-      m.dump();
-    });
-
-    // Cleanup dummy converts created during backward remat.
-    cleanupConvertOps();
-
+    bool changed = false;
+    do {
+      changed = false;
+      // 2. For remaining convert ops, try to rematerialize the slice of
+      // producer operation to avoid having to convert.
+      changed = backwardRematerialization(m);
+      LLVM_DEBUG({
+        DBGS() << "Module after backward remat:\n";
+        m.dump();
+      });
+
+      // Cleanup dummy converts created during backward remat.
+      cleanupConvertOps();
+    } while (changed);
     // 3. For remaining converts, try to hoist them above cast generating larger
     // size types in order to reduce the cost of the convert op.
     hoistConvert(m);
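
The driver change above turns a single backward-rematerialization step into a loop that repeats until no convert op is removed. A minimal sketch of that fixed-point shape, with hypothetical helper names standing in for the pass's functions:

// Illustrative fixed-point driver: re-run a rewrite until it reports no
// change. The helpers here are hypothetical stand-ins, not the pass's API.
#include <iostream>

static int rewritesLeft = 3;

bool runBackwardRemat() {   // returns true if it rewrote something
  if (rewritesLeft == 0)
    return false;
  --rewritesLeft;
  return true;
}

void cleanupConverts() { /* drop temporary converts created by the rewrite */ }

int main() {
  bool changed = false;
  do {
    changed = runBackwardRemat(); // e.g. backwardRematerialization(m)
    cleanupConverts();            // e.g. cleanupConvertOps()
  } while (changed);
  std::cout << "reached fixed point\n";
}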

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 6 additions & 1 deletion
@@ -3,11 +3,12 @@
 from dataclasses import dataclass

 import triton
+from triton_kernels import target_info
 from triton_kernels.target_info import get_cdna_version
 from triton_kernels.tensor import FP4
 import torch
 from .opt_flags_details import opt_flags_amd, opt_flags_nvidia, opt_flags_intel
-from triton_kernels.tensor import bitwidth
+from triton_kernels.tensor import bitwidth, get_layout


 @dataclass
@@ -297,8 +298,12 @@ def make_default_opt_flags_nvidia(
     n_sms = torch.cuda.get_device_properties(0).multi_processor_count
     tiles_per_sm = grid_size_tma / n_sms
     supports_persistent = can_use_persistent_tma and (arch is None or int(arch[2:-1]) >= 9)
+    requires_persistent = (get_layout(precision_config.act_scale) is not None or get_layout(precision_config.weight_scale) is not None) and target_info.has_native_mxfp()
     if constraints.get("is_persistent", None) is not None:
         is_persistent = constraints["is_persistent"]
+    elif requires_persistent:
+        assert supports_persistent, "persistent kernel required but not supported"
+        is_persistent = True
     else:
         has_simple_epilogue = precision_config.max_num_imprecise_acc is None
         is_persistent = supports_persistent and has_simple_epilogue and (tiles_per_sm >= 2.0 or lhs_dtype.itemsize <= 1) and out_dtype.itemsize < 4
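
The new branch sits between the explicit constraint and the existing heuristic: if either operand carries a scale layout and the target has native mxfp support, the kernel must be persistent. A small sketch of that precedence, with hypothetical names standing in for values the real code reads from constraints, precision_config, and target_info:

// Illustrative sketch of the decision order added here: an explicit
// constraint wins, then the "required" case, then the existing heuristic.
#include <cassert>
#include <iostream>
#include <optional>

bool decideIsPersistent(std::optional<bool> constraint, bool supportsPersistent,
                        bool requiresPersistent, bool heuristicPrefersIt) {
  if (constraint)
    return *constraint;        // caller pinned is_persistent explicitly
  if (requiresPersistent) {
    assert(supportsPersistent && "persistent kernel required but not supported");
    return true;
  }
  return supportsPersistent && heuristicPrefersIt;
}

int main() {
  // Scale layouts present plus native mxfp: persistent even if the heuristic says no.
  std::cout << decideIsPersistent(std::nullopt, /*supportsPersistent=*/true,
                                  /*requiresPersistent=*/true,
                                  /*heuristicPrefersIt=*/false)
            << "\n";
}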

test/Analysis/test-alignment.mlir

Lines changed: 15 additions & 0 deletions
@@ -1089,3 +1089,18 @@ tt.func public @test_inductor_for() {
   }
   tt.return
 }
+
+// -----
+
+// Verify that if an operation is statically determined to be dead, we fall back
+// to assigning it a pessimistic value, rather than skipping it entirely.
+tt.func @dead_op_pessimistic() {
+  %c5 = arith.constant dense<5> : tensor<4xi32>
+  %c7 = arith.constant dense<7> : tensor<4xi32>
+  %false = arith.constant false
+  scf.if %false {
+    // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
+    %add = arith.addi %c5, %c7 : tensor<4xi32>
+  }
+  tt.return
+}

test/Conversion/amd/async_ops_to_llvm_gfx1250.mlir

Lines changed: 0 additions & 3 deletions
@@ -81,7 +81,6 @@ module attributes {"ttg.num-ctas" = 8 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 // CHECK-LABEL: async_load_multicast_to_half_ctas
 tt.func public @async_load_multicast_to_half_ctas(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked> {tt.divisibility = dense<[16, 16]> : tensor<2xi32>, tt.contiguity = dense<[16, 16]> : tensor<2xi32>, tt.constancy = dense<[1, 1]> : tensor<2xi32>},
                                                   %arg1: !ttg.memdesc<32x32xf32, #shared, #smem, mutable>) {
-  // CHECK: llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[CTA_ID:.*]] = {{.*}}llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[NON_FREE_BITS:.*]] = llvm.mlir.constant(-7 : i32) : i32
   // CHECK: %[[SHIFT_AMOUNT:.*]] = llvm.and %[[CTA_ID]], %[[NON_FREE_BITS]]
@@ -104,7 +103,6 @@ module attributes {"ttg.num-ctas" = 16 : i32, "ttg.num-warps" = 4 : i32, ttg.sha
 tt.func public @async_load_multicast_group_of_2_strided_by_8(%arg0: tensor<32x32x!tt.ptr<f32>, #blocked> {tt.divisibility = dense<[16, 16]> : tensor<2xi32>, tt.contiguity = dense<[16, 16]> : tensor<2xi32>, tt.constancy = dense<[1, 1]> : tensor<2xi32>},
                                                              %arg1: !ttg.memdesc<32x32xf32, #shared, #smem, mutable>) {
   // Skip the first cluster id because it's emitted for address calculation
-  // CHECK: llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[CTA_ID:.*]] = {{.*}}llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[NON_FREE_BITS:.*]] = llvm.mlir.constant(-9 : i32) : i32
   // CHECK: %[[SHIFT_AMOUNT:.*]] = llvm.and %[[CTA_ID]], %[[NON_FREE_BITS]]
@@ -146,7 +144,6 @@ module attributes {"ttg.num-ctas" = 16 : i32, "ttg.num-warps" = 4 : i32, ttg.sha
 tt.func public @async_load_multi_cta_linear_layout(%arg0: tensor<32x32x!tt.ptr<f32>, #linear> {tt.divisibility = dense<[16, 16]> : tensor<2xi32>, tt.contiguity = dense<[16, 16]> : tensor<2xi32>, tt.constancy = dense<[1, 1]> : tensor<2xi32>},
                                                    %arg1: !ttg.memdesc<32x32xf32, #shared, #smem, mutable>) {
   // Skip the first cluster id because it's emitted for address calculation
-  // CHECK: llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[CTA_ID:.*]] = {{.*}}llvm.amdgcn.cluster.workgroup.id.x
   // CHECK: %[[NON_FREE_BITS:.*]] = llvm.mlir.constant(-9 : i32) : i32
   // CHECK: %[[SHIFT_AMOUNT:.*]] = llvm.and %[[CTA_ID]], %[[NON_FREE_BITS]]

test/Conversion/amd/math-denorm-handling.mlir

Lines changed: 14 additions & 16 deletions
@@ -64,22 +64,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   tt.func public @test_sqrt_rn_f32(%arg0: tensor<64xf32, #blocked>) {
-    // LLVM_FTZ-LABEL: test_sqrt_rn_f32
-    // LLVM_FTZ: llvm.amdgcn.rsq.f32
-    // LLVM_FTZ: llvm.fmul
-    // LLVM_FTZ: llvm.fmul
-    // LLVM_FTZ: llvm.fneg
-    // LLVM_FTZ: llvm.intr.fma
-    // LLVM_FTZ-NEXT: llvm.intr.fma
-    // LLVM_FTZ-NEXT: llvm.intr.fma
-    // LLVM_FTZ-NEXT: llvm.fneg
-    // LLVM_FTZ-NEXT: llvm.intr.fma
-    // LLVM_FTZ-NEXT: llvm.intr.fma
-    // LLVM_FTZ-NEXT: llvm.intr.is.fpclass
-    // LLVM_FTZ-NEXT: llvm.select
-    //
-    // LLVM_NO_FTZ-LABEL: test_sqrt_rn_f32
-    // LLVM_NO_FTZ: llvm.intr.sqrt
+    // COMMON-LABEL: test_sqrt_rn_f32
+    // COMMON: llvm.intr.sqrt
     %0 = tt.precise_sqrt %arg0 : tensor<64xf32, #blocked>
     tt.return
   }
@@ -96,3 +82,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_divf_rn_f32(%arg0: tensor<64xf32, #blocked>, %arg1: tensor<64xf32, #blocked>) {
+    // COMMON-LABEL: test_divf_rn_f32
+    // COMMON: llvm.fdiv
+    %0 = tt.precise_divf %arg0, %arg1 : tensor<64xf32, #blocked>
+    tt.return
+  }
+}

test/Conversion/amd/tritongpu_wmma_dot_scaled_to_llvm.mlir

Lines changed: 26 additions & 0 deletions
@@ -200,3 +200,29 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[16, 0], [32, 0]], block = []}>
+#linear1 = #ttg.linear<{register = [[0, 1], [0, 2], [16, 0], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp = [[0, 0], [0, 0]], block = []}>
+#mma = #ttg.amd_wmma<{version = 3, isTranspose = true, warpsPerCTA = [4, 1], instrShape=[16, 16, 128]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1250", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: wmma_scaled_dot_fp8_chained
+  tt.func @wmma_scaled_dot_fp8_chained(%arg0: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, %arg2: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, %arg3: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, %out0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
+    %scale0 = arith.constant dense<127> : tensor<128x4xi8, #linear>
+    %scale1 = arith.constant dense<127> : tensor<128x4xi8, #linear1>
+    // CHECK-COUNT-16: llvm.call_intrinsic "llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4"{{.*}} : (i32, vector<16xi32>, i32, vector<16xi32>, i16, vector<8xf32>, i32, i32, i32, i32, i32, i32, i1, i1) -> vector<8xf32>
+    %mm0 = tt.dot_scaled %arg0 scale %scale0, %arg2 scale %scale1, %cst lhs = e4m3 rhs = e4m3 {fastMath = false} : tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, tensor<128x4xi8, #linear> * tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, tensor<128x4xi8, #linear1> -> tensor<128x128xf32, #mma>
+    // CHECK-NOT: rocdl.ds_swizzle
+    // CHECK-NOT: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+    %op0 = ttg.convert_layout %mm0 : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %op1 = tt.fp_to_fp %op0, rounding = rtne : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> -> tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    // CHECK-COUNT-16: llvm.call_intrinsic "llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4"{{.*}} : (i32, vector<16xi32>, i32, vector<16xi32>, i16, vector<8xf32>, i32, i32, i32, i32, i32, i32, i1, i1) -> vector<8xf32>
+    %mm1 = tt.dot_scaled %op1 scale %scale0, %arg3 scale %scale1, %cst lhs = e4m3 rhs = e4m3 {fastMath = false} : tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>, tensor<128x4xi8, #linear> * tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>, tensor<128x4xi8, #linear1> -> tensor<128x128xf32, #mma>
+    %ptr0 = tt.splat %out0 : !tt.ptr<f32> -> tensor<128x128x!tt.ptr<f32>, #mma>
+    tt.store %ptr0, %mm1 : tensor<128x128x!tt.ptr<f32>, #mma>
+    tt.return
+  }
+}
