[AMD] Refactor FP conversion mode setting (#8351)

ravil-mobile · web-flow · commit d5f3f236550b · 2025-10-06T16:39:39.000-07:00
In the current implementation we reset the mode register every time we
perform an FP conversion to an FP8 data type. We modify the F16_OVFL flag,
which also affects clamping during conversions of the FP16 data type. In
fact, the flag should be set only once (e.g., at the beginning of a
kernel). This PR addresses this issue. It moves the manipulation of
the mode register to a dedicated function which gets initialized with an
`AMD::ISAFamily` instance. Note that the layout of bits in the mode register
may vary from architecture to architecture.
diff --git a/test/Conversion/amd/async_ops_to_llvm.mlir b/test/Conversion/amd/async_ops_to_llvm.mlir
@@ -138,7 +138,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
   tt.func public @async_commit_group(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
                                      %arg1: i32 {tt.divisibility = 16 : i32},
                                      %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
-    // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32
+    // CHECK: llvm.mlir.constant(0 : i32) : i32
     // CHECK-NEXT: llvm.return
     ttg.async_commit_group
     tt.return
diff --git a/test/Conversion/amd/minmax.mlir b/test/Conversion/amd/minmax.mlir
@@ -12,7 +12,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // GFX942: llvm.intr.maxnum
 
 // GFX950: llvm.func @min_max
-// GFX950-NEXT: llvm.intr.minimum
+// GFX950: llvm.intr.minimum
 // GFX950-NEXT: llvm.intr.maximum
   tt.func public @min_max(%arg0: f32, %arg1: f32) {
     %0 = arith.minimumf %arg0, %arg1 : f32
diff --git a/test/TritonGPU/amd/amd-conditional-barrier.mlir b/test/TritonGPU/amd/amd-conditional-barrier.mlir
@@ -4,8 +4,8 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
   tt.func @conditional_barrier() {
     // CHECK-LABEL: llvm.func @conditional_barrier
 
-    // CHECK:   %[[CMP0:.+]] = llvm.icmp "ne" %3, %1 : i32
-    // CHECK:   %[[CMP1:.+]] = llvm.icmp "eq" %3, %1 : i32
+    // CHECK:   %[[CMP0:.+]] = llvm.icmp "ne" %[[OP0:.+]], %[[OP1:.+]] : i32
+    // CHECK:   %[[CMP1:.+]] = llvm.icmp "eq" %[[OP0]], %[[OP1]] : i32
     // CHECK:   llvm.cond_br %[[CMP0]], ^bb1, ^bb2
     // CHECK: ^bb1:
     // CHECK:   rocdl.s.barrier
diff --git a/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUAttrDefs.td b/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUAttrDefs.td
@@ -34,6 +34,10 @@ class TritonAMDGPU_Attr<string name, list<Trait> traits = [],
   : AttrDef<TritonAMDGPU_Dialect, name, traits, baseCppClass> {
 }
 
+def SetFP8Clamping : TritonAMDGPU_Attr<"SetFP8Clamping"> {
+  let mnemonic = "amdgcn.set.fp8.clamping";
+}
+
 class TritonAMDGPU_I32Enum<string name, string description, list<I32EnumAttrCase> cases>
     : I32EnumAttr<name, description, cases> {
   let genSpecializedAttr = 0;
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -1,3 +1,4 @@
+#include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "TargetInfo.h"
 #include "Utility.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -237,16 +238,6 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
   assert(v.size() == 4);
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
-  // This is the location of the fp16_ovfl flag in the Mode register. It's
-  // calculated following this formula:
-  //     (mode register ID = 1) | (Offset << 6) | ((Width - 1) << 11)
-  // In this case, Offset = 23 and Width = 1.
-  // When the bit is 0/1, the conversion from fp32/fp16/bf16 to fp8/bf8 is in
-  // non-saturation/saturation mode.
-  Value fp16OVFLModeRegLoc = b.i32_val(1473);
-  LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.s.setreg", {},
-                                  {fp16OVFLModeRegLoc, b.i32_val(1)});
-
   Type v2I16Ty = vec_ty(i16_ty, 2);
   Value v2I16Vec = b.undef(v2I16Ty);
   Value scale = b.f32_val(1);
@@ -1855,6 +1846,17 @@ struct FpToFpOpConversion
       }
     }
 
+    if (dstType.isFloat() && (dstType.getIntOrFloatBitWidth() == 8)) {
+      auto func = op->getParentOfType<LLVM::LLVMFuncOp>();
+      if (func) {
+        using attrType = triton::amdgpu::SetFP8ClampingAttr;
+        auto attrName = attrType::getMnemonic();
+        if (!func->hasAttrOfType<attrType>(attrName)) {
+          func->setAttr(attrName, attrType::get(op->getContext()));
+        }
+      }
+    }
+
     inVals.resize(numElements, b.undef(typeConverter->convertType(srcType)));
     SmallVector<Value> outVals;
     if (srcType != dstType) {
@@ -2323,10 +2325,41 @@ struct PreciseSqrtOpConversion
 private:
   bool ftz;
 };
-
 } // namespace
 
 namespace mlir::triton::AMD {
+void adjustModeRegister(ModuleOp mod, const TargetInfo &targetInfo) {
+  MLIRContext *ctx = mod->getContext();
+  Location loc = mod->getLoc();
+  mlir::OpBuilder builder(ctx);
+  auto auxBuilder = TritonLLVMOpBuilder(loc, builder);
+
+  mod->walk([&](LLVM::LLVMFuncOp func) {
+    using attrType = triton::amdgpu::SetFP8ClampingAttr;
+    auto attrName = attrType::getMnemonic();
+    if (!func->hasAttrOfType<attrType>(attrName))
+      return;
+    else
+      func->removeAttr(attrName);
+
+    if (func.getBody().empty())
+      return;
+    auto &body = func.getBody().front();
+    builder.setInsertionPoint(&body.front());
+
+    // This is the location of the fp16_ovfl flag in the Mode register. It's
+    // calculated following this formula:
+    //     (mode register ID = 1) | (Offset << 6) | ((Width - 1) << 11)
+    // In this case, Offset = 23 and Width = 1.
+    // When the bit is 0/1, the conversion from fp32/fp16/bf16 to fp8/bf8 is
+    // in non-saturation/saturation mode.
+    Value fp16OVFLModeRegLoc = auxBuilder.i32_val(1473);
+    LLVM::createLLVMIntrinsicCallOp(
+        builder, loc, "llvm.amdgcn.s.setreg", {},
+        {fp16OVFLModeRegLoc, auxBuilder.i32_val(1)});
+  });
+}
+
 void populateElementwiseOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, bool ftz,
     ModuleAxisInfoAnalysis &axisInfoAnalysis, ModuleAllocation &allocation,
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/PatternTritonGPUOpToLLVM.h b/third_party/amd/lib/TritonAMDGPUToLLVM/PatternTritonGPUOpToLLVM.h
@@ -25,6 +25,12 @@ void populateElementwiseOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, bool ftz,
     ModuleAxisInfoAnalysis &axisInfoAnalysis, ModuleAllocation &allocation,
     const TargetInfo &targetInfo, PatternBenefit benefit);
+
+// Adjusts the execution mode register, which is a per-wavefront register.
+// The register controls execution of instructions - e.g., rounding modes,
+// exception handling, etc.
+void adjustModeRegister(ModuleOp mod, const TargetInfo &targetInfo);
+
 void populateLoadStoreOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                        const TargetInfo &targetInfo,
                                        RewritePatternSet &patterns,
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp
@@ -266,6 +266,7 @@ struct ConvertTritonAMDGPUToLLVM
       return signalPassFailure();
     }
 
+    AMD::adjustModeRegister(mod, targetInfo);
     fixUpLoopAnnotation(mod);
   }
 

Original file line number	Diff line number	Diff line change
`@@ -266,6 +266,7 @@ struct ConvertTritonAMDGPUToLLVM`
`266`	`266`	`return signalPassFailure();`
`267`	`267`	`}`
`268`	`268`
	`269`	`+ AMD::adjustModeRegister(mod, targetInfo);`
`269`	`270`	`fixUpLoopAnnotation(mod);`
`270`	`271`	`}`
`271`	`272`