[AMD][BACKEND] Adjust cache modifier mappings (triton-lang#5852)

AlexAUT · web-flow · commit df66eb594d61 · 2025-02-11T09:14:56.000-08:00
Adjust the cache modifier mappings to better represent the expected caching
behavior, and rename the misleading `isBufferLoad` parameter to `isLoad`.
The `.cg` modifier should set its cache-control bits for all types of loads,
not just `BufferLoads`.
diff --git a/test/Conversion/amd/async_ops_to_llvm.mlir b/test/Conversion/amd/async_ops_to_llvm.mlir
@@ -193,25 +193,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.sha
     // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_ca]]
     %2 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = ca: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
     // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_cg:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: %[[aux_cg:.*]] = llvm.mlir.constant(3 : i32) : i32
     // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_cg]]
     %3 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cg: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
     // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_cs:.*]] = llvm.mlir.constant(3 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_cs]]
-    %5 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cs: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
-    // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_cv:.*]] = llvm.mlir.constant(9 : i32) : i32
+    // CHECK: %[[aux_cv:.*]] = llvm.mlir.constant(11 : i32) : i32
     // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_cv]]
-    %6 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cv: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
-    // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_wb:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_wb]]
-    %7 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = wb: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
-    // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_wt:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_wt]]
-    %8 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = wt: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
+    %4 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cv: tensor<32x32x!tt.ptr<f16>, #blocked> -> <32x32xf16, #shared, #smem, mutable>
     tt.return
   }
 }
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -511,7 +511,7 @@ struct AsyncCopyGlobalToLocalOpConversion
 
     Value cacheModifiers =
         b.i32_val(mlir::LLVM::AMD::getCtrlBitsForCacheModifierOnTarget(
-            op.getCache(), false, targetInfo));
+            op.getCache(), /*isLoad=*/true, targetInfo));
 
     Value llMask = adaptor.getMask();
     SmallVector<Value> maskElems;
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -455,40 +455,43 @@ getCacheModifierFlagsForPredicatedCall(LLVM::CallOp callOp) {
 // Load   | .ca |  0  |  0  | 0  |
 //        | .cg |  0  |  1  | 1  |
 //        | .cs |  0  |  1  | 1  |
-//        | .cv |  1  |  1  | x  |
+//        | .cv |  1  |  1  | 1  |
 // -------+-----+-----+-----+----+--
 // Store  | .wb |  0  |  0  | 0  |
 //        | .cg |  0  |  0  | 0  |
 //        | .cs |  0  |  1  | 1  |
-//        | .wt |  1  |  x  | x  |
+//        | .wt |  1  |  1  | 1  |
 // -------+-----+-----+-----+----+--
 // Atomic | N/A |  0  |  1  | x  | Setting sc0 returns the pre-op value
 //        | N/A |  1  |  0  | x  | Setting sc1 performs a system-scope atomic
 // -------+-----+-----+-----+----+--
 static int32_t
 getCtrlBitsForCacheModifierOnGFX_942_950(triton::CacheModifier cm,
-                                         bool isBufferLoad) {
+                                         bool isLoad) {
   const int sc0Bit = 0b1, ntBit = 0b10, sc1Bit = 0b1000;
   int32_t aux = 0;
   switch (cm) {
   case triton::CacheModifier::CA:
     aux = 0;
     break;
   case triton::CacheModifier::CG:
-    if (isBufferLoad)
+    if (isLoad)
       aux |= sc0Bit | ntBit;
     break;
   case triton::CacheModifier::CS:
     aux |= sc0Bit | ntBit;
     break;
   case triton::CacheModifier::CV:
-    aux |= sc0Bit | sc1Bit;
+    assert(isLoad);
+    aux |= sc0Bit | sc1Bit | ntBit;
     break;
   case triton::CacheModifier::WB:
+    assert(!isLoad);
     aux = 0;
     break;
   case triton::CacheModifier::WT:
-    aux |= sc1Bit;
+    assert(!isLoad);
+    aux |= sc0Bit | sc1Bit | ntBit;
     break;
   default:
     aux = 0;
@@ -521,12 +524,12 @@ static int32_t getDefaultCtrlBitsForCacheModifier(triton::CacheModifier cm) {
 // .wb: write-back, writes back data at all cache levels
 // .wt: write-through, write data directly to system memory
 int32_t getCtrlBitsForCacheModifierOnTarget(
-    triton::CacheModifier cm, bool isBufferLoad,
+    triton::CacheModifier cm, bool isLoad,
     const mlir::triton::AMD::TargetInfo &targetInfo) {
   switch (targetInfo.getGPUKind()) {
   case llvm::AMDGPU::GK_GFX942:
   case llvm::AMDGPU::GK_GFX950:
-    return getCtrlBitsForCacheModifierOnGFX_942_950(cm, isBufferLoad);
+    return getCtrlBitsForCacheModifierOnGFX_942_950(cm, isLoad);
   default:
     return getDefaultCtrlBitsForCacheModifier(cm);
   }