
Commit f737843

[AMD] Get rid of flat load/store instructions (#5137)
Flat instructions can significantly slow down workloads: the address space of their pointers is undefined, so LLVM passes cannot apply some optimizations to them. This patch removes the address-space cast, keeping each pointer's original address space so that proper loads/stores are generated.

Signed-off-by: Ilya Veselov <[email protected]>
1 parent 3c189dd commit f737843
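As context for the test changes below, here is a minimal sketch of how the effect can be observed from Python, assuming a ROCm build of Triton. The _copy kernel is illustrative and not part of this commit, and pgm.asm['amdgcn'] is assumed to expose the generated GCN assembly (mirroring pgm.asm['ptx'] used on the CUDA path of the updated test):

import torch
import triton
import triton.language as tl

@triton.jit
def _copy(dst, src, CACHE: tl.constexpr):
    offsets = tl.arange(0, 128)
    x = tl.load(src + offsets, cache_modifier=CACHE)
    tl.store(dst + offsets, x)

src = torch.randn(128, device='cuda')  # PyTorch also uses 'cuda' as the ROCm device string
dst = torch.empty_like(src)
pgm = _copy[(1,)](dst, src, CACHE='.cv')
amdgcn = pgm.asm['amdgcn']

# Before this patch, the '.cv' load lowered to a flat_load; with the original
# address space preserved, an address-space-specific load is expected instead.
assert not any('flat_load' in line for line in amdgcn.splitlines())
assert any('global_load' in line or 'buffer_load' in line
           for line in amdgcn.splitlines())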

4 files changed: +8, -11 lines

python/test/unit/language/test_core.py

Lines changed: 1 addition & 2 deletions

@@ -4090,13 +4090,12 @@ def _kernel(dst, src, CACHE: tl.constexpr):
         cv_cache_modifier_str = 'sc0 sc1'
         buffer_load_line = [line for line in amdgcn.splitlines() if "buffer_load" in line]
         global_load_line = [line for line in amdgcn.splitlines() if "global_load" in line]
-        flat_load_line = [line for line in amdgcn.splitlines() if "flat_load" in line]
         if cache == '' or cache == '.ca':
             assert cg_cache_modifier_str not in (global_load_line[0] if global_load_line else buffer_load_line[0])
         if cache == '.cg':
             assert cg_cache_modifier_str in global_load_line[0]
         if cache == '.cv':
-            assert cv_cache_modifier_str in flat_load_line[0]
+            assert cv_cache_modifier_str in global_load_line[0]
 
     if is_cuda():
         ptx = pgm.asm['ptx']

test/Conversion/amd/load_store.mlir

Lines changed: 3 additions & 3 deletions

@@ -15,10 +15,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
     %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked0>
     %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xi32, #blocked0>
     // Load 8 elements from A with two vectorized load instruction
-    // CHECK-COUNT-2: llvm.intr.masked.load {{.*}} : (!llvm.ptr, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
+    // CHECK-COUNT-2: llvm.intr.masked.load {{.*}} : (!llvm.ptr<1>, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
     %9 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr<f32>, #blocked0>
     // Load 8 elements from B with two vectorized load instruction
-    // CHECK-COUNT-2: llvm.intr.masked.load {{.*}} : (!llvm.ptr, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
+    // CHECK-COUNT-2: llvm.intr.masked.load {{.*}} : (!llvm.ptr<1>, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
     %10 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr<f32>, #blocked0>
     %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0>
     %12 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked0>
@@ -51,7 +51,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
     %105 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x32x!tt.ptr<f16>, #mma>
     %106 = tt.addptr %105, %104 : tensor<32x32x!tt.ptr<f16>, #mma>, tensor<32x32xi32, #mma>
     // Store 16 elements with four vectorized store instruction
-    // CHECK-COUNT-4: llvm.intr.masked.store {{.*}}, {{.*}}, {{.*}} {alignment = 16 : i32} : vector<4xf16>, vector<4xi1> into !llvm.ptr
+    // CHECK-COUNT-4: llvm.intr.masked.store {{.*}}, {{.*}}, {{.*}} {alignment = 16 : i32} : vector<4xf16>, vector<4xi1> into !llvm.ptr<1>
     tt.store %106, %2 : tensor<32x32x!tt.ptr<f16>, #mma>
     tt.return
   }

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 2 additions & 4 deletions

@@ -25,10 +25,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     // CHECK: llvm.cond_br
     // CHECK: llvm.atomicrmw
     // CHECK: llvm.atomicrmw
-    // CHECK: %[[ADDR1:.*]] = llvm.addrspacecast
-    // CHECK: llvm.intr.masked.store %{{.*}}, %[[ADDR1]]
-    // CHECK: %[[ADDR2:.*]] = llvm.addrspacecast
-    // CHECK: llvm.intr.masked.store %{{.*}}, %[[ADDR2]]
+    // CHECK: llvm.intr.masked.store
+    // CHECK: llvm.intr.masked.store
     %0 = tt.atomic_rmw fadd, relaxed, gpu, %arg0, %arg2, %arg1 : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
     tt.store %arg0, %0 : tensor<256x!tt.ptr<f32>, #blocked0>
     tt.return

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions

@@ -300,7 +300,7 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern<triton::LoadOp>,
       assert(wordNElems * nWords * numVecs == numElems);
 
       Value pred = mask ? maskElems[vecStart] : int_val(1, 1);
-      Value ptr = addrspacecast(ptr_ty(getContext()), ptrElems[vecStart]);
+      Value ptr = ptrElems[vecStart];
 
       Value falseVal = createZeroVector(rewriter, loc, cast<VectorType>(vecTy));
      // If we need to mask the loaded value with other elements
@@ -477,7 +477,7 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern<triton::StoreOp>,
 
       SmallVector<std::pair<Value, std::string>> asmArgs;
       Value elem = valueElems[vecStart];
-      Value ptr = addrspacecast(ptr_ty(getContext()), ptrElems[vecStart]);
+      Value ptr = ptrElems[vecStart];
 
       // Create the store val
       Value storeVal = packElementRangeIntoVector(
