diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt
index 4f839b752cab..af3c4032f076 100644
--- a/cmake/llvm-hash.txt
+++ b/cmake/llvm-hash.txt
@@ -1 +1 @@
-3c709802d31b5bc5ed3af8284b40593ff39b9eec
+e12cbd8339b89563059c2bb2a312579b652560d0
diff --git a/test/Conversion/amd/async_ops_to_llvm.mlir b/test/Conversion/amd/async_ops_to_llvm.mlir
index db46da365905..c00d6c09821d 100644
--- a/test/Conversion/amd/async_ops_to_llvm.mlir
+++ b/test/Conversion/amd/async_ops_to_llvm.mlir
@@ -259,16 +259,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.sha
 
     // Each thread needs to load 1 element and we load 1 (sizePerThread) per global.load.lds
     // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_ca:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_ca]]
+    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 0
     %2 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = ca: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
     // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_cg:.*]] = llvm.mlir.constant(3 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_cg]]
+    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 3
    %3 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cg: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
     // CHECK: llvm.getelementptr
-    // CHECK: %[[aux_cv:.*]] = llvm.mlir.constant(17 : i32) : i32
-    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_cv]]
+    // CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 17
     %4 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cv: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
     tt.return
   }
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
index fa9c1f48d72a..4dd38387c170 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -639,10 +639,9 @@ struct AsyncCopyGlobalToLocalOpConversion
         (vecTy.getNumElements() * vecTy.getElementTypeBitWidth()) / 8;
     assert(llvm::isPowerOf2_32(vecBytes));
     Value vecBytesVal = b.i32_val(vecBytes);
-
-    Value cacheModifiers =
-        b.i32_val(mlir::LLVM::AMD::getCtrlBitsForCacheModifierOnTarget(
-            op.getCache(), /*isLoad=*/true, targetInfo));
+    int32_t cacheModifiers =
+        mlir::LLVM::AMD::getCtrlBitsForCacheModifierOnTarget(
+            op.getCache(), /*isLoad=*/true, targetInfo);
 
     Value llMask = adaptor.getMask();
     SmallVector<Value> maskElems;
@@ -680,7 +679,7 @@ struct AsyncCopyGlobalToLocalOpConversion
 
       auto globalLoadLdsOp = rewriter.create<ROCDL::GlobalLoadLDSOp>(
          loc, /*globalPtr=*/srcPtr, /*ldsPtr=*/coalescedShmemAddr[i],
-          /*size=*/vecBytesVal, /*offset=*/b.i32_val(0),
+          /*size=*/vecBytes, /*offset=*/0,
           /*aux=*/cacheModifiers, /*alias_scopes=*/nullptr,
           /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
       LLVM::AMD::addAsyncCopyAliasScope(globalLoadLdsOp);
@@ -695,8 +694,8 @@ struct AsyncCopyGlobalToLocalOpConversion
       rewriter.create<LLVM::CondBrOp>(loc, pred, loadBlock, afterLoad);
       rewriter.setInsertionPointToStart(loadBlock);
       auto globalLoadLdsOp = rewriter.create<ROCDL::GlobalLoadLDSOp>(
-          loc, srcPtr, coalescedShmemAddr[i], vecBytesVal,
-          /*offset=*/b.i32_val(0), cacheModifiers, nullptr, nullptr, nullptr);
+          loc, srcPtr, coalescedShmemAddr[i], vecBytes,
+          /*offset=*/0, cacheModifiers, nullptr, nullptr, nullptr);
       LLVM::AMD::addAsyncCopyAliasScope(globalLoadLdsOp);
       rewriter.create<LLVM::BrOp>(loc, afterLoad);