diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index eda129fb59ded..e95e27bfa5ad3 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -360,14 +360,14 @@ def cuf_SharedMemoryOp
   let arguments = (ins TypeAttr:$in_type, OptionalAttr<StrAttr>:$uniq_name,
                        OptionalAttr<StrAttr>:$bindc_name,
                        Variadic<AnyIntegerType>:$typeparams,
                        Variadic<AnyIntegerType>:$shape,
-                       OptionalAttr<I32Attr>:$offset // offset in bytes from the shared memory
-                                                     // base address.
+                       Optional<AnyIntegerType>:$offset // offset in bytes from the shared memory
+                                                        // base address.
   );
 
   let results = (outs fir_ReferenceType:$ptr);
 
   let assemblyFormat = [{
-    $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
+    (`[` $offset^ `:` type($offset) `]`)? $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
       (`,` $shape^ `:` type($shape) )? attr-dict `->` qualified(type($ptr))
   }];
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 3c7af9fc8a7d8..957e4c01fb4a1 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -315,7 +315,7 @@ void cuf::SharedMemoryOp::build(
       bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
   build(builder, result, wrapAllocaResultType(inType),
         mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
-        mlir::IntegerAttr{});
+        /*offset=*/mlir::Value{});
   result.addAttributes(attributes);
 }
 
diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
index 5c6d1233c3ed3..aec3ea294ac6c 100644
--- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -57,6 +57,7 @@ struct CUFComputeSharedMemoryOffsetsAndSize
     auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
     mlir::Type i8Ty = builder.getI8Type();
+    mlir::Type i32Ty = builder.getI32Type();
     for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
       unsigned nbDynamicSharedVariables = 0;
       unsigned nbStaticSharedVariables = 0;
@@ -68,6 +69,8 @@ struct CUFComputeSharedMemoryOffsetsAndSize
       // are static. If this is dynamic shared memory, then only the alignment
      // is computed.
       for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
+        mlir::Location loc = sharedOp.getLoc();
+        builder.setInsertionPoint(sharedOp);
         if (fir::hasDynamicSize(sharedOp.getInType())) {
           mlir::Type ty = sharedOp.getInType();
           // getTypeSizeAndAlignmentOrCrash will crash trying to compute the
@@ -77,14 +80,17 @@ struct CUFComputeSharedMemoryOffsetsAndSize
             ty = seqTy.getEleTy();
           unsigned short align = dl->getTypeABIAlignment(ty);
           ++nbDynamicSharedVariables;
-          sharedOp.setOffset(0);
+          mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
+          sharedOp.getOffsetMutable().assign(zero);
           alignment = std::max(alignment, align);
           continue;
         }
         auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
             sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
         ++nbStaticSharedVariables;
-        sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
+        mlir::Value offset = builder.createIntegerConstant(
+            loc, i32Ty, llvm::alignTo(sharedMemSize, align));
+        sharedOp.getOffsetMutable().assign(offset);
         sharedMemSize =
             llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
         alignment = std::max(alignment, align);
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index b54332b6694c4..74260c5b5c2a3 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -232,8 +232,7 @@ struct CUFSharedMemoryOpConversion
         loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()),
         sharedGlobalAddr);
     mlir::Type baseType = castPtr->getResultTypes().front();
-    llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {
-        static_cast<std::int32_t>(*op.getOffset())};
+    llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {op.getOffset()};
     mlir::Value shmemPtr = rewriter.create<mlir::LLVM::GEPOp>(
         loc, baseType, rewriter.getI8Type(), castPtr, gepArgs);
     rewriter.replaceOp(op, {shmemPtr});
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
index b3ea7dfc89cc7..1eea75c802204 100644
--- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -14,7 +14,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry
-// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
+// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
 // CHECK: gpu.return
 // CHECK: }
 // CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
@@ -43,12 +43,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry
-// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
-// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
-// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
+// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
"_QFshared_staticEc"} -> !fir.ref +// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref +// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref +// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref // CHECK: gpu.return // CHECK: } // CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<28xi8> diff --git a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir index 478ca92b63b60..26479d1cdd94f 100644 --- a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir +++ b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir @@ -3,8 +3,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { gpu.module @cuda_device_mod { llvm.func @_QPshared_static() { - %0 = cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref - %1 = cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref + %c0 = arith.constant 0 : i32 + %0 = cuf.shared_memory [%c0 : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref + %c4 = arith.constant 4 : i32 + %1 = cuf.shared_memory [%c4 : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref llvm.return } llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8> @@ -14,7 +16,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry // CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast %[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr -// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][0] : (!llvm.ptr) -> !llvm.ptr, i8 +// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8 // CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3> // CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr -// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][4] : (!llvm.ptr) -> !llvm.ptr, i8 +// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8