diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 860f893367203..ccb1678aef919 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -99,6 +99,8 @@ def GPU_AddressSpaceEnum : GPU_I32Enum<
 def GPU_AddressSpaceAttr :
   GPU_I32EnumAttr<"address_space", GPU_AddressSpaceEnum>;
 
+def GPU_AddressSpaceAttrArray : TypedArrayAttrBase<GPU_AddressSpaceAttr, "GPU address space array">;
+
 //===----------------------------------------------------------------------===//
 // GPU Types.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6098eb34d04d5..9d89068c72969 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1355,7 +1355,8 @@ def GPU_ShuffleOp : GPU_Op<
   ];
 }
 
-def GPU_BarrierOp : GPU_Op<"barrier"> {
+def GPU_BarrierOp : GPU_Op<"barrier">,
+    Arguments<(ins OptionalAttr<GPU_AddressSpaceAttrArray>:$address_spaces)> {
   let summary = "Synchronizes all work items of a workgroup.";
   let description = [{
     The "barrier" op synchronizes all work items of a workgroup. It is used
@@ -1371,11 +1372,25 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
     accessing the same memory can be avoided by synchronizing work items
     in-between these accesses.
 
+    The address spaces whose accesses must be made visible can be restricted
+    by attaching a list of the required address spaces. By default, all
+    address spaces are included.
+
+    ```mlir
+    // Only workgroup address space accesses are required to be visible.
+    gpu.barrier memfence [#gpu.address_space<workgroup>]
+    // No memory accesses are required to be visible.
+    gpu.barrier memfence []
+    // All memory accesses are required to be visible.
+    gpu.barrier
+    ```
+
     Either none or all work items of a workgroup need to execute this op
     in convergence.
   }];
-  let assemblyFormat = "attr-dict";
+  let assemblyFormat = "(`memfence` $address_spaces^)? attr-dict";
   let hasCanonicalizer = 1;
+  let builders = [OpBuilder<(ins)>];
 }
 
 def GPU_GPUModuleOp : GPU_Op<"module", [
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 739a34e0aa610..f9e8e397f93f2 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -116,12 +116,31 @@ struct GPUBarrierConversion final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
         lookupOrCreateSPIRVFn(moduleOp, funcName, flagTy, voidTy,
                               /*isMemNone=*/false, /*isConvergent=*/true);
 
-    // Value used by SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE`.
-    // See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`.
-    constexpr int64_t localMemFenceFlag = 1;
+    // Values used by the SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE`
+    // and `CLK_GLOBAL_MEM_FENCE`. See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`.
+    constexpr int32_t localMemFenceFlag = 1;
+    constexpr int32_t globalMemFenceFlag = 2;
+    int32_t memFenceFlag = 0;
+    std::optional<ArrayAttr> addressSpaces = adaptor.getAddressSpaces();
+    if (addressSpaces) {
+      for (Attribute attr : addressSpaces.value()) {
+        auto addressSpace = cast<gpu::AddressSpaceAttr>(attr).getValue();
+        switch (addressSpace) {
+        case gpu::AddressSpace::Global:
+          memFenceFlag = memFenceFlag | globalMemFenceFlag;
+          break;
+        case gpu::AddressSpace::Workgroup:
+          memFenceFlag = memFenceFlag | localMemFenceFlag;
+          break;
+        case gpu::AddressSpace::Private:
+          break;
+        }
+      }
+    } else {
+      memFenceFlag = localMemFenceFlag | globalMemFenceFlag;
+    }
     Location loc = op->getLoc();
-    Value flag =
-        rewriter.create<LLVM::ConstantOp>(loc, flagTy, localMemFenceFlag);
+    Value flag = rewriter.create<LLVM::ConstantOp>(loc, flagTy, memFenceFlag);
     rewriter.replaceOp(op, createSPIRVBuiltinCall(loc, rewriter, func, flag));
     return success();
   }
diff --git a/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td
index f513bb1a0a826..0fcda38631a9b 100644
--- a/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td
+++ b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td
@@ -17,6 +17,6 @@ include "mlir/IR/PatternBase.td"
 include "mlir/Dialect/GPU/IR/GPUOps.td"
 include "mlir/Dialect/LLVMIR/NVVMOps.td"
 
-def : Pat<(GPU_BarrierOp), (NVVM_Barrier0Op)>;
+def : Pat<(GPU_BarrierOp : $op $memory_fence), (NVVM_Barrier0Op)>;
 
 #endif // MLIR_CONVERSION_GPUTONVVM_TD
diff --git a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td
index 8d2f30a9a1683..d3bb774813437 100644
--- a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td
+++ b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td
@@ -17,6 +17,6 @@ include "mlir/IR/PatternBase.td"
 include "mlir/Dialect/GPU/IR/GPUOps.td"
 include "mlir/Dialect/LLVMIR/ROCDLOps.td"
 
-def : Pat<(GPU_BarrierOp), (ROCDL_BarrierOp)>;
+def : Pat<(GPU_BarrierOp : $op $memory_fence), (ROCDL_BarrierOp)>;
 
 #endif // MLIR_CONVERSION_GPUTOROCDL_TD
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 956877497d933..156d6b8fe1595 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1351,6 +1351,9 @@ void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
   results.add(eraseRedundantGpuBarrierOps);
 }
 
+void BarrierOp::build(mlir::OpBuilder &odsBuilder,
+                      mlir::OperationState &odsState) {}
+
 //===----------------------------------------------------------------------===//
 // GPUFuncOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index 910105ddf6958..4767565ea0550 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -213,14 +213,29 @@ gpu.module @barriers {
 
   // CHECK-LABEL: gpu_barrier
   func.func @gpu_barrier() {
-    // CHECK:         [[FLAGS:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) {
+    // CHECK:         [[GLOBAL_AND_LOCAL_FLAG:%.*]] = llvm.mlir.constant(3 : i32) : i32
+    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[GLOBAL_AND_LOCAL_FLAG]]) {
     // CHECK-SAME-DAG:  no_unwind
     // CHECK-SAME-DAG:  convergent
     // CHECK-SAME-DAG:  will_return
     // CHECK-NOT:       memory_effects = #llvm.memory_effects
     // CHECK-SAME:    } : (i32) -> ()
     gpu.barrier
+    // CHECK: [[GLOBAL_AND_LOCAL_FLAG2:%.*]] = llvm.mlir.constant(3 : i32) : i32
+    // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL_AND_LOCAL_FLAG2]])
+    gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+    // CHECK: [[LOCAL_FLAG:%.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK: llvm.call spir_funccc @_Z7barrierj([[LOCAL_FLAG]])
+    gpu.barrier memfence [#gpu.address_space<workgroup>]
+    // CHECK: [[GLOBAL_FLAG:%.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL_FLAG]])
+    gpu.barrier memfence [#gpu.address_space<global>]
+    // CHECK: [[NONE_FLAG:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG]])
+    gpu.barrier memfence []
+    // CHECK: [[NONE_FLAG2:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG2]])
+    gpu.barrier memfence [#gpu.address_space<private>]
     return
   }
 }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b9c0a0e79e8f2..2bba66f786f18 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -141,6 +141,12 @@ module attributes {gpu.container_module} {
       %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32
 
       "gpu.barrier"() : () -> ()
+      gpu.barrier
+      gpu.barrier memfence [#gpu.address_space<global>]
+      gpu.barrier memfence [#gpu.address_space<workgroup>]
+      gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+      gpu.barrier memfence [#gpu.address_space<private>]
+      gpu.barrier memfence []
 
       "some_op"(%bIdX, %tIdX) : (index, index) -> ()
      %42 = memref.load %arg1[%bIdX] : memref<?xf32, 1>
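
For reference, the new optional `address_spaces` attribute can also be set when building the op from C++. Below is a minimal sketch, assuming the ODS-generated builder overload that takes the attribute as an `ArrayAttr`; the helper name `buildBarriers` is illustrative and not part of the patch:

```cpp
// Illustrative sketch only; assumes the ODS-generated builders for
// gpu.barrier's optional `address_spaces` attribute.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

static void buildBarriers(OpBuilder &b, Location loc) {
  // `gpu.barrier`: the patch's zero-argument builder leaves the attribute
  // unset, so all address spaces are required to be visible.
  b.create<gpu::BarrierOp>(loc);

  // `gpu.barrier memfence [#gpu.address_space<workgroup>]`: restrict
  // visibility to workgroup memory.
  Attribute wg = gpu::AddressSpaceAttr::get(b.getContext(),
                                            gpu::AddressSpace::Workgroup);
  b.create<gpu::BarrierOp>(loc, b.getArrayAttr({wg}));

  // `gpu.barrier memfence []`: an empty list requires no visibility at all.
  b.create<gpu::BarrierOp>(loc, b.getArrayAttr({}));
}
```

The explicit zero-argument builder added in GPUDialect.cpp is what keeps existing `create<gpu::BarrierOp>(loc)` call sites compiling now that the op declares an argument.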