diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index f946bb731e2ca..8731a6fca9e57 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
     /// Sets the targets of the module.
     void setTargets(ArrayRef<TargetAttr> targets);
   }];
-  
+
   let hasVerifier = 1;
 }
 
@@ -3212,4 +3212,52 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
   }];
 }
 
+def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
+    "a lane to broadcast from",
+    [
+      I32EnumAttrCase<"first_active_lane", 0>,
+      I32EnumAttrCase<"any_lane", 1>,
+      I32EnumAttrCase<"specific_lane", 2>
+    ]>{
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::gpu";
+}
+def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast_type">;
+
+def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
+    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+     DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+    ElementwiseMappable.traits>,
+    Arguments<(ins AnyType:$src,
+                   Optional<I32>:$lane,
+                   GPU_BroadcastTypeAttr:$broadcast_type)> {
+  let summary = "Broadcasts a value from a specific lane across the subgroup";
+  let description = [{
+    Broadcasts a value from one lane to all active lanes in a subgroup. The
+    result is guaranteed to be uniform across the active lanes in the subgroup.
+
+    The possible broadcast types are:
+
+    * `first_active_lane` - broadcasts the value from the first active lane
+      in the subgroup.
+    * `specific_lane` - broadcasts from the specified lane. The lane index
+      must be uniform and within the subgroup size. The result is poison if
+      the lane index is invalid, not subgroup-uniform, or if the source lane
+      is not active.
+    * `any_lane` - broadcasts the value from any lane of the subgroup,
+      assuming the input is already subgroup-uniform. The result is poison if
+      the input is not uniform. This is useful to convey uniformity to the
+      compiler and thus enable more optimizations. It also allows more
+      speculation opportunities than `first_active_lane`, since
+      `first_active_lane` results can depend on the set of active lanes,
+      which may change when speculating across control flow.
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = [{
+    $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // GPU_OPS
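For reference, the op's printed form, as exercised by the round-trip test in
ops.mlir further down (a sketch; `%src` and `%lane` are placeholder names, and
only `specific_lane` takes the optional lane operand):

  %0 = gpu.subgroup_broadcast %src, first_active_lane : f32
  %1 = gpu.subgroup_broadcast %src, any_lane : f32
  %2 = gpu.subgroup_broadcast %src, specific_lane %lane : f32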
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d22364e1ef441..e9534bfc68cc1 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -160,6 +160,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
   const amdgpu::Chipset chipset;
 };
 
+struct GPUSubgroupBroadcastOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
+      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
+                                                     adaptor.getLane());
+    } else { // first_active_lane or any_lane
+      // any_lane is lowered to readfirstlane too, to force the value into a
+      // scalar register.
+      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
+                                                          src);
+    }
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
 
@@ -453,7 +474,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
   // TODO: Add alignment for workgroup memory
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
-  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
+               GPUSubgroupBroadcastOpToROCDL>(converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 2503ccb6a2cfe..5c8bdef813c30 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2511,6 +2511,45 @@ bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
       verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_SubgroupBroadcastOp
+//===----------------------------------------------------------------------===//
+
+void gpu::SubgroupBroadcastOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_active_lane:
+    // Cannot speculate a first_active_lane broadcast, because speculating it
+    // across control flow can change the active lanes.
+    return Speculation::NotSpeculatable;
+  case BroadcastType::any_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::specific_lane:
+    return Speculation::Speculatable;
+  }
+}
+
+LogicalResult gpu::SubgroupBroadcastOp::verify() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_active_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::any_lane:
+    if (getLane())
+      return emitOpError()
+             << "lane can only be specified for `specific_lane` broadcast";
+    return success();
+  case BroadcastType::specific_lane:
+    if (!getLane())
+      return emitOpError()
+             << "lane must be specified for `specific_lane` broadcast";
+    return success();
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
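For illustration, IR that the verifier above rejects (hypothetical snippets;
`%src` and `%lane` are placeholders):

  // error: lane can only be specified for `specific_lane` broadcast
  %0 = gpu.subgroup_broadcast %src, any_lane %lane : f32

  // error: lane must be specified for `specific_lane` broadcast
  %1 = gpu.subgroup_broadcast %src, specific_lane : f32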
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2b6adffc81f72..a7b021f3c94a2 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -701,7 +701,7 @@ gpu.module @test_module {
     // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
-    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 
+    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
     // *** UP mode shuffle ***
     // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
     // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
@@ -776,3 +776,19 @@ gpu.module @test_module {
     func.return %bDimX : index
   }
 }
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @broadcast
+// CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
+func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
+  %0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
+  %1 = gpu.subgroup_broadcast %arg0, any_lane : index
+  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
+  func.return %0, %1, %2 : index, index, index
+}
+}
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
new file mode 100644
index 0000000000000..ea32d62756c35
--- /dev/null
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+
+// CHECK-LABEL: func @broadcast_hoisting
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
+func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+// `any_lane` and `specific_lane` can be speculated across the control flow, but
+// `first_active_lane` cannot as active lanes can change.
+// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
+// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
+// CHECK: scf.for
+// CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+  scf.for %i = %c0 to %arg2 step %c1 {
+    %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
+    %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
+    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+  }
+  func.return
+}
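A hypothetical sketch (not part of the patch) of why `first_active_lane` stays
pinned under divergent control flow, complementing the uniform loop above:

  scf.if %cond {
    // Reads the first lane for which %cond is set. Hoisted above the
    // `scf.if`, it would instead read the first active lane of the whole
    // subgroup, which may hold a different value.
    %v = gpu.subgroup_broadcast %x, first_active_lane : f32
    func.call @use(%v) : (f32) -> ()
  }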
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 1613f83b17bde..2e92db0f342aa 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
     }
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @broadcast
+func.func @broadcast(%idx: i32) {
+  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+  %1 = gpu.subgroup_broadcast %0, first_active_lane : index
+  %2 = gpu.subgroup_broadcast %0, any_lane : index
+  %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
+
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+
+  %4 = test.reflect_bounds %1 : index
+  %5 = test.reflect_bounds %2 : index
+  %6 = test.reflect_bounds %3 : index
+  return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 9cc0bf8f41d5a..cd889f8025e6f 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
     // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
     // CHECK-NEXT: gpu.yield %{{.*}} : f32
     // CHECK-NEXT: } : (f32) -> f32
-    %sum2 = gpu.all_reduce %one { 
+    %sum2 = gpu.all_reduce %one {
    ^bb(%lhs : f32, %rhs : f32):
      %tmp = arith.addf %lhs, %rhs : f32
      gpu.yield %tmp : f32
@@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
      %1 = arith.cmpi slt, %arg0, %arg0 : i32
      scf.if %1 {
        gpu.printf ", "
-      } 
+      }
      gpu.return
    }
 
@@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
   }
   return %2 : vector<4xi32>
 }
+
+// CHECK-LABEL: func @subgroup_broadcast
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+  // CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
+  %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
+  // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
+  %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
+  // CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
+  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+  func.return %0, %1, %2 : f32, f32, f32
+}