Skip to content

Commit 1b2e5ff

Browse files
committed
rename to permlane_swap
1 parent 90e65a5 commit 1b2e5ff

File tree

5 files changed

+49
-50
lines changed

5 files changed

+49
-50
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -656,40 +656,27 @@ def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
656656
}];
657657
}
658658

659-
def AMDGPU_PermlanePerm : I32EnumAttr<"PermlanePerm",
660-
"The possible permutations for a permlane operation",
661-
[
662-
I32EnumAttrCase<"swap_16", 0>,
663-
I32EnumAttrCase<"swap_32", 1>,
664-
]> {
665-
let genSpecializedAttr = 0;
666-
let cppNamespace = "::mlir::amdgpu";
667-
}
668-
669-
def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
670-
"permlane_perm">;
671-
672-
def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]> {
673-
let summary = "AMDGPU permlane op";
659+
def AMDGPU_PermlaneSwapOp : AMDGPU_Op<"permlane_swap", [Pure, AllTypesMatch<["result", "src"]>]> {
660+
let summary = "AMDGPU permlane swap op";
674661
let description = [{
675-
High-level wrapper on `rocdl.permlane.*` variants for permutations
676-
on rows of lanes in a subgroup.
662+
High-level wrapper on `rocdl.permlane{16,32}.swap` variants for permutations
663+
on rows of lanes in a subgroup.
677664

678665
Supports arbitrary int/float/vector types, which will be repacked to i32 and
679-
one or more `rocdl.permlane.*` ops during lowering.
666+
one or more `rocdl.permlane{16,32}.swap` ops during lowering.
680667
Supported lane permutations:
681-
- Swap the data between odd and even rows of 16 lanes (`swap_16`)
682-
- Swap the data between the first 32 lanes and the last 32 lanes (`swap_32`)
668+
- Swap the data between odd and even rows of 16 lanes
669+
- Swap the data between the first 32 lanes and the last 32 lanes
683670

684671
Example:
685672
```mlir
686-
%0 = amdgpu.permlane %src swap_16 : f16
687-
%1 = amdgpu.permlane %src swap_32 { fetch_inactive = true, bound_ctrl = true } : f16
673+
%0 = amdgpu.permlane_swap %src 16 : f16
674+
%1 = amdgpu.permlane_swap %src 32 { fetch_inactive = true, bound_ctrl = true } : f16
688675
```
689676

690677
Operands:
691678
* `$src`: Vector register to permute across lanes of the subgroup.
692-
* `$kind`: The kind of permutation operation.
679+
* `$row_length`: The length of a row to permute in number of lanes (valid values are 16 and 32).
693680
* `$fetch_inactive`: Optional. Used to determine behavior of a fetch from a disabled lane.
694681
`fetch_inactive = false`: If the source lane is disabled, use `bound_ctrl` to determine the source value.
695682
`fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway (ignoring `bound_ctrl`).
@@ -701,13 +688,14 @@ def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "s
701688
Note: Lowering is only supported on gfx950 and up.
702689
}];
703690
let arguments = (ins AnyIntegerOrFloatOr1DVector:$src,
704-
AMDGPU_PermlanePermAttr:$kind,
691+
I32Attr:$row_length,
705692
DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
706693
DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl);
707694
let results = (outs AnyIntegerOrFloatOr1DVector:$result);
708695
let assemblyFormat = [{
709-
$src $kind attr-dict `:` type($result)
696+
$src $row_length attr-dict `:` type($result)
710697
}];
698+
let hasVerifier = 1;
711699
}
712700

713701
def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1877,23 +1877,23 @@ struct AMDGPUSwizzleBitModeLowering
18771877
}
18781878
};
18791879

1880-
struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
1880+
struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
18811881
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
18821882

18831883
AMDGPUPermlaneLowering(const LLVMTypeConverter &converter, Chipset chipset)
1884-
: ConvertOpToLLVMPattern<PermlaneOp>(converter), chipset(chipset) {}
1884+
: ConvertOpToLLVMPattern<PermlaneSwapOp>(converter), chipset(chipset) {}
18851885
Chipset chipset;
18861886

18871887
LogicalResult
1888-
matchAndRewrite(PermlaneOp op, OpAdaptor adaptor,
1888+
matchAndRewrite(PermlaneSwapOp op, OpAdaptor adaptor,
18891889
ConversionPatternRewriter &rewriter) const override {
18901890
if (chipset < kGfx950)
18911891
return op->emitOpError("permlane_swap is only supported on gfx950+");
18921892

18931893
Location loc = op.getLoc();
18941894
Type i32 = rewriter.getI32Type();
18951895
Value src = adaptor.getSrc();
1896-
auto kind = op.getKind();
1896+
unsigned row_length = op.getRowLength();
18971897
bool fi = op.getFetchInactive();
18981898
bool boundctrl = op.getBoundCtrl();
18991899

@@ -1905,16 +1905,15 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
19051905
Value res;
19061906
Type i32pair = LLVM::LLVMStructType::getLiteral(
19071907
rewriter.getContext(), {v.getType(), v.getType()});
1908-
switch (kind) {
1909-
case PermlanePerm::swap_16:
1908+
1909+
if (row_length == 16)
19101910
res = ROCDL::Permlane16SwapOp::create(rewriter, loc, i32pair, v, v, fi,
19111911
boundctrl);
1912-
break;
1913-
case PermlanePerm::swap_32:
1912+
else if (row_length == 32)
19141913
res = ROCDL::Permlane32SwapOp::create(rewriter, loc, i32pair, v, v, fi,
19151914
boundctrl);
1916-
break;
1917-
}
1915+
else
1916+
llvm_unreachable("unsupported row length");
19181917

19191918
Value vdstNew = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
19201919
permuted.emplace_back(vdstNew);

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,18 @@ LogicalResult DPPOp::verify() {
510510
return success();
511511
}
512512

513+
//===----------------------------------------------------------------------===//
514+
// PermlaneSwapOp
515+
//===----------------------------------------------------------------------===//
516+
/// Verifies that `row_length` is one of the two accepted values (16 or 32);
/// any other value produces an op error.
LogicalResult PermlaneSwapOp::verify() {
  switch (getRowLength()) {
  case 16:
  case 32:
    // Accepted row lengths.
    return success();
  default:
    return emitOpError("row_length attribute must either be 16 or 32.");
  }
}
524+
513525
//===----------------------------------------------------------------------===//
514526
// GatherToLDSOp
515527
//===----------------------------------------------------------------------===//

mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ func.func @test_permlane16_i32(%arg0 : i32) -> i32 {
66
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
77
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
88
// CHECK: return %[[RES]] : i32
9-
%0 = amdgpu.permlane %arg0 swap_16 : i32
9+
%0 = amdgpu.permlane_swap %arg0 16 : i32
1010
return %0 : i32
1111
}
1212

@@ -16,7 +16,7 @@ func.func @test_permlane16_i32_optional_attr(%arg0 : i32) -> i32 {
1616
// CHECK: %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], true, true : (i32, i32) -> <(i32, i32)>
1717
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
1818
// CHECK: return %[[RES]] : i32
19-
%0 = amdgpu.permlane %arg0 swap_16 { fetch_inactive = true, bound_ctrl = true } : i32
19+
%0 = amdgpu.permlane_swap %arg0 16 { fetch_inactive = true, bound_ctrl = true } : i32
2020
return %0 : i32
2121
}
2222

@@ -26,7 +26,7 @@ func.func @test_permlane32_i32(%arg0 : i32) -> i32 {
2626
// CHECK: %[[PERM:.*]] = rocdl.permlane32.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
2727
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
2828
// CHECK: return %[[RES]] : i32
29-
%0 = amdgpu.permlane %arg0 swap_32 : i32
29+
%0 = amdgpu.permlane_swap %arg0 32 : i32
3030
return %0 : i32
3131
}
3232

@@ -38,7 +38,7 @@ func.func @test_permlane16_f32(%arg0 : f32) -> f32 {
3838
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
3939
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
4040
// CHECK: return %[[RES_CAST]] : f32
41-
%0 = amdgpu.permlane %arg0 swap_16 : f32
41+
%0 = amdgpu.permlane_swap %arg0 16 : f32
4242
return %0 : f32
4343
}
4444

@@ -50,7 +50,7 @@ func.func @test_permlane32_f32(%arg0 : f32) -> f32 {
5050
// CHECK: %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
5151
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
5252
// CHECK: return %[[RES_CAST]] : f32
53-
%0 = amdgpu.permlane %arg0 swap_32 : f32
53+
%0 = amdgpu.permlane_swap %arg0 32 : f32
5454
return %0 : f32
5555
}
5656

@@ -64,7 +64,7 @@ func.func @test_permlane16_f16(%arg0 : f16) -> f16 {
6464
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
6565
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
6666
// CHECK: return %[[RES_CAST]] : f16
67-
%0 = amdgpu.permlane %arg0 swap_16 : f16
67+
%0 = amdgpu.permlane_swap %arg0 16 : f16
6868
return %0 : f16
6969
}
7070

@@ -78,7 +78,7 @@ func.func @test_permlane32_f16(%arg0 : f16) -> f16 {
7878
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
7979
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
8080
// CHECK: return %[[RES_CAST]] : f16
81-
%0 = amdgpu.permlane %arg0 swap_32 : f16
81+
%0 = amdgpu.permlane_swap %arg0 32 : f16
8282
return %0 : f16
8383
}
8484

@@ -97,7 +97,7 @@ func.func @test_permlane16_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
9797
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
9898
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
9999
// CHECK: return %[[VEC_INSERT1]] : vector<2xi32>
100-
%0 = amdgpu.permlane %arg0 swap_16 : vector<2xi32>
100+
%0 = amdgpu.permlane_swap %arg0 16 : vector<2xi32>
101101
return %0 : vector<2xi32>
102102
}
103103

@@ -116,7 +116,7 @@ func.func @test_permlane32_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
116116
// CHECK: %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
117117
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
118118
// CHECK: return %[[VEC_INSERT1]] : vector<2xi32>
119-
%0 = amdgpu.permlane %arg0 swap_32 : vector<2xi32>
119+
%0 = amdgpu.permlane_swap %arg0 32 : vector<2xi32>
120120
return %0 : vector<2xi32>
121121
}
122122

@@ -137,7 +137,7 @@ func.func @test_permlane16_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
137137
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
138138
// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
139139
// CHECK: return %[[CAST2]] : vector<4xf16>
140-
%0 = amdgpu.permlane %arg0 swap_16 : vector<4xf16>
140+
%0 = amdgpu.permlane_swap %arg0 16 : vector<4xf16>
141141
return %0 : vector<4xf16>
142142
}
143143

@@ -158,6 +158,6 @@ func.func @test_permlane32_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
158158
// CHECK: %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
159159
// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
160160
// CHECK: return %[[CAST2]] : vector<4xf16>
161-
%0 = amdgpu.permlane %arg0 swap_32 : vector<4xf16>
161+
%0 = amdgpu.permlane_swap %arg0 32 : vector<4xf16>
162162
return %0 : vector<4xf16>
163163
}

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -526,15 +526,15 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
526526

527527
// CHECK-LABEL: func @permlane16_swap
528528
func.func @permlane16_swap(%arg0 : f32) -> f32 {
529-
// CHECK: amdgpu.permlane
530-
%0 = amdgpu.permlane %arg0 swap_16 : f32
529+
// CHECK: amdgpu.permlane_swap
530+
%0 = amdgpu.permlane_swap %arg0 16 : f32
531531
func.return %0 : f32
532532
}
533533

534534
// CHECK-LABEL: func @permlane32_swap
535535
func.func @permlane32_swap(%arg0 : f32) -> f32 {
536-
// CHECK: amdgpu.permlane
537-
%0 = amdgpu.permlane %arg0 swap_32 : f32
536+
// CHECK: amdgpu.permlane_swap
537+
%0 = amdgpu.permlane_swap %arg0 32 : f32
538538
func.return %0 : f32
539539
}
540540

0 commit comments

Comments
 (0)