@@ -6,13 +6,13 @@ func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 :
                          %arg6 : vector<16xi8>, %arg7 : vector<8xi32>, %arg8 : vector<4xi32>,
                          %arg9 : vector<16xui8>, %arg10 : vector<16xi4>, %arg11 : vector<8xi4>) {
   // CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
-  amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 : vector<16xf16>, vector<16xf16>, vector<8xf32>
+  amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {subwordOffset = 0 : i32} : vector<16xf16>, vector<16xf16>, vector<8xf32>
   // CHECK: rocdl.wmma.f32.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>
-  amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<4xf32>
+  amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg2 {subwordOffset = 0 : i32} : vector<16xf16>, vector<16xf16>, vector<4xf32>
   // CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>
-  amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg1 : vector<16xbf16>, vector<16xbf16>, vector<8xf32>
+  amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg1 {subwordOffset = 0 : i32} : vector<16xbf16>, vector<16xbf16>, vector<8xf32>
   // CHECK: rocdl.wmma.f32.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>
-  amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<4xf32>
+  amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg2 {subwordOffset = 0 : i32} : vector<16xbf16>, vector<16xbf16>, vector<4xf32>
   // CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<16xf16>) -> vector<16xf16>
   amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg0 {subwordOffset = 1 : i32} : vector<16xf16>, vector<16xf16>, vector<16xf16>
   // CHECK: rocdl.wmma.f16.16x16x16.f16{{.*}}: (vector<16xf16>, vector<16xf16>, vector<8xf16>) -> vector<8xf16>
@@ -23,13 +23,13 @@ func.func @wmma_to_rocdl(%arg0 : vector<16xf16>, %arg1 : vector<8xf32>, %arg2 :
   // CHECK: %[[raw_bf16x8:.+]] = rocdl.wmma.bf16.16x16x16.bf16{{.*}}: (vector<16xi16>, vector<16xi16>, vector<8xi16>) -> vector<8xi16>
   // CHECK-NEXT: llvm.bitcast %[[raw_bf16x8]] : vector<8xi16> to vector<8xbf16>
   amdgpu.wmma 16x16x16 %arg3 * %arg3 + %arg5 {subwordOffset = 0 : i32} : vector<16xbf16>, vector<16xbf16>, vector<8xbf16>
-  // CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (vector<4xi32>, vector<4xi32>, vector<8xi32>) -> vector<8xi32>
+  // CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}{clamp = true, signA = true, signB = true} : (vector<4xi32>, vector<4xi32>, vector<8xi32>) -> vector<8xi32>
   amdgpu.wmma 16x16x16 %arg6 * %arg6 + %arg7 {clamp} : vector<16xi8>, vector<16xi8>, vector<8xi32>
-  // CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}: (vector<4xi32>, vector<4xi32>, vector<4xi32>) -> vector<4xi32>
+  // CHECK: rocdl.wmma.i32.16x16x16.iu8{{.*}}{clamp = true} : (vector<4xi32>, vector<4xi32>, vector<4xi32>) -> vector<4xi32>
   amdgpu.wmma 16x16x16 %arg9 * %arg9 + %arg8 {unsignedA, unsignedB, clamp} : vector<16xui8>, vector<16xui8>, vector<4xi32>
-  // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (vector<2xi32>, vector<2xi32>, vector<8xi32>) -> vector<8xi32>
+  // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}{clamp = true, signA = true, signB = true} : (vector<2xi32>, vector<2xi32>, vector<8xi32>) -> vector<8xi32>
   amdgpu.wmma 16x16x16 %arg10 * %arg10 + %arg7 {clamp} : vector<16xi4>, vector<16xi4>, vector<8xi32>
-  // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}: (i32, i32, vector<4xi32>) -> vector<4xi32>
+  // CHECK: rocdl.wmma.i32.16x16x16.iu4{{.*}}{clamp = true, signA = true, signB = true} : (i32, i32, vector<4xi32>) -> vector<4xi32>
   amdgpu.wmma 16x16x16 %arg11 * %arg11 + %arg8 {clamp} : vector<8xi4>, vector<8xi4>, vector<4xi32>
 
   return