Skip to content

Commit 02f5d98

Browse files
PR review round 3
Signed-off-by: Muzammiluddin Syed <[email protected]>
1 parent 846c389 commit 02f5d98

File tree

5 files changed

+78
-64
lines changed

5 files changed

+78
-64
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -688,10 +688,9 @@ def MFMAOutTypes : AnyTypeOf<[F64,
688688
VectorOfLengthAndType<[4, 16, 32], [I32]>,
689689
VectorOfLengthAndType<[4], [F64]>]>;
690690
// scaled_mfma
691-
def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>,
692-
VectorOfLengthAndType<[8, 32], [F8E5M2, F8E4M3FN]>,
693-
VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
694-
def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16, 32], [F32]>]>;
691+
def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN]>,
692+
VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
693+
def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>;
695694
// wmma
696695
def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<
697696
[4, 8, 16],
@@ -847,8 +846,9 @@ def AMDGPU_ScaledMFMAOp :
847846
ScaledMFMAOutTypes:$destC,
848847
AnyTypeOf<[I8, FixedVectorOfLengthAndType<[4], [I8]>]>:$scalesA,
849848
AnyTypeOf<[I8, FixedVectorOfLengthAndType<[4], [I8]>]>:$scalesB,
850-
I32Attr:$scalesIdxA,
851-
I32Attr:$scalesIdxB)>,
849+
ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$scalesIdxA,
850+
ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$scalesIdxB
851+
)>,
852852
Results<(outs ScaledMFMAOutTypes: $destD)> {
853853
let summary = "MLIR wrapper for CDNA scaled mfma instructions";
854854
let description = [{
@@ -879,8 +879,7 @@ def AMDGPU_ScaledMFMAOp :
879879
let assemblyFormat = [{
880880
`(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
881881
attr-dict
882-
`:` type($sourceA) `,` type($scalesA) `,` type($sourceB) `,` type($scalesB) `,` type($destC)
882+
`:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC)
883883
}];
884-
let hasVerifier = 1;
885884
}
886885
#endif // AMDGPU

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,25 @@ static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
529529
return input;
530530
}
531531

532+
/// Converts the scaled MFMA operands, `scalesA` and `scalesB`, from MLIR AMDGPU
533+
/// dialect convention to ROCDL and LLVM AMDGPU intrinsics convention.
534+
///
535+
/// Specifically:
536+
/// 1. If `input` is an i8 value, zero-extend it to i32
537+
/// 2. If `input` is a vector of four i8 elements, bitcast it to i32
538+
///
539+
/// Note that the type of `input` has already been LLVM type converted:
540+
/// therefore 8-bit and smaller floats are represented as their corresponding
541+
/// `iN` integers.
542+
static Value castScaledMFMAVectorOperand(ConversionPatternRewriter &rewriter,
543+
Location loc, Value input) {
544+
Type inputType = input.getType();
545+
Type outputType = rewriter.getI32Type();
546+
if (auto intType = dyn_cast<IntegerType>(inputType))
547+
return rewriter.create<LLVM::ZExtOp>(loc, outputType, input);
548+
return rewriter.create<LLVM::BitcastOp>(loc, outputType, input);
549+
}
550+
532551
/// Push an input operand. If it is a float type, nothing to do. If it is
533552
/// an integer type, then we need to also push its signedness (1 for signed, 0
534553
/// for unsigned) and we need to pack the input 16xi8 vector into a 4xi32
@@ -827,20 +846,19 @@ mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
827846
}
828847

829848
static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
830-
mfmaOpToScaledIntrinsic(Operation *op, Chipset chipset) {
831-
if (auto mfma = llvm::dyn_cast_or_null<MFMAOp>(op)) {
832-
return mfmaOpToScaledIntrinsic(
833-
mfma.getSourceA().getType(), mfma.getSourceB().getType(),
834-
mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
835-
mfma.getBlocks(), chipset);
836-
}
837-
if (auto smfma = llvm::dyn_cast_or_null<ScaledMFMAOp>(op)) {
838-
return mfmaOpToScaledIntrinsic(smfma.getSourceA().getType(),
839-
smfma.getSourceB().getType(),
840-
smfma.getDestC().getType(), smfma.getM(),
841-
smfma.getN(), smfma.getK(), 1u, chipset);
842-
}
843-
return std::nullopt;
849+
mfmaOpToScaledIntrinsic(MFMAOp mfma, Chipset chipset) {
850+
return mfmaOpToScaledIntrinsic(
851+
mfma.getSourceA().getType(), mfma.getSourceB().getType(),
852+
mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
853+
mfma.getBlocks(), chipset);
854+
}
855+
856+
static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
857+
mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) {
858+
return mfmaOpToScaledIntrinsic(smfma.getSourceA().getType(),
859+
smfma.getSourceB().getType(),
860+
smfma.getDestC().getType(), smfma.getM(),
861+
smfma.getN(), smfma.getK(), 1u, chipset);
844862
}
845863

846864
/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
@@ -991,17 +1009,19 @@ struct ScaledMFMAOpLowering : public ConvertOpToLLVMPattern<ScaledMFMAOp> {
9911009
{convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
9921010
convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
9931011
adaptor.getDestC()});
994-
Value scalesIdxA = createI32Constant(rewriter, loc, adaptor.getScalesIdxA());
995-
Value scalesIdxB = createI32Constant(rewriter, loc, adaptor.getScalesIdxB());
1012+
Value scalesIdxA =
1013+
createI32Constant(rewriter, loc, adaptor.getScalesIdxA());
1014+
Value scalesIdxB =
1015+
createI32Constant(rewriter, loc, adaptor.getScalesIdxB());
9961016
loweredOp.addOperands(
9971017
{createI32Constant(rewriter, loc, aTypeCode),
9981018
createI32Constant(rewriter, loc, bTypeCode),
1019+
/*scales idx A=*/scalesIdxA,
9991020
/*scales A*/
1000-
convertMFMAVectorOperand(rewriter, loc, adaptor.getScalesA()),
1021+
castScaledMFMAVectorOperand(rewriter, loc, adaptor.getScalesA()),
1022+
/*scales idx B=*/scalesIdxB,
10011023
/*scales B*/
1002-
convertMFMAVectorOperand(rewriter, loc, adaptor.getScalesB()),
1003-
/*scales idx A=*/scalesIdxA,
1004-
/*scales idx B=*/scalesIdxB});
1024+
castScaledMFMAVectorOperand(rewriter, loc, adaptor.getScalesB())});
10051025
Value lowered = rewriter.create(loweredOp)->getResult(0);
10061026
rewriter.replaceOp(op, lowered);
10071027
return success();

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -506,19 +506,6 @@ LogicalResult GatherToLDSOp::verify() {
506506
return success();
507507
}
508508

509-
LogicalResult ScaledMFMAOp::verify() {
510-
unsigned scalesIdxA = getScalesIdxA();
511-
unsigned scalesIdxB = getScalesIdxB();
512-
513-
if (scalesIdxA > 3)
514-
return emitOpError("scales idx A must be a value from 0 to 3 inclusive");
515-
516-
if (scalesIdxB > 3)
517-
return emitOpError("scales idx B must be a value from 0 to 3 inclusive");
518-
519-
return success();
520-
}
521-
522509
#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"
523510

524511
#define GET_ATTRDEF_CLASSES

mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -60,42 +60,43 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
6060

6161
// CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32
6262
// CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32
63-
// CHECK: llvm.bitcast
63+
// CHECK: %[[c2:.+]] = llvm.bitcast{{.*}} : vector<4xi8> to i32
64+
// CHECK: %[[c3:.+]] = llvm.zext{{.*}} : i8 to i32
6465

65-
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
66-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg2 ) * ( %arg8 [ 1 ] * %arg2 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf8E4M3FN>, vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<16xf32>
67-
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
68-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg2 ) * ( %arg8 [ 1 ] * %arg2 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf8E4M3FN>, vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<4xf32>
66+
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
67+
amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<32xf8E4M3FN>, vector<16xf32>
68+
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
69+
amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf8E4M3FN>, i8, vector<32xf8E4M3FN>, vector<4xf32>
6970

7071
// CHECK: llvm.bitcast
7172

72-
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
73-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg3 ) * ( %arg8 [ 1 ] * %arg3 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf8E5M2>, vector<4xi8>, vector<32xf8E5M2>, i8, vector<16xf32>
74-
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
75-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg3 ) * ( %arg8 [ 1 ] * %arg3 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf8E5M2>, vector<4xi8>, vector<32xf8E5M2>, i8, vector<4xf32>
73+
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
74+
amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf8E5M2>, i8, vector<32xf8E5M2>, vector<16xf32>
75+
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
76+
amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf8E5M2>, i8, vector<32xf8E5M2>, vector<4xf32>
7677

7778
// CHECK: llvm.bitcast
7879

79-
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
80-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg4 ) * ( %arg8 [ 1 ] * %arg4 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf6E2M3FN>, vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<16xf32>
81-
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
82-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg4 ) * ( %arg8 [ 1 ] * %arg4 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf6E2M3FN>, vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<4xf32>
80+
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
81+
amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<16xf32>
82+
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
83+
amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<4xf32>
8384

8485
// CHECK: llvm.bitcast
85-
// CHECK: %[[c3:.+]] = llvm.mlir.constant(3 : i32) : i32
86+
// CHECK: llvm.mlir.constant(3 : i32) : i32
8687

87-
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
88-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg5 ) * ( %arg8 [ 1 ] * %arg5 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf6E3M2FN>, vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<16xf32>
89-
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
90-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg5 ) * ( %arg8 [ 1 ] * %arg5 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf6E3M2FN>, vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<4xf32>
88+
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
89+
amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<32xf6E3M2FN>, vector<16xf32>
90+
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
91+
amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf6E3M2FN>, i8, vector<32xf6E3M2FN>, vector<4xf32>
9192

9293
// CHECK: llvm.bitcast
93-
// CHECK: %[[c4:.+]] = llvm.mlir.constant(4 : i32) : i32
94+
// CHECK: llvm.mlir.constant(4 : i32) : i32
9495

95-
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i8, i32, i32) -> vector<16xf32>
96-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg6 ) * ( %arg8 [ 1 ] * %arg6 ) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<32xf4E2M1FN>, vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<16xf32>
97-
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c1]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i8, i32, i32) -> vector<4xf32>
98-
amdgpu.scaled_mfma ( %arg7 [ 0 ] * %arg6 ) * ( %arg8 [ 1 ] * %arg6 ) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<32xf4E2M1FN>, vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<4xf32>
96+
// CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
97+
amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<32xf4E2M1FN>, vector<16xf32>
98+
// CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c2]], %[[c1]], %[[c3]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
99+
amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xi8>, vector<32xf4E2M1FN>, i8, vector<32xf4E2M1FN>, vector<4xf32>
99100

100101
func.return
101102
}

mlir/test/Dialect/AMDGPU/ops.mlir

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,10 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
164164
%0 = amdgpu.swizzle_bitmode %arg0 1 2 4 : f32
165165
func.return %0 : f32
166166
}
167+
168+
// CHECK-LABEL: func @scaled_mfma
169+
func.func @scaled_mfma(%arg0 : i8, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
170+
// CHECK: amdgpu.scaled_mfma
171+
%0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : i8, vector<32xf6E2M3FN>, i8, vector<32xf6E2M3FN>, vector<16xf32>
172+
func.return %0 : vector<16xf32>
173+
}

0 commit comments

Comments
 (0)