@@ -4218,3 +4218,113 @@ func.func @torch.aten.convolution$si8(%arg0: !torch.vtensor<[2,2,6,6],si8>, %arg
  %4 = torch.aten.convolution %arg0, %arg1, %arg2, %0, %1, %2, %false, %3, %int1 : !torch.vtensor<[2,2,6,6],si8>, !torch.vtensor<[8,2,3,3],si8>, !torch.vtensor<[8],si32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[2,8,4,4],si32>
  return %4 : !torch.vtensor<[2,8,4,4],si32>
}
+ // CHECK-LABEL: func.func @torch.aten.mm$f32(
+ // CHECK-SAME: %[[INP:.*]]: !torch.vtensor<[1,22],f32>,
+ // CHECK-SAME: %[[WTS:.*]]: !torch.vtensor<[22,10],f32>) -> !torch.vtensor<[1,10],f32> {
+ // CHECK: %[[WTS_TENSOR:.*]] = torch_c.to_builtin_tensor %[[WTS]] : !torch.vtensor<[22,10],f32> -> tensor<22x10xf32>
+ // CHECK: %[[INP_TENSOR:.*]] = torch_c.to_builtin_tensor %[[INP]] : !torch.vtensor<[1,22],f32> -> tensor<1x22xf32>
+ // CHECK: %[[INP_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 1, 22]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[INP_RESHAPE:.*]] = tosa.reshape %[[INP_TENSOR]], %[[INP_SHAPE]] : (tensor<1x22xf32>, !tosa.shape<3>) -> tensor<1x1x22xf32>
+ // CHECK: %[[WTS_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 22, 10]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[WTS_RESHAPE:.*]] = tosa.reshape %[[WTS_TENSOR]], %[[WTS_SHAPE]] : (tensor<22x10xf32>, !tosa.shape<3>) -> tensor<1x22x10xf32>
+ // CHECK: %[[INP_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ // CHECK: %[[WTS_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ // CHECK: %[[MATMUL:.*]] = tosa.matmul %[[INP_RESHAPE]], %[[WTS_RESHAPE]], %[[INP_ZP]], %[[WTS_ZP]] : (tensor<1x1x22xf32>, tensor<1x22x10xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x1x10xf32>
+ // CHECK: %[[RES_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 10]> : tensor<2xindex>} : () -> !tosa.shape<2>
+ // CHECK: %[[RES_RESHAPE:.*]] = tosa.reshape %[[MATMUL]], %[[RES_SHAPE]] : (tensor<1x1x10xf32>, !tosa.shape<2>) -> tensor<1x10xf32>
+ // CHECK: %[[RES:.*]] = torch_c.from_builtin_tensor %[[RES_RESHAPE]] : tensor<1x10xf32> -> !torch.vtensor<[1,10],f32>
+ // CHECK: return %[[RES]]
+ func.func @torch.aten.mm$f32(%arg0: !torch.vtensor<[1,22],f32>, %arg1: !torch.vtensor<[22,10],f32>) -> !torch.vtensor<[1,10],f32> {
+   %0 = torch.aten.mm %arg0, %arg1 : !torch.vtensor<[1,22],f32>, !torch.vtensor<[22,10],f32> -> !torch.vtensor<[1,10],f32>
+   return %0 : !torch.vtensor<[1,10],f32>
+ }
+
+ // -----
+ // CHECK-LABEL: func.func @torch.aten.mm$si8
+ // CHECK: tosa.matmul
+ // CHECK-SAME: (tensor<1x1x22xi8>, tensor<1x22x10xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x1x10xi32>
+ // CHECK-NOT: torch.aten.mm
+ // CHECK: tosa.cast
+ // CHECK-SAME: (tensor<1x10xi32>) -> tensor<1x10xi8>
+ func.func @torch.aten.mm$si8(%arg0: !torch.vtensor<[1,22],si8>, %arg1: !torch.vtensor<[22,10],si8>) -> !torch.vtensor<[1,10],si8> {
+   %0 = torch.aten.mm %arg0, %arg1 : !torch.vtensor<[1,22],si8>, !torch.vtensor<[22,10],si8> -> !torch.vtensor<[1,10],si8>
+   return %0 : !torch.vtensor<[1,10],si8>
+ }
+
+ // -----
+ // CHECK-LABEL: func.func @torch.aten.mm$f16
+ // CHECK: tosa.matmul
+ // CHECK-SAME: (tensor<1x1x22xf16>, tensor<1x22x10xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x1x10xf32>
+ // CHECK-NOT: torch.aten.mm
+ // CHECK: tosa.cast
+ // CHECK-SAME: (tensor<1x10xf32>) -> tensor<1x10xf16>
+ func.func @torch.aten.mm$f16(%arg0: !torch.vtensor<[1,22],f16>, %arg1: !torch.vtensor<[22,10],f16>) -> !torch.vtensor<[1,10],f16> {
+   %4 = torch.aten.mm %arg0, %arg1 : !torch.vtensor<[1,22],f16>, !torch.vtensor<[22,10],f16> -> !torch.vtensor<[1,10],f16>
+   return %4 : !torch.vtensor<[1,10],f16>
+ }
+
+ // -----
+ // CHECK-LABEL: func.func @torch.aten.mm$bf16
+ // CHECK: tosa.matmul
+ // CHECK-SAME: (tensor<1x1x22xbf16>, tensor<1x22x10xbf16>, tensor<1xbf16>, tensor<1xbf16>) -> tensor<1x1x10xf32>
+ // CHECK-NOT: torch.aten.mm
+ // CHECK: tosa.cast
+ // CHECK-SAME: (tensor<1x10xf32>) -> tensor<1x10xbf16>
+ func.func @torch.aten.mm$bf16(%arg0: !torch.vtensor<[1,22],bf16>, %arg1: !torch.vtensor<[22,10],bf16>) -> !torch.vtensor<[1,10],bf16> {
+   %4 = torch.aten.mm %arg0, %arg1 : !torch.vtensor<[1,22],bf16>, !torch.vtensor<[22,10],bf16> -> !torch.vtensor<[1,10],bf16>
+   return %4 : !torch.vtensor<[1,10],bf16>
+ }
+
+ // -----
+ // CHECK-LABEL: func.func @torch.aten.matmul$broadcast(
+ // CHECK-SAME: %[[INP:.*]]: !torch.vtensor<[10,3,4],f32>,
+ // CHECK-SAME: %[[WTS:.*]]: !torch.vtensor<[4],f32>) -> !torch.vtensor<[10,3],f32> {
+ // CHECK: %[[WTS_TENSOR:.*]] = torch_c.to_builtin_tensor %[[WTS]] : !torch.vtensor<[4],f32> -> tensor<4xf32>
+ // CHECK: %[[INP_TENSOR:.*]] = torch_c.to_builtin_tensor %[[INP]] : !torch.vtensor<[10,3,4],f32> -> tensor<10x3x4xf32>
+ // CHECK: %[[WTS_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 4, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[WTS_RESHAPE:.*]] = tosa.reshape %[[WTS_TENSOR]], %[[WTS_SHAPE]] : (tensor<4xf32>, !tosa.shape<3>) -> tensor<1x4x1xf32>
+ // CHECK: %[[INP_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 30, 4]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[INP_RESHAPE:.*]] = tosa.reshape %[[INP_TENSOR]], %[[INP_SHAPE]] : (tensor<10x3x4xf32>, !tosa.shape<3>) -> tensor<1x30x4xf32>
+ // CHECK: %[[WTS_TRANSPOSE:.*]] = tosa.transpose %[[WTS_RESHAPE]] {perms = array<i32: 1, 0, 2>} : (tensor<1x4x1xf32>) -> tensor<4x1x1xf32>
+ // CHECK: %[[WTS_SHAPE_2:.*]] = tosa.const_shape {values = dense<[1, 4, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[WTS_RESHAPE_2:.*]] = tosa.reshape %[[WTS_TRANSPOSE]], %[[WTS_SHAPE_2]] : (tensor<4x1x1xf32>, !tosa.shape<3>) -> tensor<1x4x1xf32>
+ // CHECK: %[[INP_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ // CHECK: %[[WTS_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+ // CHECK: %[[MATMUL:.*]] = tosa.matmul %[[INP_RESHAPE]], %[[WTS_RESHAPE_2]], %[[INP_ZP]], %[[WTS_ZP]] : (tensor<1x30x4xf32>, tensor<1x4x1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x30x1xf32>
+ // CHECK: %[[RES_SHAPE:.*]] = tosa.const_shape {values = dense<[10, 3]> : tensor<2xindex>} : () -> !tosa.shape<2>
+ // CHECK: %[[RES_RESHAPE:.*]] = tosa.reshape %[[MATMUL]], %[[RES_SHAPE]] : (tensor<1x30x1xf32>, !tosa.shape<2>) -> tensor<10x3xf32>
+ // CHECK: %[[RES:.*]] = torch_c.from_builtin_tensor %[[RES_RESHAPE]] : tensor<10x3xf32> -> !torch.vtensor<[10,3],f32>
+ // CHECK: return %[[RES]]
+ func.func @torch.aten.matmul$broadcast(%arg0: !torch.vtensor<[10,3,4],f32>, %arg1: !torch.vtensor<[4],f32>) -> !torch.vtensor<[10,3],f32> {
+   %0 = torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[10,3,4],f32>, !torch.vtensor<[4],f32> -> !torch.vtensor<[10,3],f32>
+   return %0 : !torch.vtensor<[10,3],f32>
+ }
+
+ // -----
+ // CHECK-LABEL: func.func @torch.aten.linear$f16(
+ // CHECK-SAME: %[[INP:.*]]: !torch.vtensor<[2,4],f16>,
+ // CHECK-SAME: %[[WTS:.*]]: !torch.vtensor<[3,4],f16>,
+ // CHECK-SAME: %[[BIAS:.*]]: !torch.vtensor<[3],f16>) -> !torch.vtensor<[2,3],f16> {
+ // CHECK: %[[BIAS_TENSOR:.*]] = torch_c.to_builtin_tensor %[[BIAS]] : !torch.vtensor<[3],f16> -> tensor<3xf16>
+ // CHECK: %[[WTS_TENSOR:.*]] = torch_c.to_builtin_tensor %[[WTS]] : !torch.vtensor<[3,4],f16> -> tensor<3x4xf16>
+ // CHECK: %[[INP_TENSOR:.*]] = torch_c.to_builtin_tensor %[[INP]] : !torch.vtensor<[2,4],f16> -> tensor<2x4xf16>
+ // CHECK: %[[WTS_TRANSPOSE:.*]] = tosa.transpose %[[WTS_TENSOR]] {perms = array<i32: 1, 0>} : (tensor<3x4xf16>) -> tensor<4x3xf16>
+ // CHECK: %[[INP_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 2, 4]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[INP_RESHAPE:.*]] = tosa.reshape %[[INP_TENSOR]], %[[INP_SHAPE]] : (tensor<2x4xf16>, !tosa.shape<3>) -> tensor<1x2x4xf16>
+ // CHECK: %[[WTS_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 4, 3]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK: %[[WTS_RESHAPE:.*]] = tosa.reshape %[[WTS_TRANSPOSE]], %[[WTS_SHAPE]] : (tensor<4x3xf16>, !tosa.shape<3>) -> tensor<1x4x3xf16>
+ // CHECK: %[[INP_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf16>}> : () -> tensor<1xf16>
+ // CHECK: %[[WTS_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf16>}> : () -> tensor<1xf16>
+ // CHECK: %[[MATMUL:.*]] = tosa.matmul %[[INP_RESHAPE]], %[[WTS_RESHAPE]], %[[INP_ZP]], %[[WTS_ZP]] : (tensor<1x2x4xf16>, tensor<1x4x3xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x2x3xf32>
+ // CHECK: %[[RES_SHAPE:.*]] = tosa.const_shape {values = dense<[2, 3]> : tensor<2xindex>} : () -> !tosa.shape<2>
+ // CHECK: %[[RES_RESHAPE:.*]] = tosa.reshape %[[MATMUL]], %[[RES_SHAPE]] : (tensor<1x2x3xf32>, !tosa.shape<2>) -> tensor<2x3xf32>
+ // CHECK: %[[CAST:.*]] = tosa.cast %[[RES_RESHAPE]] : (tensor<2x3xf32>) -> tensor<2x3xf16>
+ // CHECK: %[[BIAS_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 3]> : tensor<2xindex>} : () -> !tosa.shape<2>
+ // CHECK: %[[BIAS_RESHAPE:.*]] = tosa.reshape %[[BIAS_TENSOR]], %[[BIAS_SHAPE]] : (tensor<3xf16>, !tosa.shape<2>) -> tensor<1x3xf16>
+ // CHECK: %[[ADD:.*]] = tosa.add %[[CAST]], %[[BIAS_RESHAPE]] : (tensor<2x3xf16>, tensor<1x3xf16>) -> tensor<2x3xf16>
+ // CHECK: %[[RES:.*]] = torch_c.from_builtin_tensor %[[ADD]] : tensor<2x3xf16> -> !torch.vtensor<[2,3],f16>
+ // CHECK: return %[[RES]]
+ func.func @torch.aten.linear$f16(%arg0: !torch.vtensor<[2,4],f16>, %arg1: !torch.vtensor<[3,4],f16>, %arg2: !torch.vtensor<[3],f16>) -> !torch.vtensor<[2,3],f16> {
+   %0 = torch.aten.linear %arg0, %arg1, %arg2 : !torch.vtensor<[2,4],f16>, !torch.vtensor<[3,4],f16>, !torch.vtensor<[3],f16> -> !torch.vtensor<[2,3],f16>
+   return %0 : !torch.vtensor<[2,3],f16>
+ }