// NUMPLACEHOLDERS 3 NUMVARIANTS 3
// PLACEHOLDER DTYPE f32 f16 bf16
// PLACEHOLDER PRINTDTYPE @printMemrefF32 @printMemrefF16 @printMemrefBF16
// PLACEHOLDER CHECKDTYPE @printAllcloseF32 @printAllcloseF16 @printAllcloseBF16
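// NOTE: the header above appears to direct imex_runner to emit three variants
// of this test, one per column (f32, f16, bf16), substituting @DTYPE@,
// @PRINTDTYPE@, and @CHECKDTYPE@ accordingly.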
// RUN: %python_executable %imex_runner -i %s --pass-pipeline-file=%p/linalg-to-cpu.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%irunner_utils \
// RUN: --entry-point-result=void --filecheck
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%irunner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%irunner_utils,%sycl_runtime --filecheck
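// #map0 is the identity map used for an elementwise copy; #map1/#map2/#map3
// drive a direct 2-D convolution: #map1 reads the padded NHWC input at
// (n, h + kh, w + kw, c), #map2 reads the HWCF filter at (kh, kw, c, f), and
// #map3 writes the NHWC output at (n, h, w, f).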
#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
module @convolution {
  func.func @test(%arg0: tensor<1x56x56x64x@DTYPE@>, %arg1: tensor<3x3x64x64x@DTYPE@>) -> tensor<1x56x56x64x@DTYPE@> {
    %cst = arith.constant 0.000000e+00 : @DTYPE@
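    // Elementwise pass-through copy of the input into a fresh tensor.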
    %0 = tensor.empty() : tensor<1x56x56x64x@DTYPE@>
    %1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x56x56x64x@DTYPE@>) outs(%0 : tensor<1x56x56x64x@DTYPE@>) {
    ^bb0(%arg2: @DTYPE@, %arg3: @DTYPE@):
      linalg.yield %arg2 : @DTYPE@
    } -> tensor<1x56x56x64x@DTYPE@>
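    // Zero-pad H and W by one on each side (56x56 -> 58x58), i.e. the "same"
    // padding required by the 3x3 kernel below.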
    %cst_0 = arith.constant 0.000000e+00 : @DTYPE@
    %2 = tensor.pad %1 low[0, 1, 1, 0] high[0, 1, 1, 0] {
    ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
      tensor.yield %cst_0 : @DTYPE@
    } : tensor<1x56x56x64x@DTYPE@> to tensor<1x58x58x64x@DTYPE@>
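    // Zero-initialize the accumulator before the reduction.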
    %3 = tensor.empty() : tensor<1x56x56x64x@DTYPE@>
    %4 = linalg.fill ins(%cst : @DTYPE@) outs(%3 : tensor<1x56x56x64x@DTYPE@>) -> tensor<1x56x56x64x@DTYPE@>
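    // Multiply-accumulate over the (kh, kw, c) = 3x3x64 reduction window.
    // With the all-1.0 input and all-0.5 filter supplied by @main, every
    // interior output element is 3 * 3 * 64 * 0.5 = 288; elements on the
    // border see a smaller valid window because the halo is zero padding.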
    %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%2, %arg1 : tensor<1x58x58x64x@DTYPE@>, tensor<3x3x64x64x@DTYPE@>) outs(%4 : tensor<1x56x56x64x@DTYPE@>) attrs = {iterator_ranges = [1, 56, 56, 64, 3, 3, 64]} {
    ^bb0(%arg2: @DTYPE@, %arg3: @DTYPE@, %arg4: @DTYPE@):
      %6 = arith.mulf %arg2, %arg3 : @DTYPE@
      %7 = arith.addf %arg4, %6 : @DTYPE@
      linalg.yield %7 : @DTYPE@
    } -> tensor<1x56x56x64x@DTYPE@>
    return %5 : tensor<1x56x56x64x@DTYPE@>
  }
  func.func @main() {
    %0 = arith.constant dense<1.0> : tensor<1x56x56x64x@DTYPE@>
    %1 = arith.constant dense<0.5> : tensor<3x3x64x64x@DTYPE@>
    %2 = call @test(%0, %1) : (tensor<1x56x56x64x@DTYPE@>, tensor<3x3x64x64x@DTYPE@>) -> tensor<1x56x56x64x@DTYPE@>
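    // Inspect only the 64 output channels at position (0, 0, 0), the
    // top-left corner of the output image.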
    %3 = tensor.extract_slice %2[0, 0, 0, 0][1, 1, 1, 64][1, 1, 1, 1] : tensor<1x56x56x64x@DTYPE@> to tensor<64x@DTYPE@>
    %unranked = tensor.cast %3 : tensor<64x@DTYPE@> to tensor<*x@DTYPE@>
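    // At the corner only a 2x2 slice of the 3x3 window overlaps real data
    // (the rest falls in the zero padding), so each channel accumulates
    // 2 * 2 * 64 * 0.5 = 128.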
    %ref = arith.constant dense<128.0> : tensor<64xf32>
    %unranked_ref = tensor.cast %ref : tensor<64xf32> to tensor<*xf32>
    call @CHECKDTYPE@(%unranked, %unranked_ref) : (tensor<*x@DTYPE@>, tensor<*xf32>) -> ()
    call @PRINTDTYPE@(%unranked) : (tensor<*x@DTYPE@>) -> ()
    return
    // CHECK: [ALLCLOSE: TRUE]
  }
  func.func private @PRINTDTYPE@(tensor<*x@DTYPE@>)
  func.func private @CHECKDTYPE@(tensor<*x@DTYPE@>, tensor<*xf32>)
}