Fix CPU EP Tile 0D overvalidation (microsoft#25821)

fdwr · web-flow · commit 978bfca5931a · 2025-10-01T17:31:32.000-07:00
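### Description Fixes microsoft#11523. Tiling a scalar (0-D tensor) should simply be a no-op. Removing the over-validation in the CPU EP's Tile kernel (the `input_rank < 1` rejection) is enough to make this case work. ### Motivation and Context Conformance with expected behavior: a 0-D input with an empty `repeats` tensor is now accepted rather than rejected with INVALID_ARGUMENT.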
diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc
@@ -185,8 +185,6 @@ Status Tile::Compute(OpKernelContext* ctx) const {
   tensor_pointer = ctx->Input<Tensor>(1);
   if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "Input count of Tile OP mismatch, the second one is empty");
   const Tensor& repeats_tensor = *tensor_pointer;
-  if (input_rank < 1)
-    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "the tensor to be tiled using Tile OP must be atleast 1 dimensional");
   if (repeats_tensor.Shape().NumDimensions() != 1)
     return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'repeat' input tensor must be 1 dimensional");
   if (size_t(repeats_tensor.Shape().Size()) != input_rank)
diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
@@ -53,12 +53,14 @@ void RunTest(const std::vector<int64_t>& input_dims,
   std::vector<T> output_data(output_size);
   std::vector<int64_t> input_strides(rank);
   std::vector<int64_t> output_strides(rank);
-  input_strides[rank - 1] = output_strides[rank - 1] = 1;
-  if (rank > 1) {
-    for (size_t i = rank - 2;; --i) {
-      input_strides[i] = input_dims[i + 1] * input_strides[i + 1];
-      output_strides[i] = output_dims[i + 1] * output_strides[i + 1];
-      if (i == 0) break;
+  if (rank >= 1) {
+    input_strides[rank - 1] = output_strides[rank - 1] = 1;
+    if (rank > 1) {
+      for (size_t i = rank - 2;; --i) {
+        input_strides[i] = input_dims[i + 1] * input_strides[i + 1];
+        output_strides[i] = output_dims[i + 1] * output_strides[i + 1];
+        if (i == 0) break;
+      }
     }
   }
   for (size_t i = 0; i < output_size; ++i) {
@@ -142,6 +144,14 @@ void RunTestWrapper() {
   RunTest<T>({2, 1, 3}, {2, 2, 1});
   RunTest<T>({2, 1, 3}, {2, 2, 1}, true);
 
+// The WebGPU EP is not currently prepared for this possibility:
+//   onnxruntime/core/providers/webgpu/program.cc:46
+//   ProgramUniformVariableValue(...) length > 0 was false. number of element of uniform variable must be greater than 0.
+#if !defined(USE_WEBGPU)
+  // Tile0D (nop)
+  RunTest<T>({}, {});
+#endif
+
 #if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_WEBGPU)
   // _TileMemcpyKernelFromInput, vectorized 4
   RunTest<T>({256, 512}, {3, 1});