From 1af3abd639c95f102ec4ade5fb69e9334a32bffe Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 16 Dec 2025 09:19:23 -0800
Subject: [PATCH 01/28] Add test model

---
 onnxruntime/test/testdata/test_pad_rce.onnx | Bin 0 -> 179 bytes
 onnxruntime/test/testdata/test_pad_rce.py   |  30 ++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx
 create mode 100644 onnxruntime/test/testdata/test_pad_rce.py

diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..10ddd88cc517c800299a461376e6573be2ced17a
GIT binary patch
literal 179
zcmd<!v0A~%WzWT$nO9I+!o^aMm{QEemz<whT#}eq5?_{>Q<^HomS0)|lvQF5NKDZZ
z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{
iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*#

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py
new file mode 100644
index 0000000000000..fcc487cbf4e73
--- /dev/null
+++ b/onnxruntime/test/testdata/test_pad_rce.py
@@ -0,0 +1,30 @@
+import onnx
+from onnx import helper, TensorProto
+import numpy as np
+
+def create_pad_model():
+    input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None])
+    pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None])
+    constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, [])
+
+    output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None])
+
+    pad_node = helper.make_node(
+        op_type="Pad",
+        inputs=["input", "pads", "constant_value"],
+        outputs=["output"],
+        mode="constant"  # or reflect/edge
+    )
+    graph = helper.make_graph(
+        nodes=[pad_node],
+        name="PadModel",
+        inputs=[input_data, pads, constant_value],
+        outputs=[output]
+    )
+
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
+    onnx.save(model, "test_pad_rce.onnx")
+
+
+if __name__ == "__main__":
+    create_pad_model()
\ No newline at end of file

From 4be2420d9e5a9127b1a75a16c9cc2b38df5dc114 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 16 Dec 2025 09:19:23 -0800
Subject: [PATCH 02/28] Add test model

---
 onnxruntime/test/testdata/test_pad_rce.onnx | Bin 0 -> 179 bytes
 onnxruntime/test/testdata/test_pad_rce.py   |  30 ++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx
 create mode 100644 onnxruntime/test/testdata/test_pad_rce.py

diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..10ddd88cc517c800299a461376e6573be2ced17a
GIT binary patch
literal 179
zcmd<!v0A~%WzWT$nO9I+!o^aMm{QEemz<whT#}eq5?_{>Q<^HomS0)|lvQF5NKDZZ
z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{
iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*#

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py
new file mode 100644
index 0000000000000..fcc487cbf4e73
--- /dev/null
+++ b/onnxruntime/test/testdata/test_pad_rce.py
@@ -0,0 +1,30 @@
+import onnx
+from onnx import helper, TensorProto
+import numpy as np
+
+def create_pad_model():
+    input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None])
+    pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None])
+    constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, [])
+
+    output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None])
+
+    pad_node = helper.make_node(
+        op_type="Pad",
+        inputs=["input", "pads", "constant_value"],
+        outputs=["output"],
+        mode="constant"  # or reflect/edge
+    )
+    graph = helper.make_graph(
+        nodes=[pad_node],
+        name="PadModel",
+        inputs=[input_data, pads, constant_value],
+        outputs=[output]
+    )
+
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
+    onnx.save(model, "test_pad_rce.onnx")
+
+
+if __name__ == "__main__":
+    create_pad_model()
\ No newline at end of file

From 40299515c8a8b93e9fc2004f445fe53ca17ff738 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 16 Dec 2025 11:56:27 -0800
Subject: [PATCH 03/28] Py formatting

---
 onnxruntime/test/testdata/test_pad_rce.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py
index fcc487cbf4e73..0a9faac70aab3 100644
--- a/onnxruntime/test/testdata/test_pad_rce.py
+++ b/onnxruntime/test/testdata/test_pad_rce.py
@@ -1,6 +1,6 @@
 import onnx
-from onnx import helper, TensorProto
-import numpy as np
+from onnx import TensorProto, helper
+
 
 def create_pad_model():
     input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None])
@@ -13,13 +13,10 @@ def create_pad_model():
         op_type="Pad",
         inputs=["input", "pads", "constant_value"],
         outputs=["output"],
-        mode="constant"  # or reflect/edge
+        mode="constant",  # or reflect/edge
     )
     graph = helper.make_graph(
-        nodes=[pad_node],
-        name="PadModel",
-        inputs=[input_data, pads, constant_value],
-        outputs=[output]
+        nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output]
     )
 
     model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
@@ -27,4 +24,4 @@ def create_pad_model():
 
 
 if __name__ == "__main__":
-    create_pad_model()
\ No newline at end of file
+    create_pad_model()

From 0b84e6bfb4a4fce5f08f4547dc356e524e4d1272 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 17 Dec 2025 19:29:16 -0800
Subject: [PATCH 04/28] Continue testing

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 71 ++++++++++---------
 .../test/providers/cpu/tensor/pad_test.cc     | 25 +++++--
 onnxruntime/test/testdata/test_pad_rce.py     |  5 +-
 3 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index ab261bbb8cdb5..f9f9121233ce8 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -347,11 +347,11 @@ void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<c
                                 gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
   const size_t dims_count = input_dims.size();
   size_t inner_axis = dims_count - 1;
-  size_t inner_size = 1;
+  SafeInt<size_t> inner_size = 1;
 
   // Find all inner most dimensions that can be flattened.
   do {
-    inner_size *= static_cast<size_t>(input_dims[inner_axis]);
+    inner_size *= input_dims[inner_axis];
 
     if (inner_axis == 0)
       break;
@@ -378,8 +378,8 @@ void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count
             reshaped_pad.begin() + new_dim_count);
 
   // Flatten inner axis.
-  reshaped_pad[inner_axis] = src_pad[inner_axis] * inner_no_pad_size;
-  reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
+  reshaped_pad[inner_axis] = SafeInt<int64_t>(src_pad[inner_axis]) * inner_no_pad_size;
+  reshaped_pad[inner_axis + new_dim_count] = SafeInt<int64_t>(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size;
 }
 
 // special handling for edge case where the input has one or more dims with value of 0
@@ -468,11 +468,11 @@ static Status PadImpl(OpKernelContext* ctx,
   PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
 
   // Reshape padding
-  size_t new_dims_count = reshaped_input_dims.size();
-  size_t inner_axis = new_dims_count - 1;
-  size_t inner_no_pad_size = onnxruntime::narrow<size_t>(output_dims[inner_axis] > 0
-                                                             ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
-                                                             : 0);
+  const size_t new_dims_count = reshaped_input_dims.size();
+  const size_t inner_axis = new_dims_count - 1;
+  const size_t inner_no_pad_size = onnxruntime::narrow<size_t>(output_dims[inner_axis] > 0
+                                                                   ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
+                                                                   : 0);
   PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
   PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
   PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
@@ -486,13 +486,14 @@ static Status PadImpl(OpKernelContext* ctx,
   input_extents.reserve(new_dims_count);
   for (size_t i = 0; i < new_dims_count; i++) {
     input_starts.push_back(-1 * reshaped_slice[i]);
-    input_extents.push_back(reshaped_input_dims[i] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]);
-    reshaped_output_dims[i] += reshaped_pad[i] + reshaped_pad[i + new_dims_count] +
+    auto extent = SafeInt<int64_t>(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count];
+    input_extents.push_back(extent);
+    reshaped_output_dims[i] += SafeInt<int64_t>(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] +
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
   }
 
   for (size_t i = 0; i < data_rank; i++) {
-    output_dims[i] += pads[i] + pads[i + data_rank] + slices[i] + slices[i + data_rank];
+    output_dims[i] += SafeInt<int64_t>(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank];
   }
 
   // special case an input with one or more dim values of 0. edge case that is easier to handle
@@ -510,11 +511,11 @@ static Status PadImpl(OpKernelContext* ctx,
   auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
 
   TensorPitches output_pitches(reshaped_output_dims);
-  size_t alignSkip = 0;  // Amount to skip to align to where the next input tensor data needs to be written
+  SafeInt<size_t> align_skip = 0;  // Amount to skip to align to where the next input tensor data needs to be written
 
   // Initial skip, sum up the begin padding on each axis
   for (size_t i = 0; i < new_dims_count; i++)
-    alignSkip += SafeInt<size_t>(reshaped_pad[i]) * output_pitches[i];
+    align_skip += SafeInt<size_t>(reshaped_pad[i]) * output_pitches[i];
 
   ExtentAxisCounters input_counters(input_extents);
 
@@ -524,28 +525,28 @@ static Status PadImpl(OpKernelContext* ctx,
       // On loop entry, 'pad' is already set to the first continuous block of padding, and
       // after every pass through the inner loop it gets set to the next continuous pad size.
       while (input_counters) {
-        output += alignSkip;
+        output += align_skip;
         {
-          T* axisStart = output;
+          T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          int64_t prePad = reshaped_pad[inner_axis];
-          int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
-          PadAxisConstant(axisStart - prePad, value, onnxruntime::narrow<size_t>(prePad));
-          PadAxisConstant(output, value, onnxruntime::narrow<size_t>(postPad));
-          output += postPad;
-          alignSkip = onnxruntime::narrow<size_t>(prePad);
+          const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
+          PadAxisConstant(axis_start - pre_pad, value, pre_pad);
+          PadAxisConstant(output, value, post_pad);
+          output += post_pad;
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()];
-          int64_t prePad = reshaped_pad[input_counters.Axis()];
-          int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count];
-          PadAxisConstant(axisStart - prePad * inner_pitch, value, SafeInt<std::ptrdiff_t>(prePad) * inner_pitch);
-          PadAxisConstant(output, value, SafeInt<ptrdiff_t>(postPad) * inner_pitch);
-          output += inner_pitch * postPad;
-          alignSkip += inner_pitch * SafeInt<size_t>(prePad);
+          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          const int64_t pre_pad = reshaped_pad[input_counters.Axis()];
+          const int64_t post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          PadAxisConstant(axis_start - pre_pad * inner_pitch, value, SafeInt<std::ptrdiff_t>(pre_pad) * inner_pitch);
+          PadAxisConstant(output, value, SafeInt<ptrdiff_t>(post_pad) * inner_pitch);
+          output += inner_pitch * post_pad;
+          align_skip += inner_pitch * SafeInt<size_t>(pre_pad);
         }
       }
       break;
@@ -555,7 +556,7 @@ static Status PadImpl(OpKernelContext* ctx,
       // On loop entry, 'pad' is already set to the first continuous block of padding, and
       // after every pass through the inner loop it gets set to the next continuous pad size.
       while (input_counters) {
-        output += alignSkip;
+        output += align_skip;
         {
           T* axisStart = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
@@ -572,7 +573,7 @@ static Status PadImpl(OpKernelContext* ctx,
             PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
           }
           output += postPad;
-          alignSkip = onnxruntime::narrow<size_t>(prePad);
+          align_skip = onnxruntime::narrow<size_t>(prePad);
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
@@ -583,7 +584,7 @@ static Status PadImpl(OpKernelContext* ctx,
           PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow<size_t>(prePad));
           PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow<size_t>(postPad));
           output += inner_pitch * postPad;
-          alignSkip += inner_pitch * SafeInt<size_t>(prePad);
+          align_skip += inner_pitch * SafeInt<size_t>(prePad);
         }
       }
       break;
@@ -594,7 +595,7 @@ static Status PadImpl(OpKernelContext* ctx,
       // On loop entry, 'pad' is already set to the first continuous block of padding, and
       // after every pass through the inner loop it gets set to the next continuous pad size.
       while (input_counters) {
-        output += alignSkip;
+        output += align_skip;
         {
           T* axisStart = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
@@ -644,7 +645,7 @@ static Status PadImpl(OpKernelContext* ctx,
             }
           }
           output += postPad;
-          alignSkip = onnxruntime::narrow<size_t>(prePad);
+          align_skip = onnxruntime::narrow<size_t>(prePad);
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
@@ -684,7 +685,7 @@ static Status PadImpl(OpKernelContext* ctx,
                 onnxruntime::narrow<size_t>(postPad));
           }
           output += inner_pitch * postPad;
-          alignSkip += inner_pitch * SafeInt<size_t>(prePad);
+          align_skip += inner_pitch * SafeInt<size_t>(prePad);
         }
       }
       break;
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 1d9cd15f53327..97ee556d59501 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -890,11 +890,6 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
 }
 
 TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect.";
-  }
-
   using T = TypeParam;
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
@@ -902,7 +897,10 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) {
                                   T(1),
                                   {4, 0},
                                   {},
-                                  "reflect");
+                                  "reflect",
+                                  OpTester::ExpectResult::kExpectSuccess,
+                                  "",
+                                  {kDmlExecutionProvider});  // DML: Unskip when fixed #41968513
 
   RunAllOpsetAllDomainPadTests<T>({0, 2, 1},  // 3D
                                   {},
@@ -912,7 +910,8 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) {
                                   {},
                                   "reflect",
                                   OpTester::ExpectResult::kExpectFailure,
-                                  "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", {kTensorrtExecutionProvider});
+                                  "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}",
+                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});  // DML: Unskip when fixed #41968513
 }
 
 TEST(PadOpTest, BoolType) {
@@ -1089,5 +1088,17 @@ TEST(PadOpTest, ConstantPadNegativeAxes) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider});
 }
 
+// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
+TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
+  using T = float;
+  RunAllOpsetAllDomainPadTests<T>({4},
+                                  {T(1), T(2), T(3), T(4)},
+                                  {-3, 3},
+                                  T(0),
+                                  {4},
+                                  {4, 0, 0, 0},
+                                  "reflect");
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py
index ccbee5565bfd9..0a9faac70aab3 100644
--- a/onnxruntime/test/testdata/test_pad_rce.py
+++ b/onnxruntime/test/testdata/test_pad_rce.py
@@ -16,10 +16,7 @@ def create_pad_model():
         mode="constant",  # or reflect/edge
     )
     graph = helper.make_graph(
-        nodes=[pad_node],
-        name="PadModel",
-        inputs=[input_data, pads, constant_value],
-        outputs=[output]
+        nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output]
     )
 
     model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])

From fdfe2b53200a6bc9868b0ae6af5830298bdf0b3b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 18 Dec 2025 17:51:03 -0800
Subject: [PATCH 05/28] Commit the latest

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  |  74 +++++++--
 .../core/providers/cpu/tensor/padbase.h       |  36 +++++
 .../test/providers/cpu/tensor/pad_test.cc     | 152 +++++++++++++++---
 3 files changed, 225 insertions(+), 37 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index f9f9121233ce8..92beeb9286001 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -457,7 +457,7 @@ static Status PadImpl(OpKernelContext* ctx,
   const auto& input_tensor = *ctx->Input<Tensor>(0);
   const auto& orig_input_shape = input_tensor.Shape();
   auto output_dims(orig_input_shape.AsShapeVector());
-  size_t data_rank = output_dims.size();
+  const size_t data_rank = output_dims.size();
 
   // make copy of raw_pads as it may be mutated below
   ORT_ENFORCE(data_rank > 0, "Input tensor has no dimensions");
@@ -465,14 +465,18 @@ static Status PadImpl(OpKernelContext* ctx,
 
   // Reshape input dims
   TensorShapeVector reshaped_input_dims;
-  PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
+  if (PadBase::ShouldFlattenInnerShape(output_dims, pads, slices)) {
+    PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
+  } else {
+    reshaped_input_dims = output_dims;
+  }
 
   // Reshape padding
   const size_t new_dims_count = reshaped_input_dims.size();
   const size_t inner_axis = new_dims_count - 1;
-  const size_t inner_no_pad_size = onnxruntime::narrow<size_t>(output_dims[inner_axis] > 0
-                                                                   ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
-                                                                   : 0);
+  const int64_t inner_no_pad_size = output_dims[inner_axis] > 0
+                                        ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
+                                        : 0;
   PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
   PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
   PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
@@ -481,17 +485,22 @@ static Status PadImpl(OpKernelContext* ctx,
   TensorShapeVector input_starts;
   TensorShapeVector input_extents;
 
-  // Calculate output dimensions, and handle any negative padding
+  // Calculate reshaped output dimensions, and handle any negative padding
   input_starts.reserve(new_dims_count);
   input_extents.reserve(new_dims_count);
   for (size_t i = 0; i < new_dims_count; i++) {
+    // Starts for every dimension. If slice is negative, we need to start further in, handled by the SliceIterator
     input_starts.push_back(-1 * reshaped_slice[i]);
-    auto extent = SafeInt<int64_t>(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count];
+    // Do not allow negative extents
+    int64_t extent = std::max<int64_t>(SafeInt<int64_t>(reshaped_input_dims[i]) +
+                                           reshaped_slice[i] + reshaped_slice[i + new_dims_count],
+                                       0U);
     input_extents.push_back(extent);
     reshaped_output_dims[i] += SafeInt<int64_t>(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] +
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
   }
 
+  // Compute true output dimensions
   for (size_t i = 0; i < data_rank; i++) {
     output_dims[i] += SafeInt<int64_t>(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank];
   }
@@ -502,20 +511,53 @@ static Status PadImpl(OpKernelContext* ctx,
     return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value);
   }
 
-  TensorShape input_shape(reshaped_input_dims);
-  SliceIterator<T> input(input_tensor, input_shape, input_starts, input_extents, {});
-
-  // output_shape need to keep original.
+  // output_shape needs to keep original.
   TensorShape output_shape(output_dims);
   auto& output_tensor = *ctx->Output(0, output_shape);
   auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
 
+  // Early constant-fill: if any input extent is zero, no data to copy
+  // only padding if any
+  bool no_data_to_copy = false;
+  for (size_t i = 0; i < input_extents.size(); ++i) {
+    if (input_extents[i] == 0) {
+      no_data_to_copy = true;
+      break;
+    }
+  }
+
+  const SafeInt<size_t> total_output_elems(output_shape.Size());
+  if (no_data_to_copy) {
+    if (mode == Mode::Constant) {
+      PadAxisConstant<T>(output, value, total_output_elems);
+      return Status::OK();
+    }
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Pad: invalid mode: ", static_cast<int>(mode), " with zero effective input extent");
+  }
+
   TensorPitches output_pitches(reshaped_output_dims);
-  SafeInt<size_t> align_skip = 0;  // Amount to skip to align to where the next input tensor data needs to be written
+  // Initial skip, sum up the start padding on each axis
+  SafeInt<size_t> align_skip = 0;
+  for (size_t i = 0; i < new_dims_count; i++) {
+    const auto inc = SafeInt<int64_t>(reshaped_pad[i]) * output_pitches[i];
+    align_skip += inc;
+  }
 
-  // Initial skip, sum up the begin padding on each axis
-  for (size_t i = 0; i < new_dims_count; i++)
-    align_skip += SafeInt<size_t>(reshaped_pad[i]) * output_pitches[i];
+  // Validate coverage: pre + copy + post == total
+  SafeInt<size_t> copy_elems = 1;
+  for (size_t i = 0, lim = input_extents.size(); i < lim; ++i) {
+    // All extents are positive here due to the no_data_to_copy check above
+    copy_elems *= input_extents[i];
+  }
+
+  const size_t prepad_elems = align_skip;
+  const size_t postpad_elems = SafeInt<size_t>(total_output_elems) - prepad_elems - copy_elems;
+  ORT_RETURN_IF_ERROR(PadBase::ValidateTotalElementsCoverage(
+      total_output_elems, prepad_elems, copy_elems, postpad_elems));
+
+  TensorShape input_shape(reshaped_input_dims);
+  SliceIterator<T> input(input_tensor, input_shape, input_starts, input_extents, {});
 
   ExtentAxisCounters input_counters(input_extents);
 
@@ -532,7 +574,7 @@ static Status PadImpl(OpKernelContext* ctx,
 
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
           const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
-          PadAxisConstant(axis_start - pre_pad, value, pre_pad);
+          PadAxisConstant(axis_start - *pre_pad.Ptr(), value, pre_pad);
           PadAxisConstant(output, value, post_pad);
           output += post_pad;
           align_skip = pre_pad;
diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h
index 43f9cbfc9f9a4..62324f1b48df1 100644
--- a/onnxruntime/core/providers/cpu/tensor/padbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/padbase.h
@@ -67,6 +67,42 @@ class PadBase {
 
   // End provider shared
 
+  // Only flatten innermost axes when there is no padding and no slicing on ANY axis.
+  static bool ShouldFlattenInnerShape(gsl::span<const int64_t> input_dims,
+                                      gsl::span<const int64_t> pads,
+                                      gsl::span<const int64_t> slices) {
+    const size_t rank = input_dims.size();
+    if (rank == 0) return false;
+    for (size_t i = 0; i < rank; ++i) {
+      if (slices[i] != 0 || slices[rank + i] != 0) return false;
+    }
+
+    const size_t inner = rank - 1;
+    if (pads[inner] != 0 || pads[inner + rank] != 0 ||
+        slices[inner] != 0 || slices[inner + rank] != 0) {
+      return false;
+    }
+    return true;
+  }
+
+  // Guard: pre-pad + copy + post-pad must equal total output elements.
+  static Status ValidateTotalElementsCoverage(size_t total_output_elems,
+                                              size_t prepad_elems,
+                                              size_t copy_elems,
+                                              size_t postpad_elems) {
+    const size_t checked_sum =
+        SafeInt<size_t>(prepad_elems) +
+        SafeInt<size_t>(copy_elems) +
+        SafeInt<size_t>(postpad_elems);
+    if (checked_sum != total_output_elems) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                             +"Pad coverage invalid: pre=", prepad_elems,
+                             +" copy=", copy_elems, " post=", postpad_elems,
+                             +" total=", total_output_elems);
+    }
+    return Status::OK();
+  }
+
   /// <summary>
   /// Flatten no padding inner most Axis, so one memcpy cover multiple Axis.
   /// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 97ee556d59501..386f1e4e37ec0 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -836,11 +836,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
 //      In order to remove the warning, shape inference methods needs to be fixed.
 
 TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect.";
-  }
-
+  // TODO: Enable Dml when fixed #41968513
   using T = TypeParam;
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D
                                   {},
@@ -850,7 +846,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   {},
                                   "edge",
                                   OpTester::ExpectResult::kExpectFailure,
-                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}", {kTensorrtExecutionProvider});
+                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}",
+                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
@@ -860,7 +857,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   {},
                                   "edge",
                                   OpTester::ExpectResult::kExpectFailure,
-                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", {kTensorrtExecutionProvider});
+                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}",
+                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
@@ -878,7 +876,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   {},
                                   "edge",
                                   OpTester::ExpectResult::kExpectFailure,
-                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", {kTensorrtExecutionProvider});
+                                  "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}",
+                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 2, 0},  // 3D
                                   {},
@@ -886,10 +885,22 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   T(1),
                                   {2, 4, 0},
                                   {},
-                                  "edge");
+                                  "edge",
+                                  OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 }
 
-TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) {
+static_assert(sizeof("Pad_Reflect_DimWithZeroInput") > 1, "test-name must not be empty");
+template <typename gtest_TypeParam_>
+class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest<gtest_TypeParam_> {
+ private:
+  typedef PadOpTest<gtest_TypeParam_> TestFixture;
+  typedef gtest_TypeParam_ TypeParam;
+  void TestBody() override;
+};
+[[maybe_unused]] static bool gtest_PadOpTest_Pad_Reflect_DimWithZeroInput_registered_ = ::testing::internal::TypeParameterizedTest<PadOpTest, ::testing::internal::TemplateSel<PadOpTest_Pad_Reflect_DimWithZeroInput_Test>, gtest_type_params_PadOpTest_>::Register("", ::testing::internal::CodeLocation("D:\\dev\\ort_main\\onnxruntime\\test\\providers\\cpu\\tensor\\pad_test.cc", 892), "PadOpTest", "Pad_Reflect_DimWithZeroInput", 0, ::testing::internal::GenerateNames<gtest_type_params_PadOpTest_NameGenerator, gtest_type_params_PadOpTest_>());
+template <typename gtest_TypeParam_>
+void PadOpTest_Pad_Reflect_DimWithZeroInput_Test<gtest_TypeParam_>::TestBody() {
   using T = TypeParam;
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
@@ -1085,20 +1096,119 @@ TEST(PadOpTest, ConstantPadNegativeAxes) {
                          0.0f, 1.0f, 1.0f, 0.0f,
                          0.0f, 1.0f, 1.0f, 0.0f,
                          0.0f, 1.0f, 1.0f, 0.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
-TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
-  using T = float;
-  RunAllOpsetAllDomainPadTests<T>({4},
-                                  {T(1), T(2), T(3), T(4)},
-                                  {-3, 3},
-                                  T(0),
-                                  {4},
-                                  {4, 0, 0, 0},
-                                  "reflect");
+TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
+  OpTester test("Pad", 18);
+  test.AddAttribute("mode", "constant");
+
+  const std::initializer_list<int64_t> input_shape{2, 18, 4};
+
+  /* clang-format off */
+  const std::vector<float> input_data = {
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+      1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6,
+  };
+  /* clang-format on */
+
+  // input_data is larger than the shape elements in this test
+  // constexpr const size_t input_data_size = static_cast<size_t>(2) * 18 * 4;
+  // ASSERT_EQ(input_data_size, input_data.size());
+  auto input_span = gsl::make_span(input_data.data(), static_cast<size_t>(2) * 18 * 4);
+
+  const std::initializer_list<int64_t> pads_shape{6};
+  std::initializer_list<int64_t> pads = {1, 0x100000, -2, -3, 0, 1};
+  ASSERT_EQ(6U, pads.size());
+
+  // Expected shape is as follows:
+  // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output
+  // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 0x10000
+  // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3
+  // Resulting shape is {0, 0x10000, 3} with 0 at the front.
+  // How do we handle zero shapes? Currently ONNX spec allows it.
+  constexpr int64_t dim0 = 2 + 1 - 3;
+  constexpr int64_t dim1 = 18 + 0x100000 - 0;
+  constexpr int64_t dim2 = 4 + -2 + 1;
+  const std::initializer_list<int64_t> output_shape{dim0, dim1, dim2};
+
+  std::vector<float> output_data;  // empty now
+
+  test.AddInput<float>("data", input_shape, input_span);
+  test.AddInput<int64_t>("pads", pads_shape, pads);
+  test.AddInput<float>("value", {}, {100.f});
+
+  // Omit Axis input
+  test.AddOutput<float>("output", output_shape, output_data);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
+TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
+  const std::vector<int64_t> input_shape{2, 6, 4};
+  std::vector<float> input_data(2 * 6 * 4);
+
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>((i % 5) + 1);
+  }
+
+  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
+  const float cv = 9.0f;
+  const std::vector<int64_t> expected_shape{2, 9, 3};
+
+  std::vector<float> expected_output = {
+      // a0 = 0
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      3.f, 4.f, 9.f,
+      2.f, 3.f, 9.f,
+      1.f, 2.f, 9.f,
+      5.f, 1.f, 9.f,
+      4.f, 5.f, 9.f,
+      3.f, 4.f, 9.f,
+
+      // a0 = 1 (cropped original, fully padded slice)
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f};
+
+  ASSERT_EQ(2U * 9U * 3U, expected_output.size());
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddOutput<float>("output", expected_shape, expected_output);
+  test.AddAttribute("mode", "constant");
+  test.Run();
 }
 
+// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
+// TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
+//  using T = float;
+//  RunAllOpsetAllDomainPadTests<T>({4},
+//                                  {T(1), T(2), T(3), T(4)},
+//                                  {-3, 3},
+//                                  T(0),
+//                                  {4},
+//                                  {4, 0, 0, 0},
+//                                  "reflect");
+//}
+
 }  // namespace test
 }  // namespace onnxruntime

From 3e711a514bb32120090f719304cda26d53919dd3 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 19 Dec 2025 10:49:44 -0800
Subject: [PATCH 06/28] Fix up constant

---
 onnxruntime/core/providers/cpu/tensor/pad.cc | 22 +++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 92beeb9286001..696acd069691b 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -574,8 +574,12 @@ static Status PadImpl(OpKernelContext* ctx,
 
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
           const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
-          PadAxisConstant(axis_start - *pre_pad.Ptr(), value, pre_pad);
-          PadAxisConstant(output, value, post_pad);
+          if (pre_pad > 0) {
+            PadAxisConstant(axis_start - static_cast<size_t>(pre_pad), value, pre_pad);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(output, value, post_pad);
+          }
           output += post_pad;
           align_skip = pre_pad;
         }
@@ -583,12 +587,16 @@ static Status PadImpl(OpKernelContext* ctx,
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
           T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
-          const int64_t pre_pad = reshaped_pad[input_counters.Axis()];
-          const int64_t post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
-          PadAxisConstant(axis_start - pre_pad * inner_pitch, value, SafeInt<std::ptrdiff_t>(pre_pad) * inner_pitch);
-          PadAxisConstant(output, value, SafeInt<ptrdiff_t>(post_pad) * inner_pitch);
+          const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          if (pre_pad > 0) {
+            PadAxisConstant(axis_start - static_cast<ptrdiff_t>(pre_pad * inner_pitch), value, pre_pad * inner_pitch);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(output, value, post_pad * inner_pitch);
+          }
           output += inner_pitch * post_pad;
-          align_skip += inner_pitch * SafeInt<size_t>(pre_pad);
+          align_skip += inner_pitch * pre_pad;
         }
       }
       break;

From af3a0160abc9509a58a0a2c983e195c8ebfb60b8 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 19 Dec 2025 11:15:20 -0800
Subject: [PATCH 07/28] More refactoring

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 111 +++++++++++-------
 .../test/providers/cpu/tensor/pad_test.cc     |  90 +++++++-------
 2 files changed, 113 insertions(+), 88 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 696acd069691b..6c3f1c71241c9 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -608,33 +608,48 @@ static Status PadImpl(OpKernelContext* ctx,
       while (input_counters) {
         output += align_skip;
         {
-          T* axisStart = output;
+          T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          int64_t prePad = reshaped_pad[inner_axis];
-          int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
+          SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+          SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
-            PadAxisConstant(axisStart - prePad, *axisStart, onnxruntime::narrow<size_t>(prePad));
-            PadAxisConstant(output, *(output - 1), onnxruntime::narrow<size_t>(postPad));
+            if (pre_pad > 0) {
+              PadAxisConstant(axis_start - static_cast<size_t>(pre_pad), *axis_start, pre_pad);
+            }
+            if (post_pad > 0) {
+              PadAxisConstant(output, *(output - 1), post_pad);
+            }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
-            PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow<size_t>(pads[inner_axis]));
-            PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
+            if (pads[inner_axis] > 0) {
+              PadAxis(axis_start - static_cast<size_t>(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis]));
+            }
+            if (pads[inner_axis + data_rank] > 0) {
+              PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
+            }
           }
-          output += postPad;
-          align_skip = onnxruntime::narrow<size_t>(prePad);
+          output += post_pad;
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()];
-          int64_t prePad = reshaped_pad[input_counters.Axis()];
-          int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count];
-          PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow<size_t>(prePad));
-          PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow<size_t>(postPad));
-          output += inner_pitch * postPad;
-          align_skip += inner_pitch * SafeInt<size_t>(prePad);
+          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          if (pre_pad > 0) {
+            PadAxis(axis_start - static_cast<size_t>(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch,
+                    pre_pad);
+          }
+          if (post_pad > 0) {
+            PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad);
+          }
+          output += inner_pitch * post_pad;
+          align_skip += inner_pitch * pre_pad;
         }
       }
       break;
@@ -647,25 +662,35 @@ static Status PadImpl(OpKernelContext* ctx,
       while (input_counters) {
         output += align_skip;
         {
-          T* axisStart = output;
+          T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          int64_t prePad = reshaped_pad[inner_axis];
-          int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
+          const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
             if (mode == Mode::Reflect) {
-              PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, onnxruntime::narrow<size_t>(prePad));
-              PadInnermostAxis(output, output - 2, -1 /* inputDelta */, onnxruntime::narrow<size_t>(postPad));
+              if (pre_pad > 0) {
+                PadInnermostAxis(axis_start - static_cast<size_t>(pre_pad),
+                                 axis_start + static_cast<size_t>(pre_pad), -1 /* inputDelta */, pre_pad);
+              }
+              if (post_pad > 0) {
+                PadInnermostAxis(output, output - 2, -1 /* inputDelta */, post_pad);
+              }
             } else {
-              PadInnermostAxis(axisStart - prePad, output - prePad, 1 /* inputDelta */, onnxruntime::narrow<size_t>(prePad));
-              PadInnermostAxis(output, axisStart, 1 /* inputDelta */, onnxruntime::narrow<size_t>(postPad));
+              if (pre_pad > 0) {
+                PadInnermostAxis(axis_start - static_cast<size_t>(pre_pad),
+                                 output - static_cast<size_t>(pre_pad), 1 /* inputDelta */, pre_pad);
+              }
+              if (post_pad > 0) {
+                PadInnermostAxis(output, axis_start, 1 /* inputDelta */, post_pad);
+              }
             }
           } else {
             // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode.
             if (mode == Mode::Reflect) {
               PadAxis(
-                  axisStart - prePad,
-                  axisStart + prePad,
+                  axis_start - static_cast<size_t>(pre_pad),
+                  axis_start + static_cast<size_t>(pre_pad),
                   1,
                   -ptrdiff_t(inner_no_pad_size * 2),
                   inner_no_pad_size,
@@ -679,7 +704,7 @@ static Status PadImpl(OpKernelContext* ctx,
                   onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             } else {
               PadAxis(
-                  axisStart - prePad,
+                  axis_start - static_cast<size_t>(pre_pad),
                   output - pads[inner_axis] * inner_no_pad_size,
                   1,
                   0,
@@ -687,55 +712,55 @@ static Status PadImpl(OpKernelContext* ctx,
                   onnxruntime::narrow<size_t>(pads[inner_axis]));
               PadAxis(
                   output,
-                  axisStart,
+                  axis_start,
                   1,
                   0,
                   inner_no_pad_size,
                   onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             }
           }
-          output += postPad;
-          align_skip = onnxruntime::narrow<size_t>(prePad);
+          output += post_pad;
+          align_skip = onnxruntime::narrow<size_t>(pre_pad);
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()];
-          int64_t prePad = reshaped_pad[input_counters.Axis()];
-          int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (mode == Mode::Reflect) {
             PadAxis(
-                axisStart - prePad * inner_pitch,
-                axisStart + prePad * inner_pitch,
+                axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
+                axis_start + static_cast<size_t>(pre_pad) * inner_pitch,
                 1,
                 -inner_pitch * 2,
                 inner_pitch,
-                onnxruntime::narrow<size_t>(prePad));
+                pre_pad);
             PadAxis(
                 output,
                 output - 2 * inner_pitch,
                 1,
                 -inner_pitch * 2,
                 inner_pitch,
-                onnxruntime::narrow<size_t>(postPad));
+                post_pad);
           } else {
             PadAxis(
-                axisStart - prePad * inner_pitch,
-                output - prePad * inner_pitch,
+                axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
+                output - static_cast<size_t>(pre_pad) * inner_pitch,
                 1,
                 0,
                 inner_pitch,
-                onnxruntime::narrow<size_t>(prePad));
+                pre_pad);
             PadAxis(
                 output,
-                axisStart,
+                axis_start,
                 1,
                 0,
                 inner_pitch,
-                onnxruntime::narrow<size_t>(postPad));
+                post_pad);
           }
-          output += inner_pitch * postPad;
-          align_skip += inner_pitch * SafeInt<size_t>(prePad);
+          output += inner_pitch * post_pad;
+          align_skip += inner_pitch * pre_pad;
         }
       }
       break;
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 386f1e4e37ec0..c82f17012fd47 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1152,51 +1152,51 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
-  const std::vector<int64_t> input_shape{2, 6, 4};
-  std::vector<float> input_data(2 * 6 * 4);
-
-  for (size_t i = 0; i < input_data.size(); ++i) {
-    input_data[i] = static_cast<float>((i % 5) + 1);
-  }
-
-  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
-  const float cv = 9.0f;
-  const std::vector<int64_t> expected_shape{2, 9, 3};
-
-  std::vector<float> expected_output = {
-      // a0 = 0
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      3.f, 4.f, 9.f,
-      2.f, 3.f, 9.f,
-      1.f, 2.f, 9.f,
-      5.f, 1.f, 9.f,
-      4.f, 5.f, 9.f,
-      3.f, 4.f, 9.f,
-
-      // a0 = 1 (cropped original, fully padded slice)
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f};
-
-  ASSERT_EQ(2U * 9U * 3U, expected_output.size());
-
-  OpTester test("Pad", 18);
-  test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-  test.AddInput<float>("constant_value", {}, {cv});
-  test.AddOutput<float>("output", expected_shape, expected_output);
-  test.AddAttribute("mode", "constant");
-  test.Run();
-}
+// TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
+//   const std::vector<int64_t> input_shape{2, 6, 4};
+//   std::vector<float> input_data(2 * 6 * 4);
+//
+//   for (size_t i = 0; i < input_data.size(); ++i) {
+//     input_data[i] = static_cast<float>((i % 5) + 1);
+//   }
+//
+//   const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
+//   const float cv = 9.0f;
+//   const std::vector<int64_t> expected_shape{2, 9, 3};
+//
+//   std::vector<float> expected_output = {
+//       // a0 = 0
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       3.f, 4.f, 9.f,
+//       2.f, 3.f, 9.f,
+//       1.f, 2.f, 9.f,
+//       5.f, 1.f, 9.f,
+//       4.f, 5.f, 9.f,
+//       3.f, 4.f, 9.f,
+//
+//       // a0 = 1 (cropped original, fully padded slice)
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f,
+//       9.f, 9.f, 9.f};
+//
+//   ASSERT_EQ(2U * 9U * 3U, expected_output.size());
+//
+//   OpTester test("Pad", 18);
+//   test.AddInput<float>("data", input_shape, input_data);
+//   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+//   test.AddInput<float>("constant_value", {}, {cv});
+//   test.AddOutput<float>("output", expected_shape, expected_output);
+//   test.AddAttribute("mode", "constant");
+//   test.Run();
+// }
 
 // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {

From e1e8fa9aa7ef428bb757e7edfa1967117f7268a0 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 19 Dec 2025 13:41:10 -0800
Subject: [PATCH 08/28] Add instrumentation

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 197 ++++++++++--------
 .../test/providers/cpu/tensor/pad_test.cc     |  90 ++++----
 2 files changed, 156 insertions(+), 131 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 6c3f1c71241c9..73ceb080e7ffe 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -382,6 +382,28 @@ void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count
   reshaped_pad[inner_axis + new_dim_count] = SafeInt<int64_t>(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size;
 }
 
+template <typename T>
+struct OutputSink {
+  void operator()(T* output, T value) const {
+#ifdef _DEBUG
+    if (output < beg || output >= end) {
+      throw std::out_of_range("Pad OutputSink: Output pointer is out of range");
+    }
+#endif
+    *output = value;
+  }
+
+#ifdef _DEBUG
+  OutputSink(T* output, T* output_end)
+      : beg(output), end(output_end) {}
+
+  T* beg;
+  T* end;
+#else
+  OutputSink(T* /* output */, T* /* output_end */) {}
+#endif
+};
+
 // special handling for edge case where the input has one or more dims with value of 0
 template <typename T>
 static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
@@ -406,11 +428,11 @@ static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
 
 // This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
 template <typename T>
-static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
+static void PadAxis(OutputSink<T>& sink, T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
                     size_t block_size, size_t block_count) {
   for (size_t block_index = 0; block_index < block_count; block_index++) {
     for (size_t i = 0; i < block_size; i++) {
-      *output++ = *input;
+      sink(output++, *input);
       input += input_delta;
     }
     input += input_pitch;
@@ -420,27 +442,27 @@ static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_
 // These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
 // and inputPitch and inputDelta are just a single value added each iteration.
 template <typename T>
-static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
+static void PadInnermostAxis(OutputSink<T>& sink, T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
   for (size_t block_index = 0; block_index < block_count; block_index++) {
-    *output++ = *input;
+    sink(output++, *input);
     input += input_delta;
   }
 }
 
 // For constant padding, there is no input, just a size to write the constant to
 template <typename T>
-static void PadAxisConstant(T* output, T constant, size_t size) {
+static void PadAxisConstant(OutputSink<T>& sink, T* output, T constant, size_t size) {
   if (size == 1) {
-    *output = constant;
+    sink(output, constant);
   } else if (size == 2) {
-    *output = constant;
-    *(output + 1) = constant;
+    sink(output, constant);
+    sink(output + 1, constant);
   } else {
     // This would be faster with SSE instructions.
     // That would mean to have an implementation for each type (uint8, uint32, uint64).
     T* end = output + size;
     for (; output != end;)
-      *output++ = constant;
+      sink(output++, constant);
   }
 }
 
@@ -514,7 +536,11 @@ static Status PadImpl(OpKernelContext* ctx,
   // output_shape needs to keep original.
   TensorShape output_shape(output_dims);
   auto& output_tensor = *ctx->Output(0, output_shape);
+
+  const SafeInt<size_t> total_output_elems(output_shape.Size());
   auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
+  auto* output_end = output + static_cast<size_t>(total_output_elems);
+  OutputSink<T> sink(output, output_end);
 
   // Early constant-fill: if any input extent is zero, no data to copy
   // only padding if any
@@ -526,10 +552,9 @@ static Status PadImpl(OpKernelContext* ctx,
     }
   }
 
-  const SafeInt<size_t> total_output_elems(output_shape.Size());
   if (no_data_to_copy) {
     if (mode == Mode::Constant) {
-      PadAxisConstant<T>(output, value, total_output_elems);
+      PadAxisConstant<T>(sink, output, value, total_output_elems);
       return Status::OK();
     }
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
@@ -575,10 +600,10 @@ static Status PadImpl(OpKernelContext* ctx,
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
           const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (pre_pad > 0) {
-            PadAxisConstant(axis_start - static_cast<size_t>(pre_pad), value, pre_pad);
+            PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), value, pre_pad);
           }
           if (post_pad > 0) {
-            PadAxisConstant(output, value, post_pad);
+            PadAxisConstant(sink, output, value, post_pad);
           }
           output += post_pad;
           align_skip = pre_pad;
@@ -590,10 +615,10 @@ static Status PadImpl(OpKernelContext* ctx,
           const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
           const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (pre_pad > 0) {
-            PadAxisConstant(axis_start - static_cast<ptrdiff_t>(pre_pad * inner_pitch), value, pre_pad * inner_pitch);
+            PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad * inner_pitch), value, pre_pad * inner_pitch);
           }
           if (post_pad > 0) {
-            PadAxisConstant(output, value, post_pad * inner_pitch);
+            PadAxisConstant(sink, output, value, post_pad * inner_pitch);
           }
           output += inner_pitch * post_pad;
           align_skip += inner_pitch * pre_pad;
@@ -611,24 +636,24 @@ static Status PadImpl(OpKernelContext* ctx,
           T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
-          SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
+          const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
             if (pre_pad > 0) {
-              PadAxisConstant(axis_start - static_cast<size_t>(pre_pad), *axis_start, pre_pad);
+              PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), *axis_start, pre_pad);
             }
             if (post_pad > 0) {
-              PadAxisConstant(output, *(output - 1), post_pad);
+              PadAxisConstant(sink, output, *(output - 1), post_pad);
             }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
             if (pads[inner_axis] > 0) {
-              PadAxis(axis_start - static_cast<size_t>(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+              PadAxis(sink, axis_start - static_cast<size_t>(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
                       onnxruntime::narrow<size_t>(pads[inner_axis]));
             }
             if (pads[inner_axis + data_rank] > 0) {
-              PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+              PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
                       onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             }
           }
@@ -642,11 +667,11 @@ static Status PadImpl(OpKernelContext* ctx,
           const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
           const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (pre_pad > 0) {
-            PadAxis(axis_start - static_cast<size_t>(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch,
+            PadAxis(sink, axis_start - static_cast<size_t>(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch,
                     pre_pad);
           }
           if (post_pad > 0) {
-            PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad);
+            PadAxis(sink, output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad);
           }
           output += inner_pitch * post_pad;
           align_skip += inner_pitch * pre_pad;
@@ -670,94 +695,94 @@ static Status PadImpl(OpKernelContext* ctx,
           if (inner_no_pad_size == 1) {
             if (mode == Mode::Reflect) {
               if (pre_pad > 0) {
-                PadInnermostAxis(axis_start - static_cast<size_t>(pre_pad),
+                PadInnermostAxis(sink, axis_start - static_cast<size_t>(pre_pad),
                                  axis_start + static_cast<size_t>(pre_pad), -1 /* inputDelta */, pre_pad);
               }
               if (post_pad > 0) {
-                PadInnermostAxis(output, output - 2, -1 /* inputDelta */, post_pad);
+                PadInnermostAxis(sink, output, output - 2, -1 /* inputDelta */, post_pad);
               }
             } else {
               if (pre_pad > 0) {
-                PadInnermostAxis(axis_start - static_cast<size_t>(pre_pad),
+                PadInnermostAxis(sink, axis_start - static_cast<size_t>(pre_pad),
                                  output - static_cast<size_t>(pre_pad), 1 /* inputDelta */, pre_pad);
               }
               if (post_pad > 0) {
-                PadInnermostAxis(output, axis_start, 1 /* inputDelta */, post_pad);
+                PadInnermostAxis(sink, output, axis_start, 1 /* inputDelta */, post_pad);
               }
             }
           } else {
             // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode.
             if (mode == Mode::Reflect) {
-              PadAxis(
-                  axis_start - static_cast<size_t>(pre_pad),
-                  axis_start + static_cast<size_t>(pre_pad),
-                  1,
-                  -ptrdiff_t(inner_no_pad_size * 2),
-                  inner_no_pad_size,
-                  onnxruntime::narrow<size_t>(pads[inner_axis]));
-              PadAxis(
-                  output,
-                  output - 2 * inner_no_pad_size,
-                  1,
-                  -ptrdiff_t(inner_no_pad_size * 2),
-                  inner_no_pad_size,
-                  onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
+              PadAxis(sink,
+                      axis_start - static_cast<size_t>(pre_pad),
+                      axis_start + static_cast<size_t>(pre_pad),
+                      1,
+                      -ptrdiff_t(inner_no_pad_size * 2),
+                      inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis]));
+              PadAxis(sink,
+                      output,
+                      output - 2 * inner_no_pad_size,
+                      1,
+                      -ptrdiff_t(inner_no_pad_size * 2),
+                      inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             } else {
-              PadAxis(
-                  axis_start - static_cast<size_t>(pre_pad),
-                  output - pads[inner_axis] * inner_no_pad_size,
-                  1,
-                  0,
-                  inner_no_pad_size,
-                  onnxruntime::narrow<size_t>(pads[inner_axis]));
-              PadAxis(
-                  output,
-                  axis_start,
-                  1,
-                  0,
-                  inner_no_pad_size,
-                  onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
+              PadAxis(sink,
+                      axis_start - static_cast<size_t>(pre_pad),
+                      output - pads[inner_axis] * inner_no_pad_size,
+                      1,
+                      0,
+                      inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis]));
+              PadAxis(sink,
+                      output,
+                      axis_start,
+                      1,
+                      0,
+                      inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             }
           }
           output += post_pad;
-          align_skip = onnxruntime::narrow<size_t>(pre_pad);
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
           T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
-          SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
-          SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (mode == Mode::Reflect) {
-            PadAxis(
-                axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
-                axis_start + static_cast<size_t>(pre_pad) * inner_pitch,
-                1,
-                -inner_pitch * 2,
-                inner_pitch,
-                pre_pad);
-            PadAxis(
-                output,
-                output - 2 * inner_pitch,
-                1,
-                -inner_pitch * 2,
-                inner_pitch,
-                post_pad);
+            PadAxis(sink,
+                    axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
+                    axis_start + static_cast<size_t>(pre_pad) * inner_pitch,
+                    1,
+                    -inner_pitch * 2,
+                    inner_pitch,
+                    pre_pad);
+            PadAxis(sink,
+                    output,
+                    output - 2 * inner_pitch,
+                    1,
+                    -inner_pitch * 2,
+                    inner_pitch,
+                    post_pad);
           } else {
-            PadAxis(
-                axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
-                output - static_cast<size_t>(pre_pad) * inner_pitch,
-                1,
-                0,
-                inner_pitch,
-                pre_pad);
-            PadAxis(
-                output,
-                axis_start,
-                1,
-                0,
-                inner_pitch,
-                post_pad);
+            PadAxis(sink,
+                    axis_start - static_cast<size_t>(pre_pad) * inner_pitch,
+                    output - static_cast<size_t>(pre_pad) * inner_pitch,
+                    1,
+                    0,
+                    inner_pitch,
+                    pre_pad);
+            PadAxis(sink,
+                    output,
+                    axis_start,
+                    1,
+                    0,
+                    inner_pitch,
+                    post_pad);
           }
           output += inner_pitch * post_pad;
           align_skip += inner_pitch * pre_pad;
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index c82f17012fd47..386f1e4e37ec0 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1152,51 +1152,51 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-// TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
-//   const std::vector<int64_t> input_shape{2, 6, 4};
-//   std::vector<float> input_data(2 * 6 * 4);
-//
-//   for (size_t i = 0; i < input_data.size(); ++i) {
-//     input_data[i] = static_cast<float>((i % 5) + 1);
-//   }
-//
-//   const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
-//   const float cv = 9.0f;
-//   const std::vector<int64_t> expected_shape{2, 9, 3};
-//
-//   std::vector<float> expected_output = {
-//       // a0 = 0
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       3.f, 4.f, 9.f,
-//       2.f, 3.f, 9.f,
-//       1.f, 2.f, 9.f,
-//       5.f, 1.f, 9.f,
-//       4.f, 5.f, 9.f,
-//       3.f, 4.f, 9.f,
-//
-//       // a0 = 1 (cropped original, fully padded slice)
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f,
-//       9.f, 9.f, 9.f};
-//
-//   ASSERT_EQ(2U * 9U * 3U, expected_output.size());
-//
-//   OpTester test("Pad", 18);
-//   test.AddInput<float>("data", input_shape, input_data);
-//   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-//   test.AddInput<float>("constant_value", {}, {cv});
-//   test.AddOutput<float>("output", expected_shape, expected_output);
-//   test.AddAttribute("mode", "constant");
-//   test.Run();
-// }
+TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
+  const std::vector<int64_t> input_shape{2, 6, 4};
+  std::vector<float> input_data(2 * 6 * 4);
+
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>((i % 5) + 1);
+  }
+
+  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
+  const float cv = 9.0f;
+  const std::vector<int64_t> expected_shape{2, 9, 3};
+
+  std::vector<float> expected_output = {
+      // a0 = 0
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      3.f, 4.f, 9.f,
+      2.f, 3.f, 9.f,
+      1.f, 2.f, 9.f,
+      5.f, 1.f, 9.f,
+      4.f, 5.f, 9.f,
+      3.f, 4.f, 9.f,
+
+      // a0 = 1 (cropped original, fully padded slice)
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f,
+      9.f, 9.f, 9.f};
+
+  ASSERT_EQ(2U * 9U * 3U, expected_output.size());
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddOutput<float>("output", expected_shape, expected_output);
+  test.AddAttribute("mode", "constant");
+  test.Run();
+}
 
 // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {

From ec57b378ceae276d3e0c293c45cddb2def2d53f9 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 19 Dec 2025 15:26:09 -0800
Subject: [PATCH 09/28] Add tests

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  |  11 +-
 .../test/providers/cpu/tensor/pad_test.cc     | 124 +++++++++++++++---
 2 files changed, 111 insertions(+), 24 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 73ceb080e7ffe..18c6b00cfa2a1 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -595,17 +595,22 @@ static Status PadImpl(OpKernelContext* ctx,
         output += align_skip;
         {
           T* axis_start = output;
-          output = input.CopyInnermostAxisSolitaryInnerStep(output);
+          // Compute the actual number of data elements to copy on the innermost axis (after cropping).
+          const size_t inner_extent = onnxruntime::narrow<size_t>(input_extents[inner_axis]);
+
+          // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent.
+          ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output));
 
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
           const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (pre_pad > 0) {
+            /// Pre - pad(innermost) retro - fill remains valid(write before row_start).
             PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), value, pre_pad);
           }
           if (post_pad > 0) {
-            PadAxisConstant(sink, output, value, post_pad);
+            PadAxisConstant(sink, axis_start + inner_extent, value, post_pad);
           }
-          output += post_pad;
+          output = axis_start + inner_extent + static_cast<size_t>(post_pad);
           align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 386f1e4e37ec0..3183086139fe8 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -763,10 +763,7 @@ edge
 
 // test handling of input with a 0 for a dimension
 TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold";
-  }
+  // TODO: Unskip Dml when fixed #41968513
 
   using T = TypeParam;
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D
@@ -774,7 +771,9 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   {1, 1},
                                   T(1),
                                   {2},
-                                  {T(1), T(1)});
+                                  {T(1), T(1)},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D empty pads
                                   {},
@@ -788,35 +787,45 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   {-1, 1},
                                   T(1),
                                   {0},
-                                  {});
+                                  {},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
                                   {1, 1, 1, 1},
                                   T(1),
                                   {4, 2},
-                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)});
+                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
                                   {},
                                   {1, 1, 1, 1},
                                   T(1),
                                   {2, 4},
-                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)});
+                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
                                   {},
                                   {1, 0, 1, 0},  // empty pads for dim 1
                                   T(1),
                                   {2, 2},
-                                  {T(1), T(1), T(1), T(1)});
+                                  {T(1), T(1), T(1), T(1)},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0, 2},  // 3D
                                   {},
                                   {0, 1, 0, 0, 1, 0},
                                   T(1),
                                   {2, 2, 2},
-                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)});
+                                  {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
+                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  {kDmlExecutionProvider});
 }
 // Added output shape verification b/w the output shape generated by operator specific ONNX inference and
 // the output shape generated by operator specific ORT implementation. After adding this verification,
@@ -1099,6 +1108,81 @@ TEST(PadOpTest, ConstantPadNegativeAxes) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) {
+  const std::vector<int64_t> input_shape{2, 6, 4};
+  std::vector<float> input_data(2 * 6 * 4);
+
+  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>((i % 5) + 1);
+
+  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
+  const float cv = 9.0f;
+  // starting from input shape {2,6,4}
+  // after padding:      {2+1+-1,6+3-0,4-2_1}  => {2,9,3}
+  const std::vector<int64_t> expected_shape{2, 9, 3};
+
+  const std::vector<float> expected_data = {
+      // sample 0
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+
+      // sample 1
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F,
+      3.0F, 4.0F, 9.0F,
+      2.0F, 3.0F, 9.0F,
+      1.0F, 2.0F, 9.0F,
+      5.0F, 1.0F, 9.0F,
+      4.0F, 5.0F, 9.0F,
+      3.0F, 4.0F, 9.0F};
+
+  OpTester test("Pad", 13);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddOutput<float>("output", expected_shape, expected_data);
+  test.AddAttribute("mode", "constant");
+  test.Run();
+}
+
+TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {
+  OpTester test("Pad", 18);
+  test.AddAttribute("mode", "constant");
+
+  const std::vector<int64_t> input_shape = {1, 1, 4, 4};
+
+  const std::vector<float> input_data = {
+      1.0f, 2.0f, 3.0f, 4.0f,
+      5.0f, 6.0f, 7.0f, 8.0f,
+      9.0f, 10.0f, 11.0f, 12.0f,
+      13.0f, 14.0f, 15.0f, 16.0f};
+
+  // Calculate expected shape:
+  // dim0: 1 + 0 + 0 = 1
+  // dim1: 1 + 0 + 0 = 1
+  // dim2: 4 + -4 + 4 = 4
+  // dim3: 4 + 0 + 0 = 4
+  const std::vector<int64_t> expected_shape = {1, 1, 4, 4};
+  const std::vector<float> expected_data = {
+      0.f, 0.f, 0.f, 0.f,
+      0.f, 0.f, 0.f, 0.f,
+      0.f, 0.f, 0.f, 0.f,
+      0.f, 0.f, 0.f, 0.f};
+
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0});
+  test.AddInput<float>("constant_value", {}, {0.0f});
+  test.AddOutput<float>("output", expected_shape, expected_data);
+  test.Run();
+}
+
 TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   OpTester test("Pad", 18);
   test.AddAttribute("mode", "constant");
@@ -1164,28 +1248,26 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
   const float cv = 9.0f;
   const std::vector<int64_t> expected_shape{2, 9, 3};
 
-  std::vector<float> expected_output = {
-      // a0 = 0
+  const std::vector<float> expected_output{
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
-      3.f, 4.f, 9.f,
-      2.f, 3.f, 9.f,
-      1.f, 2.f, 9.f,
-      5.f, 1.f, 9.f,
-      4.f, 5.f, 9.f,
-      3.f, 4.f, 9.f,
-
-      // a0 = 1 (cropped original, fully padded slice)
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
+
       9.f, 9.f, 9.f,
       9.f, 9.f, 9.f,
-      9.f, 9.f, 9.f};
+      9.f, 9.f, 9.f,
+      3.f, 4.f, 9.f,
+      2.f, 3.f, 9.f,
+      1.f, 2.f, 9.f,
+      5.f, 1.f, 9.f,
+      4.f, 5.f, 9.f,
+      3.f, 4.f, 9.f};
 
   ASSERT_EQ(2U * 9U * 3U, expected_output.size());
 

From 96a60451ba582eb645e499844dcfe397c6196875 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 22 Dec 2025 14:07:17 -0800
Subject: [PATCH 10/28] All Edge tests fail

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 67 +++++++++++++++----
 .../test/providers/cpu/tensor/pad_test.cc     | 23 +++++++
 2 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 18c6b00cfa2a1..dfedcc24b83c6 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -641,29 +641,72 @@ static Status PadImpl(OpKernelContext* ctx,
           T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
-          const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
+          // Edge is invalid if there is no data to duplicate on this axis.
+          // however, input extents have been validated to be non-zero above.
+          const size_t inner_extent = onnxruntime::narrow<size_t>(input_extents[inner_axis]);
+
+          const SafeInt<size_t> inner_pitch = output_pitches[inner_axis];
+          // Row bounds in linear element space:
+          // axis_start points to the beginning of the copied row (returned above as 'output' after alignSkip).
+          T* axis_end = axis_start + inner_extent * inner_no_pad_size;  // one-past-last element of copied data
+
+          const T* first_elem = axis_start;
+          const T* last_elem = axis_end - 1;
           if (inner_no_pad_size == 1) {
+            const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+            const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
             if (pre_pad > 0) {
-              PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), *axis_start, pre_pad);
+              PadAxisConstant(sink, axis_start - narrow<ptrdiff_t>(pre_pad), *first_elem, pre_pad);
+              align_skip = align_skip + inner_pitch * pre_pad;
             }
             if (post_pad > 0) {
-              PadAxisConstant(sink, output, *(output - 1), post_pad);
+              PadAxisConstant(sink, axis_end, *last_elem, post_pad);
             }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
-            if (pads[inner_axis] > 0) {
-              PadAxis(sink, axis_start - static_cast<size_t>(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
-                      onnxruntime::narrow<size_t>(pads[inner_axis]));
+
+            // Source blocks inside the copied region
+            T* first_block_src = axis_start;                                      // first block
+            T* last_block_src = axis_end - narrow<ptrdiff_t>(inner_no_pad_size);  // last block
+
+            const SafeInt<size_t> pre_pad = pads[inner_axis];
+            const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
+            const SafeInt<size_t> block_size = inner_no_pad_size;
+
+            // PadAxis writes 'block_size' elements per block, and after each block sets input += input_pitch.
+            // Using input_delta=1 to walk within the block, input_pitch=-(inner_no_pad_size)
+            // to reset input to the block start.
+            if (pre_pad > 0) {
+              T* dst_pre = first_block_src - static_cast<size_t>(pre_pad * block_size);
+              PadAxis(sink, dst_pre,
+                      first_block_src,
+                      1,                             /*input_delta=*/
+                      -ptrdiff_t(inner_no_pad_size), /* input_pitch */
+                      block_size,
+                      pre_pad);
+
+              // Roll align_skip forward so the outer loop's `output += align_skip` won't skip these rows again.
+              align_skip = SafeInt<size_t>(align_skip) + inner_pitch * pre_pad;
             }
-            if (pads[inner_axis + data_rank] > 0) {
-              PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
-                      onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
+
+            if (post_pad > 0) {
+              // 2) POST-PAD: duplicate the last block forwards after row_end
+              T* dst_post = axis_end;  // start immediately after the copied data
+              PadAxis(sink,
+                      dst_post,
+                      last_block_src,
+                      1,                             /*input_delta=*/
+                      -ptrdiff_t(inner_no_pad_size), /* input_pitch */
+                      block_size,
+                      post_pad);
+
+              // Advance the linear write cursor past the post-pad area
+              output = dst_post + narrow<ptrdiff_t>(post_pad * inner_no_pad_size);
+            } else {
+              output = axis_end;
             }
           }
-          output += post_pad;
-          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 3183086139fe8..28b6e001e0c44 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1183,6 +1183,29 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {
   test.Run();
 }
 
+// GH Issue: https://github.com/microsoft/onnxruntime/issues/13332
+TEST(PadOpTest, ConstantPadDefaultValueMixedPads_GH_13332) {
+  OpTester test("Pad", 18);
+  test.AddAttribute("mode", "constant");
+
+  const std::vector<int64_t> input_shape{1, 1, 4, 4};
+  const std::vector<float> input_data{
+      1.0f, 2.0f, 3.0f, 4.0f,
+      5.0f, 6.0f, 7.0f, 8.0f,
+      9.0f, 10.0f, 11.0f, 12.0f,
+      13.0f, 14.0f, 15.0f, 16.0f};
+
+  const std::vector<int64_t> pads{0, 0, 4, 1, 0, 0, -4, -1};
+  const std::vector<int64_t> output_shape{1, 1, 4, 4};
+  const std::vector<float> expected_output(16, 0.0f);
+
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", output_shape, expected_output);
+  test.Run();
+}
+
+// Internally reported
 TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   OpTester test("Pad", 18);
   test.AddAttribute("mode", "constant");

From a2570d0199c71f47eb2723ebb894eeccd711253c Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 22 Dec 2025 14:38:07 -0800
Subject: [PATCH 11/28] Revert "All Edge tests fail"

This reverts commit 96a60451ba582eb645e499844dcfe397c6196875.
---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 67 ++++---------------
 .../test/providers/cpu/tensor/pad_test.cc     | 23 -------
 2 files changed, 12 insertions(+), 78 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index dfedcc24b83c6..18c6b00cfa2a1 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -641,72 +641,29 @@ static Status PadImpl(OpKernelContext* ctx,
           T* axis_start = output;
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
-          // Edge is invalid if there is no data to duplicate on this axis.
-          // however, input extents have been validated to be non-zero above.
-          const size_t inner_extent = onnxruntime::narrow<size_t>(input_extents[inner_axis]);
-
-          const SafeInt<size_t> inner_pitch = output_pitches[inner_axis];
-          // Row bounds in linear element space:
-          // axis_start points to the beginning of the copied row (returned above as 'output' after alignSkip).
-          T* axis_end = axis_start + inner_extent * inner_no_pad_size;  // one-past-last element of copied data
-
-          const T* first_elem = axis_start;
-          const T* last_elem = axis_end - 1;
+          const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
-            const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
-            const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
             if (pre_pad > 0) {
-              PadAxisConstant(sink, axis_start - narrow<ptrdiff_t>(pre_pad), *first_elem, pre_pad);
-              align_skip = align_skip + inner_pitch * pre_pad;
+              PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), *axis_start, pre_pad);
             }
             if (post_pad > 0) {
-              PadAxisConstant(sink, axis_end, *last_elem, post_pad);
+              PadAxisConstant(sink, output, *(output - 1), post_pad);
             }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
-
-            // Source blocks inside the copied region
-            T* first_block_src = axis_start;                                      // first block
-            T* last_block_src = axis_end - narrow<ptrdiff_t>(inner_no_pad_size);  // last block
-
-            const SafeInt<size_t> pre_pad = pads[inner_axis];
-            const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
-            const SafeInt<size_t> block_size = inner_no_pad_size;
-
-            // PadAxis writes 'block_size' elements per block, and after each block sets input += input_pitch.
-            // Using input_delta=1 to walk within the block, input_pitch=-(inner_no_pad_size)
-            // to reset input to the block start.
-            if (pre_pad > 0) {
-              T* dst_pre = first_block_src - static_cast<size_t>(pre_pad * block_size);
-              PadAxis(sink, dst_pre,
-                      first_block_src,
-                      1,                             /*input_delta=*/
-                      -ptrdiff_t(inner_no_pad_size), /* input_pitch */
-                      block_size,
-                      pre_pad);
-
-              // Roll align_skip forward so the outer loop's `output += align_skip` won't skip these rows again.
-              align_skip = SafeInt<size_t>(align_skip) + inner_pitch * pre_pad;
+            if (pads[inner_axis] > 0) {
+              PadAxis(sink, axis_start - static_cast<size_t>(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis]));
             }
-
-            if (post_pad > 0) {
-              // 2) POST-PAD: duplicate the last block forwards after row_end
-              T* dst_post = axis_end;  // start immediately after the copied data
-              PadAxis(sink,
-                      dst_post,
-                      last_block_src,
-                      1,                             /*input_delta=*/
-                      -ptrdiff_t(inner_no_pad_size), /* input_pitch */
-                      block_size,
-                      post_pad);
-
-              // Advance the linear write cursor past the post-pad area
-              output = dst_post + narrow<ptrdiff_t>(post_pad * inner_no_pad_size);
-            } else {
-              output = axis_end;
+            if (pads[inner_axis + data_rank] > 0) {
+              PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size,
+                      onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             }
           }
+          output += post_pad;
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 28b6e001e0c44..3183086139fe8 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1183,29 +1183,6 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {
   test.Run();
 }
 
-// GH Issue: https://github.com/microsoft/onnxruntime/issues/13332
-TEST(PadOpTest, ConstantPadDefaultValueMixedPads_GH_13332) {
-  OpTester test("Pad", 18);
-  test.AddAttribute("mode", "constant");
-
-  const std::vector<int64_t> input_shape{1, 1, 4, 4};
-  const std::vector<float> input_data{
-      1.0f, 2.0f, 3.0f, 4.0f,
-      5.0f, 6.0f, 7.0f, 8.0f,
-      9.0f, 10.0f, 11.0f, 12.0f,
-      13.0f, 14.0f, 15.0f, 16.0f};
-
-  const std::vector<int64_t> pads{0, 0, 4, 1, 0, 0, -4, -1};
-  const std::vector<int64_t> output_shape{1, 1, 4, 4};
-  const std::vector<float> expected_output(16, 0.0f);
-
-  test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-  test.AddOutput<float>("output", output_shape, expected_output);
-  test.Run();
-}
-
-// Internally reported
 TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   OpTester test("Pad", 18);
   test.AddAttribute("mode", "constant");

From 5a6a9798b6ddd1d9b5b4f03e96120b3c31bf69de Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 22 Dec 2025 15:50:08 -0800
Subject: [PATCH 12/28] Add missing Edge tests

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  |   4 +-
 .../test/providers/cpu/tensor/pad_test.cc     | 136 ++++++++++++++++++
 2 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 18c6b00cfa2a1..d2466e644d70f 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -638,7 +638,9 @@ static Status PadImpl(OpKernelContext* ctx,
       while (input_counters) {
         output += align_skip;
         {
+          const SafeInt<size_t> inner_extent = input_extents[inner_axis];
           T* axis_start = output;
+          T* axis_end = axis_start + onnxruntime::narrow<ptrdiff_t>(inner_extent);
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
 
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
@@ -662,7 +664,7 @@ static Status PadImpl(OpKernelContext* ctx,
                       onnxruntime::narrow<size_t>(pads[inner_axis + data_rank]));
             }
           }
-          output += post_pad;
+          output = axis_end + static_cast<size_t>(post_pad);
           align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 3183086139fe8..15e44482ec89f 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1280,6 +1280,142 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
   test.Run();
 }
 
+TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) {
+  const std::vector<int64_t> input_shape{2, 3, 5};
+
+  std::vector<float> input_data(2 * 3 * 5);
+  std::iota(input_data.begin(), input_data.end(), 1.0f);
+
+  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
+  const float cv = 9.0f;
+  const std::vector<int64_t> expected_shape{2, 6, 4};
+
+  const std::vector<float> expected_output{
+      // depth 0
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+
+      // depth 1
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      9.0F, 9.0F, 9.0F, 9.0F,
+      3.0F, 4.0F, 5.0F, 9.0F,
+      8.0F, 9.0F, 10.0F, 9.0F,
+      13.0F, 14.0F, 15.0F, 9.0F};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddOutput<float>("output", expected_shape, expected_output);
+  test.AddAttribute("mode", "constant");
+  test.Run();
+}
+
+TEST(PadOpTest, EdgeMode_ZeroExtentFails) {
+  std::vector<int64_t> input_shape = {4};
+  // Generate input as above
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<int64_t> pads = {-4, 3};
+
+  const std::vector<int64_t> expected_shape{3};
+  const std::vector<float> expected_data = {1.f, 2.f, 3.f};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", expected_shape, expected_data);
+  test.AddAttribute("mode", "edge");
+  test.Run(OpTester::ExpectResult::kExpectFailure);
+}
+
+TEST(PadOpTest, EdgeMode_ExtentOne_Valid) {
+  const std::vector<int64_t> input_shape{4};
+  const std::vector<float> input_data{1.f, 1.f, 1.f, 1.f};
+  const std::vector<int64_t> pads{-3, 3};
+  const std::vector<int64_t> expected_shape{4};
+  const std::vector<float> expected_output{1.f, 1.f, 1.f, 1.f};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", expected_shape, expected_output);
+  test.AddAttribute("mode", "edge");
+  test.Run();
+}
+
+// TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
+//   // This covers the else branch of inner_no_pad_size != 1
+//   const std::vector<int64_t> input_shape{2, 3, 2, 4};
+//   std::vector<float> input_data(2 * 3 * 2 * 4);
+//   std::iota(input_data.begin(), input_data.end(), 1.f);
+//
+//   const std::vector<float> expected_shape{2, 3, 8};
+
+TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
+  // Shape chosen to force FlattenInnerShape():
+  // innermost dims {2,4} -> flattened to 8
+  const std::vector<int64_t> input_shape = {2, 3, 2, 4};
+
+  std::vector<float> input_data(2 * 3 * 2 * 4);
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = static_cast<float>(i);
+  }
+
+  // ONNX pad order: [b0,b1,b2,b3,e0,e1,e2,e3]
+  // The below shape will cause flattening the last two input dims to 8
+  const std::vector<int64_t> pads = {
+      0, 0, 0, 0,  // begin
+      0, 0, 0, 1   // end pad only on last original axis
+  };
+
+  // Expected shape:
+  // flattened axis grows from 8 -> 12
+  const std::vector<int64_t> expected_shape = {2, 3, 2, 5};
+
+  std::vector<float> expected_output = {
+      // [0][0][0]
+      0.f, 1.f, 2.f, 3.f, 3.f,
+      // [0][0][1]
+      4.f, 5.f, 6.f, 7.f, 7.f,
+
+      // [0][1][0]
+      8.f, 9.f, 10.f, 11.f, 11.f,
+      // [0][1][1]
+      12.f, 13.f, 14.f, 15.f, 15.f,
+
+      // [0][2][0]
+      16.f, 17.f, 18.f, 19.f, 19.f,
+      // [0][2][1]
+      20.f, 21.f, 22.f, 23.f, 23.f,
+
+      // [1][0][0]
+      24.f, 25.f, 26.f, 27.f, 27.f,
+      // [1][0][1]
+      28.f, 29.f, 30.f, 31.f, 31.f,
+
+      // [1][1][0]
+      32.f, 33.f, 34.f, 35.f, 35.f,
+      // [1][1][1]
+      36.f, 37.f, 38.f, 39.f, 39.f,
+
+      // [1][2][0]
+      40.f, 41.f, 42.f, 43.f, 43.f,
+      // [1][2][1]
+      44.f, 45.f, 46.f, 47.f, 47.f};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", expected_shape, expected_output);
+  test.AddAttribute("mode", "edge");
+  test.Run();
+}
+
 // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
 //  using T = float;

From 635c6f8f2af5944a74da6302c39fd8abda3e67d8 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 22 Dec 2025 18:25:57 -0800
Subject: [PATCH 13/28] Fix Reflect and add tests for Wrap

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 12 +++++
 .../test/providers/cpu/tensor/pad_test.cc     | 50 ++++++++++++-------
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index d2466e644d70f..d4692054683bd 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -522,6 +522,18 @@ static Status PadImpl(OpKernelContext* ctx,
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
   }
 
+  if (mode == Mode::Reflect) {
+    for (size_t i = 0; i < new_dims_count; ++i) {
+      const int64_t extent = input_extents[i];  // length after slicing
+      const bool reflect_on_axis =
+          (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0);
+      if (reflect_on_axis && extent < 2) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Pad reflect requires axis length >= 2 after slicing in reflect mode");
+      }
+    }
+  }
+
   // Compute true output dimensions
   for (size_t i = 0; i < data_rank; i++) {
     output_dims[i] += SafeInt<int64_t>(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank];
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 15e44482ec89f..6c81779dc34d5 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1348,14 +1348,6 @@ TEST(PadOpTest, EdgeMode_ExtentOne_Valid) {
   test.Run();
 }
 
-// TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
-//   // This covers the else branch of inner_no_pad_size != 1
-//   const std::vector<int64_t> input_shape{2, 3, 2, 4};
-//   std::vector<float> input_data(2 * 3 * 2 * 4);
-//   std::iota(input_data.begin(), input_data.end(), 1.f);
-//
-//   const std::vector<float> expected_shape{2, 3, 8};
-
 TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
   // Shape chosen to force FlattenInnerShape():
   // innermost dims {2,4} -> flattened to 8
@@ -1417,16 +1409,38 @@ TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
 }
 
 // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828
-// TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
-//  using T = float;
-//  RunAllOpsetAllDomainPadTests<T>({4},
-//                                  {T(1), T(2), T(3), T(4)},
-//                                  {-3, 3},
-//                                  T(0),
-//                                  {4},
-//                                  {4, 0, 0, 0},
-//                                  "reflect");
-//}
+TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
+  const std::vector<int64_t> input_shape = {4};
+  const std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  const std::vector<int64_t> pads = {-3, 3};
+  const std::vector<int64_t> expected_shape{4};
+  const std::vector<float> expected_data = {2.f, 3.f, 4.f, 1.f};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", expected_shape, expected_data);
+  test.AddAttribute("mode", "reflect");
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "Pad reflect requires axis length >= 2 after slicing in reflect mode");
+}
+
+TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
+  const std::vector<int64_t> input_shape = {4};
+  const std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f};
+  const std::vector<int64_t> pads = {-3, 3};
+
+  const std::vector<int64_t> expected_shape{4};
+  // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4]
+  const std::vector<float> expected_data = {4, 4, 4, 4};
+
+  OpTester test("Pad", 18);
+  test.AddInput<float>("data", input_shape, input_data);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddOutput<float>("output", expected_shape, expected_data);
+  test.AddAttribute("mode", "wrap");
+  test.Run();
+}
 
 }  // namespace test
 }  // namespace onnxruntime

From 096d5393fe1b2baff7093d5b9a47a4dc8015aea3 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 22 Dec 2025 18:54:20 -0800
Subject: [PATCH 14/28] Test pass on CPU

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 29 +++++++++++--------
 .../test/providers/cpu/tensor/pad_test.cc     |  2 +-
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index d4692054683bd..2ebeb25aaeffb 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -522,18 +522,6 @@ static Status PadImpl(OpKernelContext* ctx,
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
   }
 
-  if (mode == Mode::Reflect) {
-    for (size_t i = 0; i < new_dims_count; ++i) {
-      const int64_t extent = input_extents[i];  // length after slicing
-      const bool reflect_on_axis =
-          (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0);
-      if (reflect_on_axis && extent < 2) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                               "Pad reflect requires axis length >= 2 after slicing in reflect mode");
-      }
-    }
-  }
-
   // Compute true output dimensions
   for (size_t i = 0; i < data_rank; i++) {
     output_dims[i] += SafeInt<int64_t>(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank];
@@ -545,6 +533,23 @@ static Status PadImpl(OpKernelContext* ctx,
     return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value);
   }
 
+  // Special case for Reflect mode: ensure all extents >= 2 after slicing
+  // otherwise reflection is not possible. Matches numpy behavior as ONNX only
+  // implies that this would be wrong as the start and end positions should be distinct
+  // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
+  if (mode == Mode::Reflect) {
+    for (size_t i = 0; i < new_dims_count; ++i) {
+      const int64_t extent = input_extents[i];  // length after slicing
+      const bool reflect_on_axis =
+          (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0);
+      if (reflect_on_axis && extent < 2) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Pad reflect requires axis length >= 2 after slicing. Input shape:",
+                               orig_input_shape);
+      }
+    }
+  }
+
   // output_shape needs to keep original.
   TensorShape output_shape(output_dims);
   auto& output_tensor = *ctx->Output(0, output_shape);
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 6c81779dc34d5..72ec0f7506401 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1422,7 +1422,7 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "reflect");
   test.Run(OpTester::ExpectResult::kExpectFailure,
-           "Pad reflect requires axis length >= 2 after slicing in reflect mode");
+           "Pad reflect requires axis length >= 2 after slicing");
 }
 
 TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {

From 0c521ae5d906aaf70d172cb450c6a14e3c38a849 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 23 Dec 2025 15:15:35 -0800
Subject: [PATCH 15/28] Add some cases handling for CUDA

---
 onnxruntime/core/providers/cuda/cuda_utils.cu |  1 +
 onnxruntime/core/providers/cuda/tensor/pad.cc | 62 ++++++++++++++++---
 .../test/providers/cpu/tensor/pad_test.cc     | 30 ++++-----
 3 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu
index 934425656e3c9..59f2deda1805e 100644
--- a/onnxruntime/core/providers/cuda/cuda_utils.cu
+++ b/onnxruntime/core/providers/cuda/cuda_utils.cu
@@ -81,6 +81,7 @@ template std::unique_ptr<IConstantBuffer<Float8E5M2>> CreateConstantOnes<Float8E
   template void Fill<T>(cudaStream_t stream, T * output, T value, int64_t count);
 
 SPECIALIZED_FILL(int8_t)
+SPECIALIZED_FILL(bool)
 SPECIALIZED_FILL(int16_t)
 SPECIALIZED_FILL(int32_t)
 SPECIALIZED_FILL(int64_t)
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index bdd6567d2ef34..e2e8842851c7c 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -94,7 +94,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   typedef typename ToCudaType<T>::MappedType CudaT;
   const auto& input_tensor = *ctx->Input<Tensor>(0);
   auto const& input_shape = input_tensor.Shape();
-  int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
+  const size_t dimension_count = input_shape.NumDimensions();
 
   const PadsVector* p_pads = &pads_;
   const PadsVector* p_slices = &slices_;
@@ -134,15 +134,41 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   TArray<int64_t> input_strides(input_pitches);
 
   auto output_dims(input_shape.AsShapeVector());
-  ORT_ENFORCE(static_cast<size_t>(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values");
+  ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values");
 
   // Calculate output dimensions, and handle any negative padding
   TArray<int64_t> lower_pads(dimension_count);
   TArray<int64_t> upper_pads(dimension_count);
-  for (auto i = 0; i < dimension_count; i++) {
-    lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
-    upper_pads[i] = (*p_pads)[static_cast<int64_t>(i) + dimension_count] + (*p_slices)[static_cast<int64_t>(i) + dimension_count];
-    output_dims[i] += lower_pads[i] + upper_pads[i];
+  for (size_t i = 0; i < dimension_count; i++) {
+    lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
+    upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
+    output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
+  }
+
+  TensorShapeVector input_extents;
+  input_extents.reserve(dimension_count);
+  for (size_t i = 0; i < dimension_count; i++) {
+    int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +
+                                           (*p_slices)[i] + (*p_slices)[i + dimension_count],
+                                       0U);
+    input_extents.push_back(extent);
+  }
+
+  // Special case for Reflect mode: ensure all extents >= 2 after slicing
+  // otherwise reflection is not possible. Matches numpy behavior as ONNX only
+  // implies that this would be wrong as the start and end positions should be distinct
+  // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
+  if (mode_ == Mode::Reflect) {
+    for (size_t i = 0; i < dimension_count; ++i) {
+      const int64_t extent = input_extents[i];  // length after slicing
+      const bool reflect_on_axis =
+          (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0;
+      if (reflect_on_axis && extent < 2) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Pad reflect requires axis length >= 2 after slicing. Input shape:",
+                               input_shape);
+      }
+    }
   }
 
   TensorShape output_shape(output_dims);
@@ -154,6 +180,28 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   auto& output_tensor = *ctx->Output(0, output_shape);
 
+  // Early constant-fill: if any input extent is zero, no data to copy
+  // only padding if any
+  bool no_data_to_copy = false;
+  for (size_t i = 0; i < input_extents.size(); ++i) {
+    if (input_extents[i] == 0) {
+      no_data_to_copy = true;
+      break;
+    }
+  }
+
+  if (no_data_to_copy) {
+    if (mode_ == Mode::Constant) {
+      cuda::Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableDataRaw()),
+                        value,
+                        input_shape.Size());
+      return Status::OK();
+    }
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
+  }
+
+  // Case of all pads and slices being zero: just copy input to output
   if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
       std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
       output_shape.Size() > 0) {
@@ -164,7 +212,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     return Status::OK();
   }
 
-  if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
+  if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) {
     // If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)
 
     // NCHW input
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 72ec0f7506401..a7a54caea2fe8 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1145,8 +1145,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) {
 
   OpTester test("Pad", 13);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
+  test.AddInput<float>("constant_value", {}, {cv}, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "constant");
   test.Run();
@@ -1177,8 +1177,8 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {
       0.f, 0.f, 0.f, 0.f};
 
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0});
-  test.AddInput<float>("constant_value", {}, {0.0f});
+  test.AddInput<int64_t>("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}, true);
+  test.AddInput<float>("constant_value", {}, {0.0f}, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.Run();
 }
@@ -1228,8 +1228,8 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
   std::vector<float> output_data;  // empty now
 
   test.AddInput<float>("data", input_shape, input_span);
-  test.AddInput<int64_t>("pads", pads_shape, pads);
-  test.AddInput<float>("value", {}, {100.f});
+  test.AddInput<int64_t>("pads", pads_shape, pads, true);
+  test.AddInput<float>("value", {}, {100.f}, true);
 
   // Omit Axis input
   test.AddOutput<float>("output", output_shape, output_data);
@@ -1273,8 +1273,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
+  test.AddInput<float>("constant_value", {}, {cv}, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "constant");
   test.Run();
@@ -1309,8 +1309,8 @@ TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
-  test.AddInput<float>("constant_value", {}, {cv});
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
+  test.AddInput<float>("constant_value", {}, {cv}, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "constant");
   test.Run();
@@ -1327,7 +1327,7 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "edge");
   test.Run(OpTester::ExpectResult::kExpectFailure);
@@ -1342,7 +1342,7 @@ TEST(PadOpTest, EdgeMode_ExtentOne_Valid) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "edge");
   test.Run();
@@ -1402,7 +1402,7 @@ TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "edge");
   test.Run();
@@ -1418,7 +1418,7 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "reflect");
   test.Run(OpTester::ExpectResult::kExpectFailure,
@@ -1436,7 +1436,7 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
 
   OpTester test("Pad", 18);
   test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads);
+  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "wrap");
   test.Run();

From b78cfe71a567763bf05d3c8200a23dafe207a802 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 23 Dec 2025 15:34:26 -0800
Subject: [PATCH 16/28] Clamp output dimensions, early exist on zero output

---
 onnxruntime/core/providers/cuda/tensor/pad.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index e2e8842851c7c..a841d93d1403e 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -142,7 +142,13 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   for (size_t i = 0; i < dimension_count; i++) {
     lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
     upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
-    output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
+    output_dims[i] += std::max<int64_t>(0, SafeInt<int64_t>(lower_pads[i]) + upper_pads[i]);
+  }
+
+  TensorShape output_shape(output_dims);
+  if (output_shape.Size() == 0) {
+    // No elements to output
+    return Status::OK();
   }
 
   TensorShapeVector input_extents;
@@ -171,8 +177,6 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     }
   }
 
-  TensorShape output_shape(output_dims);
-
   // special case when there is a dim value of 0 in the shape. behavior depends on mode
   if (input_shape.Size() == 0) {
     ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));

From f3e68f6a6387878798634288fbb3f7722ff5e907 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 23 Dec 2025 16:02:44 -0800
Subject: [PATCH 17/28] Produce outut before early exist

---
 onnxruntime/core/providers/cuda/tensor/pad.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index a841d93d1403e..f82d19e9732d6 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -142,13 +142,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   for (size_t i = 0; i < dimension_count; i++) {
     lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
     upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
-    output_dims[i] += std::max<int64_t>(0, SafeInt<int64_t>(lower_pads[i]) + upper_pads[i]);
-  }
-
-  TensorShape output_shape(output_dims);
-  if (output_shape.Size() == 0) {
-    // No elements to output
-    return Status::OK();
+    output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
   }
 
   TensorShapeVector input_extents;
@@ -177,12 +171,18 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     }
   }
 
+  TensorShape output_shape(output_dims);
+
   // special case when there is a dim value of 0 in the shape. behavior depends on mode
   if (input_shape.Size() == 0) {
     ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
   }
 
   auto& output_tensor = *ctx->Output(0, output_shape);
+  if (output_shape.Size() == 0) {
+    // No elements to output
+    return Status::OK();
+  }
 
   // Early constant-fill: if any input extent is zero, no data to copy
   // only padding if any

From 5ef84e2d5d52876005b7c475a1072bd02d432553 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 23 Dec 2025 16:27:55 -0800
Subject: [PATCH 18/28] Wrap is not supported on CUDA. Wrap test must be ver 18

---
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index a7a54caea2fe8..c1f16317ef8e7 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1434,7 +1434,7 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
   // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4]
   const std::vector<float> expected_data = {4, 4, 4, 4};
 
-  OpTester test("Pad", 18);
+  OpTester test("Pad", 19);  // CUDA registers only up to 18 and does not impl wrap mode
   test.AddInput<float>("data", input_shape, input_data);
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);

From 8319d476d91ec85f60208a801827ce9c9dc79b17 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 25 Dec 2025 14:54:06 -0800
Subject: [PATCH 19/28] Still have CUDA failures, not clear why cudaMemset is
 not working as expected. [  FAILED  ]
 PadOpTest/0.Pad_Constant_DimWithZeroInput, where TypeParam = float [  FAILED 
 ] PadOpTest/1.Pad_Constant_DimWithZeroInput, where TypeParam = double

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  | 78 +++++++++----------
 onnxruntime/core/providers/cuda/tensor/pad.cc | 78 ++++++++++---------
 onnxruntime/core/providers/cuda/tensor/pad.h  |  1 +
 .../test/providers/cpu/tensor/pad_test.cc     |  7 +-
 4 files changed, 83 insertions(+), 81 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 2ebeb25aaeffb..b87e0d4b545f5 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -505,11 +505,11 @@ static Status PadImpl(OpKernelContext* ctx,
 
   TensorShapeVector reshaped_output_dims = reshaped_input_dims;
   TensorShapeVector input_starts;
-  TensorShapeVector input_extents;
+  TensorShapeVector effective_input_extents;
 
   // Calculate reshaped output dimensions, and handle any negative padding
   input_starts.reserve(new_dims_count);
-  input_extents.reserve(new_dims_count);
+  effective_input_extents.reserve(new_dims_count);
   for (size_t i = 0; i < new_dims_count; i++) {
     // Starts for every dimension. If slice is negative, we need to start further in, handled by the SliceIterator
     input_starts.push_back(-1 * reshaped_slice[i]);
@@ -517,7 +517,7 @@ static Status PadImpl(OpKernelContext* ctx,
     int64_t extent = std::max<int64_t>(SafeInt<int64_t>(reshaped_input_dims[i]) +
                                            reshaped_slice[i] + reshaped_slice[i + new_dims_count],
                                        0U);
-    input_extents.push_back(extent);
+    effective_input_extents.push_back(extent);
     reshaped_output_dims[i] += SafeInt<int64_t>(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] +
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
   }
@@ -527,29 +527,13 @@ static Status PadImpl(OpKernelContext* ctx,
     output_dims[i] += SafeInt<int64_t>(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank];
   }
 
-  // special case an input with one or more dim values of 0. edge case that is easier to handle
-  // separately than to complicate all the code for normal usage.
+  // If the input is empty, but output shape may not be, need padding only
+  // this is expected for constant mode only, otherwise the output is empty
+  // no error
   if (orig_input_shape.Size() == 0) {
     return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value);
   }
 
-  // Special case for Reflect mode: ensure all extents >= 2 after slicing
-  // otherwise reflection is not possible. Matches numpy behavior as ONNX only
-  // implies that this would be wrong as the start and end positions should be distinct
-  // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
-  if (mode == Mode::Reflect) {
-    for (size_t i = 0; i < new_dims_count; ++i) {
-      const int64_t extent = input_extents[i];  // length after slicing
-      const bool reflect_on_axis =
-          (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0);
-      if (reflect_on_axis && extent < 2) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                               "Pad reflect requires axis length >= 2 after slicing. Input shape:",
-                               orig_input_shape);
-      }
-    }
-  }
-
   // output_shape needs to keep original.
   TensorShape output_shape(output_dims);
   auto& output_tensor = *ctx->Output(0, output_shape);
@@ -559,17 +543,12 @@ static Status PadImpl(OpKernelContext* ctx,
   auto* output_end = output + static_cast<size_t>(total_output_elems);
   OutputSink<T> sink(output, output_end);
 
-  // Early constant-fill: if any input extent is zero, no data to copy
-  // only padding if any
-  bool no_data_to_copy = false;
-  for (size_t i = 0; i < input_extents.size(); ++i) {
-    if (input_extents[i] == 0) {
-      no_data_to_copy = true;
-      break;
-    }
-  }
+  // Early constant-fill: if any effective input extent is zero (input is not empty), no data to copy
+  // only padding if any for constant mode, for other modes it is an error
+  const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(),
+                                                     [](int64_t v) { return v == 0; });
 
-  if (no_data_to_copy) {
+  if (no_effective_data_to_copy) {
     if (mode == Mode::Constant) {
       PadAxisConstant<T>(sink, output, value, total_output_elems);
       return Status::OK();
@@ -578,6 +557,23 @@ static Status PadImpl(OpKernelContext* ctx,
                            "Pad: invalid mode: ", static_cast<int>(mode), " with zero effective input extent");
   }
 
+  // Special case for Reflect mode: ensure all extents >= 2 after slicing
+  // otherwise reflection is not possible. Matches numpy behavior as ONNX only
+  // implies that this would be wrong as the start and end positions should be distinct
+  // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
+  if (mode == Mode::Reflect) {
+    for (size_t i = 0; i < new_dims_count; ++i) {
+      const int64_t extent = effective_input_extents[i];  // length after slicing
+      const bool reflect_on_axis =
+          (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0);
+      if (reflect_on_axis && extent < 2) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Pad reflect requires axis length >= 2 after slicing. Input shape:",
+                               orig_input_shape);
+      }
+    }
+  }
+
   TensorPitches output_pitches(reshaped_output_dims);
   // Initial skip, sum up the start padding on each axis
   SafeInt<size_t> align_skip = 0;
@@ -588,9 +584,9 @@ static Status PadImpl(OpKernelContext* ctx,
 
   // Validate coverage: pre + copy + post == total
   SafeInt<size_t> copy_elems = 1;
-  for (size_t i = 0, lim = input_extents.size(); i < lim; ++i) {
+  for (size_t i = 0, lim = effective_input_extents.size(); i < lim; ++i) {
     // All extents are positive here due to the no_data_to_copy check above
-    copy_elems *= input_extents[i];
+    copy_elems *= effective_input_extents[i];
   }
 
   const size_t prepad_elems = align_skip;
@@ -599,9 +595,9 @@ static Status PadImpl(OpKernelContext* ctx,
       total_output_elems, prepad_elems, copy_elems, postpad_elems));
 
   TensorShape input_shape(reshaped_input_dims);
-  SliceIterator<T> input(input_tensor, input_shape, input_starts, input_extents, {});
+  SliceIterator<T> input(input_tensor, input_shape, input_starts, effective_input_extents, {});
 
-  ExtentAxisCounters input_counters(input_extents);
+  ExtentAxisCounters input_counters(effective_input_extents);
 
   switch (mode) {
     case Mode::Constant:
@@ -613,7 +609,7 @@ static Status PadImpl(OpKernelContext* ctx,
         {
           T* axis_start = output;
           // Compute the actual number of data elements to copy on the innermost axis (after cropping).
-          const size_t inner_extent = onnxruntime::narrow<size_t>(input_extents[inner_axis]);
+          const size_t inner_extent = onnxruntime::narrow<size_t>(effective_input_extents[inner_axis]);
 
           // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent.
           ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output));
@@ -633,7 +629,7 @@ static Status PadImpl(OpKernelContext* ctx,
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()];
           const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
           const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (pre_pad > 0) {
@@ -655,7 +651,7 @@ static Status PadImpl(OpKernelContext* ctx,
       while (input_counters) {
         output += align_skip;
         {
-          const SafeInt<size_t> inner_extent = input_extents[inner_axis];
+          const SafeInt<size_t> inner_extent = effective_input_extents[inner_axis];
           T* axis_start = output;
           T* axis_end = axis_start + onnxruntime::narrow<ptrdiff_t>(inner_extent);
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
@@ -687,7 +683,7 @@ static Status PadImpl(OpKernelContext* ctx,
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()];
           const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
           const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (pre_pad > 0) {
@@ -774,7 +770,7 @@ static Status PadImpl(OpKernelContext* ctx,
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()];
+          T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()];
           const SafeInt<size_t> pre_pad = reshaped_pad[input_counters.Axis()];
           const SafeInt<size_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
           if (mode == Mode::Reflect) {
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index f82d19e9732d6..91d01234c006b 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -145,13 +145,49 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
   }
 
-  TensorShapeVector input_extents;
-  input_extents.reserve(dimension_count);
+  TensorShapeVector effective_input_extents;
+  effective_input_extents.reserve(dimension_count);
   for (size_t i = 0; i < dimension_count; i++) {
     int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +
                                            (*p_slices)[i] + (*p_slices)[i + dimension_count],
                                        0U);
-    input_extents.push_back(extent);
+    effective_input_extents.push_back(extent);
+  }
+
+  TensorShape output_shape(output_dims);
+  auto& output_tensor = *ctx->Output(0, output_shape);
+
+  // If the input size is zero, but output shape is not, need padding only
+  // this is expected for constant mode only, otherwise the output is empty
+  // no error
+  if (input_shape.Size() == 0) {
+    ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
+    if (mode_ == Mode::Constant) {
+      CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(),
+                                      Stream(ctx)));
+      cudaStreamSynchronize(Stream(ctx));
+    }
+    // No error for other modes (preserve CPU historical behavior),
+    // but no output should be expected either
+    return Status::OK();
+  }
+
+  // Early constant-fill: input is not empty as above
+  // However, if any effective input extent is zero, no data to copy
+  // only padding if any.
+  const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(),
+                                                     [](int64_t v) { return v == 0; });
+
+  if (no_effective_data_to_copy) {
+    if (mode_ == Mode::Constant) {
+      // Attempt to pad constant mode in case output is not empty
+      // all other modes are an error
+      CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(),
+                                      Stream(ctx)));
+      return Status::OK();
+    }
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
   }
 
   // Special case for Reflect mode: ensure all extents >= 2 after slicing
@@ -160,7 +196,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
   if (mode_ == Mode::Reflect) {
     for (size_t i = 0; i < dimension_count; ++i) {
-      const int64_t extent = input_extents[i];  // length after slicing
+      const int64_t extent = effective_input_extents[i];  // length after slicing
       const bool reflect_on_axis =
           (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0;
       if (reflect_on_axis && extent < 2) {
@@ -171,40 +207,6 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     }
   }
 
-  TensorShape output_shape(output_dims);
-
-  // special case when there is a dim value of 0 in the shape. behavior depends on mode
-  if (input_shape.Size() == 0) {
-    ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
-  }
-
-  auto& output_tensor = *ctx->Output(0, output_shape);
-  if (output_shape.Size() == 0) {
-    // No elements to output
-    return Status::OK();
-  }
-
-  // Early constant-fill: if any input extent is zero, no data to copy
-  // only padding if any
-  bool no_data_to_copy = false;
-  for (size_t i = 0; i < input_extents.size(); ++i) {
-    if (input_extents[i] == 0) {
-      no_data_to_copy = true;
-      break;
-    }
-  }
-
-  if (no_data_to_copy) {
-    if (mode_ == Mode::Constant) {
-      cuda::Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableDataRaw()),
-                        value,
-                        input_shape.Size());
-      return Status::OK();
-    }
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                           "Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
-  }
-
   // Case of all pads and slices being zero: just copy input to output
   if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
       std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.h b/onnxruntime/core/providers/cuda/tensor/pad.h
index b206a35995e4e..b7e4131795292 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.h
+++ b/onnxruntime/core/providers/cuda/tensor/pad.h
@@ -1,4 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #include "core/providers/shared_library/provider_api.h"
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index c1f16317ef8e7..cc72a17ff57e2 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -67,7 +67,7 @@ static void RunAllOpsetAllDomainPadTests(
     bool value_is_initializer;
   };
   const std::vector<TestParams> all_test_params{
-      {false, false},
+      {true, false},
 #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__))
       // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment
       // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise
@@ -1434,7 +1434,10 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
   // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4]
   const std::vector<float> expected_data = {4, 4, 4, 4};
 
-  OpTester test("Pad", 19);  // CUDA registers only up to 18 and does not impl wrap mode
+  // CUDA registers only up to 18 and does not impl wrap mode
+  // so we force version to 19 to automatically exclude EPs that do not
+  // implement wrap mode similar to the above tests.
+  OpTester test("Pad", 19);
   test.AddInput<float>("data", input_shape, input_data);
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);

From f6f15db2ab04f261ff8647113d53661514256792 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 5 Jan 2026 12:36:37 -0800
Subject: [PATCH 20/28] Address Copilot review comments and some bugs

---
 onnxruntime/core/providers/cpu/tensor/pad.cc  |  4 +-
 .../core/providers/cpu/tensor/padbase.h       |  6 +--
 onnxruntime/core/providers/cuda/tensor/pad.cc | 17 ++++---
 onnxruntime/core/providers/cuda/tensor/pad.h  |  1 -
 .../test/providers/cpu/tensor/pad_test.cc     | 50 ++-----------------
 5 files changed, 20 insertions(+), 58 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index b87e0d4b545f5..09f0bd276d453 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -516,7 +516,7 @@ static Status PadImpl(OpKernelContext* ctx,
     // Do not allow negative extents
     int64_t extent = std::max<int64_t>(SafeInt<int64_t>(reshaped_input_dims[i]) +
                                            reshaped_slice[i] + reshaped_slice[i + new_dims_count],
-                                       0U);
+                                       0LL);
     effective_input_extents.push_back(extent);
     reshaped_output_dims[i] += SafeInt<int64_t>(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] +
                                reshaped_slice[i] + reshaped_slice[i + new_dims_count];
@@ -617,7 +617,7 @@ static Status PadImpl(OpKernelContext* ctx,
           const SafeInt<size_t> pre_pad = reshaped_pad[inner_axis];
           const SafeInt<size_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (pre_pad > 0) {
-            /// Pre - pad(innermost) retro - fill remains valid(write before row_start).
+            /// Pre-pad(innermost) retro-fill remains valid(write before row_start).
             PadAxisConstant(sink, axis_start - static_cast<size_t>(pre_pad), value, pre_pad);
           }
           if (post_pad > 0) {
diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h
index 62324f1b48df1..e2ab6ff6c8fb1 100644
--- a/onnxruntime/core/providers/cpu/tensor/padbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/padbase.h
@@ -96,9 +96,9 @@ class PadBase {
         SafeInt<size_t>(postpad_elems);
     if (checked_sum != total_output_elems) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                             +"Pad coverage invalid: pre=", prepad_elems,
-                             +" copy=", copy_elems, " post=", postpad_elems,
-                             +" total=", total_output_elems);
+                             "Pad coverage invalid: pre=", prepad_elems,
+                             " copy=", copy_elems, " post=", postpad_elems,
+                             " total=", total_output_elems);
     }
     return Status::OK();
   }
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index 91d01234c006b..656890e796a1c 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -150,7 +150,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   for (size_t i = 0; i < dimension_count; i++) {
     int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +
                                            (*p_slices)[i] + (*p_slices)[i + dimension_count],
-                                       0U);
+                                       0LL);
     effective_input_extents.push_back(extent);
   }
 
@@ -163,9 +163,11 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
   if (input_shape.Size() == 0) {
     ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
     if (mode_ == Mode::Constant) {
-      CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(),
-                                      Stream(ctx)));
-      cudaStreamSynchronize(Stream(ctx));
+      const int64_t output_size = output_shape.Size();
+      if (output_size > 0) {
+        Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
+                    output_size);
+      }
     }
     // No error for other modes (preserve CPU historical behavior),
     // but no output should be expected either
@@ -182,8 +184,11 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     if (mode_ == Mode::Constant) {
       // Attempt to pad constant mode in case output is not empty
       // all other modes are an error
-      CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(),
-                                      Stream(ctx)));
+      const int64_t output_size = output_shape.Size();
+      if (output_size > 0) {
+        Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
+                    output_size);
+      }
       return Status::OK();
     }
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.h b/onnxruntime/core/providers/cuda/tensor/pad.h
index b7e4131795292..b206a35995e4e 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.h
+++ b/onnxruntime/core/providers/cuda/tensor/pad.h
@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #include "core/providers/shared_library/provider_api.h"
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index cc72a17ff57e2..324934346ba65 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -911,6 +911,7 @@ class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest<gtest_TypeP
 template <typename gtest_TypeParam_>
 void PadOpTest_Pad_Reflect_DimWithZeroInput_Test<gtest_TypeParam_>::TestBody() {
   using T = TypeParam;
+  // DML: Unskip when fixed #41968513
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
                                   {1, 0, 1, 0},  // allowed if it doesn't pad the empty dim
@@ -920,8 +921,9 @@ void PadOpTest_Pad_Reflect_DimWithZeroInput_Test<gtest_TypeParam_>::TestBody() {
                                   "reflect",
                                   OpTester::ExpectResult::kExpectSuccess,
                                   "",
-                                  {kDmlExecutionProvider});  // DML: Unskip when fixed #41968513
+                                  {kDmlExecutionProvider});
 
+  // DML: Unskip when fixed #41968513
   RunAllOpsetAllDomainPadTests<T>({0, 2, 1},  // 3D
                                   {},
                                   {1, 1, 1, 1, 1, 1},  // not allowed if it pads the empty dim
@@ -931,7 +933,7 @@ void PadOpTest_Pad_Reflect_DimWithZeroInput_Test<gtest_TypeParam_>::TestBody() {
                                   "reflect",
                                   OpTester::ExpectResult::kExpectFailure,
                                   "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}",
-                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});  // DML: Unskip when fixed #41968513
+                                  {kDmlExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(PadOpTest, BoolType) {
@@ -1108,50 +1110,6 @@ TEST(PadOpTest, ConstantPadNegativeAxes) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
-TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) {
-  const std::vector<int64_t> input_shape{2, 6, 4};
-  std::vector<float> input_data(2 * 6 * 4);
-
-  for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast<float>((i % 5) + 1);
-
-  const std::vector<int64_t> pads{1, 3, -2, -1, 0, 1};
-  const float cv = 9.0f;
-  // starting from input shape {2,6,4}
-  // after padding:      {2+1+-1,6+3-0,4-2_1}  => {2,9,3}
-  const std::vector<int64_t> expected_shape{2, 9, 3};
-
-  const std::vector<float> expected_data = {
-      // sample 0
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-
-      // sample 1
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      9.0F, 9.0F, 9.0F,
-      3.0F, 4.0F, 9.0F,
-      2.0F, 3.0F, 9.0F,
-      1.0F, 2.0F, 9.0F,
-      5.0F, 1.0F, 9.0F,
-      4.0F, 5.0F, 9.0F,
-      3.0F, 4.0F, 9.0F};
-
-  OpTester test("Pad", 13);
-  test.AddInput<float>("data", input_shape, input_data);
-  test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
-  test.AddInput<float>("constant_value", {}, {cv}, true);
-  test.AddOutput<float>("output", expected_shape, expected_data);
-  test.AddAttribute("mode", "constant");
-  test.Run();
-}
-
 TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {
   OpTester test("Pad", 18);
   test.AddAttribute("mode", "constant");

From 7e583dcb1f48d17e09008e2b7a5690e458111555 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 5 Jan 2026 19:02:07 -0800
Subject: [PATCH 21/28] Address CI failures

---
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 324934346ba65..a27271465663a 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1288,7 +1288,9 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "edge");
-  test.Run(OpTester::ExpectResult::kExpectFailure);
+  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.Config(OpTester::ExpectResult::kExpectFailure, "");
+  test.RunWithConfig();
 }
 
 TEST(PadOpTest, EdgeMode_ExtentOne_Valid) {
@@ -1379,8 +1381,10 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "reflect");
-  test.Run(OpTester::ExpectResult::kExpectFailure,
-           "Pad reflect requires axis length >= 2 after slicing");
+  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.Config(OpTester::ExpectResult::kExpectFailure,
+              "Pad reflect requires axis length >= 2 after slicing");
+  test.RunWithConfig();
 }
 
 TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
@@ -1400,7 +1404,9 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "wrap");
-  test.Run();
+  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.Config(OpTester::ExpectResult::kExpectFailure, "");
+  test.RunWithConfig();
 }
 
 }  // namespace test

From 0058aae7ec2f7ef54adec0ca3b0fa969a903ab68 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 6 Jan 2026 11:43:17 -0800
Subject: [PATCH 22/28] Fix accidentally broken test

---
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index a27271465663a..f2fb4efefae4d 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1405,7 +1405,6 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "wrap");
   test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
-  test.Config(OpTester::ExpectResult::kExpectFailure, "");
   test.RunWithConfig();
 }
 

From 49e7359aa5f251373a1e8df9d341b8037b2074d4 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 6 Jan 2026 13:10:40 -0800
Subject: [PATCH 23/28] Address missing ep exclusion and re-instate original
 cast

---
 onnxruntime/core/providers/cpu/tensor/pad.cc      | 6 +++---
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 09f0bd276d453..84e22d86dec0c 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -496,9 +496,9 @@ static Status PadImpl(OpKernelContext* ctx,
   // Reshape padding
   const size_t new_dims_count = reshaped_input_dims.size();
   const size_t inner_axis = new_dims_count - 1;
-  const int64_t inner_no_pad_size = output_dims[inner_axis] > 0
-                                        ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
-                                        : 0;
+  const size_t inner_no_pad_size = narrow<size_t>(output_dims[inner_axis] > 0
+                                                      ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
+                                                      : 0);
   PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
   PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
   PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index f2fb4efefae4d..d38e35771a75d 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1107,7 +1107,7 @@ TEST(PadOpTest, ConstantPadNegativeAxes) {
                          0.0f, 1.0f, 1.0f, 0.0f,
                          0.0f, 1.0f, 1.0f, 0.0f,
                          0.0f, 1.0f, 1.0f, 0.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider});
 }
 
 TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) {

From e836ba8253e5b52d1b34856ef17df0b1654454e2 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 6 Jan 2026 13:51:51 -0800
Subject: [PATCH 24/28] Remove extra files

---
 onnxruntime/core/providers/cpu/tensor/pad.cc |   2 +-
 onnxruntime/test/testdata/test_pad_rce.onnx  | Bin 179 -> 0 bytes
 onnxruntime/test/testdata/test_pad_rce.py    |  27 -------------------
 3 files changed, 1 insertion(+), 28 deletions(-)
 delete mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx
 delete mode 100644 onnxruntime/test/testdata/test_pad_rce.py

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 84e22d86dec0c..681494382c37a 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -347,7 +347,7 @@ void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<c
                                 gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
   const size_t dims_count = input_dims.size();
   size_t inner_axis = dims_count - 1;
-  SafeInt<size_t> inner_size = 1;
+  SafeInt<int64_t> inner_size = 1;
 
   // Find all inner most dimensions that can be flattened.
   do {
diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx
deleted file mode 100644
index 10ddd88cc517c800299a461376e6573be2ced17a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 179
zcmd<!v0A~%WzWT$nO9I+!o^aMm{QEemz<whT#}eq5?_{>Q<^HomS0)|lvQF5NKDZZ
z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{
iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*#

diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py
deleted file mode 100644
index 0a9faac70aab3..0000000000000
--- a/onnxruntime/test/testdata/test_pad_rce.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import onnx
-from onnx import TensorProto, helper
-
-
-def create_pad_model():
-    input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None])
-    pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None])
-    constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, [])
-
-    output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None])
-
-    pad_node = helper.make_node(
-        op_type="Pad",
-        inputs=["input", "pads", "constant_value"],
-        outputs=["output"],
-        mode="constant",  # or reflect/edge
-    )
-    graph = helper.make_graph(
-        nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output]
-    )
-
-    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
-    onnx.save(model, "test_pad_rce.onnx")
-
-
-if __name__ == "__main__":
-    create_pad_model()

From 9ce795825db0e4c86af9b91110506875d770ee30 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 6 Jan 2026 16:42:25 -0800
Subject: [PATCH 25/28] Fix TYPED_TEST macro expansion

---
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index d38e35771a75d..20e6a8523c0bd 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -899,17 +899,7 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   {kDmlExecutionProvider});
 }
 
-static_assert(sizeof("Pad_Reflect_DimWithZeroInput") > 1, "test-name must not be empty");
-template <typename gtest_TypeParam_>
-class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest<gtest_TypeParam_> {
- private:
-  typedef PadOpTest<gtest_TypeParam_> TestFixture;
-  typedef gtest_TypeParam_ TypeParam;
-  void TestBody() override;
-};
-[[maybe_unused]] static bool gtest_PadOpTest_Pad_Reflect_DimWithZeroInput_registered_ = ::testing::internal::TypeParameterizedTest<PadOpTest, ::testing::internal::TemplateSel<PadOpTest_Pad_Reflect_DimWithZeroInput_Test>, gtest_type_params_PadOpTest_>::Register("", ::testing::internal::CodeLocation("D:\\dev\\ort_main\\onnxruntime\\test\\providers\\cpu\\tensor\\pad_test.cc", 892), "PadOpTest", "Pad_Reflect_DimWithZeroInput", 0, ::testing::internal::GenerateNames<gtest_type_params_PadOpTest_NameGenerator, gtest_type_params_PadOpTest_>());
-template <typename gtest_TypeParam_>
-void PadOpTest_Pad_Reflect_DimWithZeroInput_Test<gtest_TypeParam_>::TestBody() {
+TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) {
   using T = TypeParam;
   // DML: Unskip when fixed #41968513
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D

From 2c62d9fbd920abec0c85d4ef8d154cf7d794e85d Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 7 Jan 2026 11:26:50 -0800
Subject: [PATCH 26/28] Address review and CI failures

---
 .../test/providers/cpu/tensor/pad_test.cc     | 40 +++++++++++--------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 20e6a8523c0bd..11cdad7010c84 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -67,7 +67,7 @@ static void RunAllOpsetAllDomainPadTests(
     bool value_is_initializer;
   };
   const std::vector<TestParams> all_test_params{
-      {true, false},
+      {false, false},
 #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__))
       // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment
       // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise
@@ -772,7 +772,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2},
                                   {T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D empty pads
@@ -780,7 +780,10 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   {0, 0},
                                   T(1),
                                   {0},
-                                  {});
+                                  {},
+                                  "constant",
+                                  OpTester::ExpectResult::kExpectSuccess, "",
+                                  {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D offsetting pads
                                   {},
@@ -788,7 +791,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {0},
                                   {},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
@@ -797,7 +800,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {4, 2},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
@@ -806,7 +809,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 4},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
@@ -815,7 +818,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 2},
                                   {T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 
   RunAllOpsetAllDomainPadTests<T>({2, 0, 2},  // 3D
@@ -824,7 +827,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 2, 2},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess,
+                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 }
 // Added output shape verification b/w the output shape generated by operator specific ONNX inference and
@@ -895,7 +898,7 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) {
                                   {2, 4, 0},
                                   {},
                                   "edge",
-                                  OpTester::ExpectResult::kExpectSuccess,
+                                  OpTester::ExpectResult::kExpectSuccess, "",
                                   {kDmlExecutionProvider});
 }
 
@@ -1164,10 +1167,11 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
 
   // Expected shape is as follows:
   // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output
-  // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 0x10000
+  // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 65554
   // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3
-  // Resulting shape is {0, 0x10000, 3} with 0 at the front.
+  // Resulting shape is {0, 65554, 3} with 0 at the front.
   // How do we handle zero shapes? Currently ONNX spec allows it.
+  // We choose to produce a empty tensor
   constexpr int64_t dim0 = 2 + 1 - 3;
   constexpr int64_t dim1 = 18 + 0x100000 - 0;
   constexpr int64_t dim2 = 4 + -2 + 1;
@@ -1225,7 +1229,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) {
   test.AddInput<float>("constant_value", {}, {cv}, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "constant");
-  test.Run();
+  test.ConfigExcludeEps({kDmlExecutionProvider});
+  test.RunWithConfig();
 }
 
 TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) {
@@ -1261,7 +1266,8 @@ TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) {
   test.AddInput<float>("constant_value", {}, {cv}, true);
   test.AddOutput<float>("output", expected_shape, expected_output);
   test.AddAttribute("mode", "constant");
-  test.Run();
+  test.ConfigExcludeEps({kDmlExecutionProvider});
+  test.RunWithConfig();
 }
 
 TEST(PadOpTest, EdgeMode_ZeroExtentFails) {
@@ -1278,7 +1284,7 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "edge");
-  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
   test.Config(OpTester::ExpectResult::kExpectFailure, "");
   test.RunWithConfig();
 }
@@ -1371,7 +1377,8 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "reflect");
-  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider,
+                         kTensorrtExecutionProvider, kWebGpuExecutionProvider});
   test.Config(OpTester::ExpectResult::kExpectFailure,
               "Pad reflect requires axis length >= 2 after slicing");
   test.RunWithConfig();
@@ -1394,7 +1401,8 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) {
   test.AddInput<int64_t>("pads", {static_cast<int64_t>(pads.size())}, pads, true);
   test.AddOutput<float>("output", expected_shape, expected_data);
   test.AddAttribute("mode", "wrap");
-  test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider});
+  test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider,
+                         kTensorrtExecutionProvider, kWebGpuExecutionProvider});
   test.RunWithConfig();
 }
 

From 584876c974729793a3d4659da2da776ff7afd13d Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 7 Jan 2026 15:14:59 -0800
Subject: [PATCH 27/28] Address review comments

---
 onnxruntime/core/providers/cpu/tensor/pad.cc      |  2 +-
 onnxruntime/test/providers/cpu/tensor/pad_test.cc | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index 681494382c37a..9588f69697cfb 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -387,7 +387,7 @@ struct OutputSink {
   void operator()(T* output, T value) const {
 #ifdef _DEBUG
     if (output < beg || output >= end) {
-      throw std::out_of_range("Pad OutputSink: Output pointer is out of range");
+      ORT_THROW("Pad OutputSink: Output pointer is out of range");
     }
 #endif
     *output = value;
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 11cdad7010c84..6babebb1e561f 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -1167,14 +1167,14 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) {
 
   // Expected shape is as follows:
   // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output
-  // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 65554
+  // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 1,048,594
   // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3
-  // Resulting shape is {0, 65554, 3} with 0 at the front.
+  // Resulting shape is {0, 1048594, 3} with 0 at the front.
   // How do we handle zero shapes? Currently ONNX spec allows it.
   // We choose to produce a empty tensor
-  constexpr int64_t dim0 = 2 + 1 - 3;
-  constexpr int64_t dim1 = 18 + 0x100000 - 0;
-  constexpr int64_t dim2 = 4 + -2 + 1;
+  constexpr int64_t dim0 = 2LL + 1 - 3;
+  constexpr int64_t dim1 = 18LL + 0x100000 - 0;
+  constexpr int64_t dim2 = 4LL + -2 + 1;
   const std::initializer_list<int64_t> output_shape{dim0, dim1, dim2};
 
   std::vector<float> output_data;  // empty now

From 4d386d09c9801b3b8ccad3cb5578e544515e893c Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 7 Jan 2026 18:39:39 -0800
Subject: [PATCH 28/28] Skip for DML EP

---
 .../test/providers/cpu/tensor/pad_test.cc     | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 6babebb1e561f..49c9d360f9046 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -764,6 +764,9 @@ edge
 // test handling of input with a 0 for a dimension
 TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
   // TODO: Unskip Dml when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold";
+  }
 
   using T = TypeParam;
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D
@@ -772,8 +775,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2},
                                   {T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D empty pads
                                   {},
@@ -781,9 +783,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {0},
                                   {},
-                                  "constant",
-                                  OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({0},  // 1D offsetting pads
                                   {},
@@ -791,8 +791,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {0},
                                   {},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({2, 0},  // 2D
                                   {},
@@ -800,8 +799,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {4, 2},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
                                   {},
@@ -809,8 +807,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 4},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({0, 2},
                                   {},
@@ -818,8 +815,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 2},
                                   {T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 
   RunAllOpsetAllDomainPadTests<T>({2, 0, 2},  // 3D
                                   {},
@@ -827,8 +823,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) {
                                   T(1),
                                   {2, 2, 2},
                                   {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)},
-                                  "constant", OpTester::ExpectResult::kExpectSuccess, "",
-                                  {kDmlExecutionProvider});
+                                  "constant");
 }
 // Added output shape verification b/w the output shape generated by operator specific ONNX inference and
 // the output shape generated by operator specific ORT implementation. After adding this verification,