From 1af3abd639c95f102ec4ade5fb69e9334a32bffe Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 16 Dec 2025 09:19:23 -0800 Subject: [PATCH 01/28] Add test model --- onnxruntime/test/testdata/test_pad_rce.onnx | Bin 0 -> 179 bytes onnxruntime/test/testdata/test_pad_rce.py | 30 ++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx create mode 100644 onnxruntime/test/testdata/test_pad_rce.py diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx new file mode 100644 index 0000000000000000000000000000000000000000..10ddd88cc517c800299a461376e6573be2ced17a GIT binary patch literal 179 zcmdQ<^HomS0)|lvQF5NKDZZ z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{ iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*# literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py new file mode 100644 index 0000000000000..fcc487cbf4e73 --- /dev/null +++ b/onnxruntime/test/testdata/test_pad_rce.py @@ -0,0 +1,30 @@ +import onnx +from onnx import helper, TensorProto +import numpy as np + +def create_pad_model(): + input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None]) + pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None]) + constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, []) + + output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None]) + + pad_node = helper.make_node( + op_type="Pad", + inputs=["input", "pads", "constant_value"], + outputs=["output"], + mode="constant" # or reflect/edge + ) + graph = helper.make_graph( + nodes=[pad_node], + name="PadModel", + inputs=[input_data, pads, constant_value], + outputs=[output] + ) + + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) + onnx.save(model, "test_pad_rce.onnx") + + +if __name__ == "__main__": + create_pad_model() \ No newline at end of file From 4be2420d9e5a9127b1a75a16c9cc2b38df5dc114 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 16 Dec 2025 09:19:23 -0800 Subject: [PATCH 02/28] Add test model --- onnxruntime/test/testdata/test_pad_rce.onnx | Bin 0 -> 179 bytes onnxruntime/test/testdata/test_pad_rce.py | 30 ++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx create mode 100644 onnxruntime/test/testdata/test_pad_rce.py diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx new file mode 100644 index 0000000000000000000000000000000000000000..10ddd88cc517c800299a461376e6573be2ced17a GIT binary patch literal 179 zcmdQ<^HomS0)|lvQF5NKDZZ z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{ iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*# literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py new file mode 100644 index 0000000000000..fcc487cbf4e73 --- /dev/null +++ b/onnxruntime/test/testdata/test_pad_rce.py @@ -0,0 +1,30 @@ +import onnx +from onnx import helper, TensorProto +import numpy as np + +def create_pad_model(): + input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None]) + pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None]) + constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, []) + + output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None]) + + pad_node = helper.make_node( + op_type="Pad", + inputs=["input", "pads", "constant_value"], + outputs=["output"], + mode="constant" # or reflect/edge + ) + graph = helper.make_graph( + nodes=[pad_node], + name="PadModel", + inputs=[input_data, pads, constant_value], + outputs=[output] + ) + + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) + onnx.save(model, "test_pad_rce.onnx") + + +if __name__ == "__main__": + create_pad_model() \ No newline at end of file From 40299515c8a8b93e9fc2004f445fe53ca17ff738 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 16 Dec 2025 11:56:27 -0800 Subject: [PATCH 03/28] Py formatting --- onnxruntime/test/testdata/test_pad_rce.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py index fcc487cbf4e73..0a9faac70aab3 100644 --- a/onnxruntime/test/testdata/test_pad_rce.py +++ b/onnxruntime/test/testdata/test_pad_rce.py @@ -1,6 +1,6 @@ import onnx -from onnx import helper, TensorProto -import numpy as np +from onnx import TensorProto, helper + def create_pad_model(): input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None]) @@ -13,13 +13,10 @@ def create_pad_model(): op_type="Pad", inputs=["input", "pads", "constant_value"], outputs=["output"], - mode="constant" # or reflect/edge + mode="constant", # or reflect/edge ) graph = helper.make_graph( - nodes=[pad_node], - name="PadModel", - inputs=[input_data, pads, constant_value], - outputs=[output] + nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output] ) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) @@ -27,4 +24,4 @@ def create_pad_model(): if __name__ == "__main__": - create_pad_model() \ No newline at end of file + create_pad_model() From 0b84e6bfb4a4fce5f08f4547dc356e524e4d1272 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 17 Dec 2025 19:29:16 -0800 Subject: [PATCH 04/28] Continue testing --- onnxruntime/core/providers/cpu/tensor/pad.cc | 71 ++++++++++--------- .../test/providers/cpu/tensor/pad_test.cc | 25 +++++-- onnxruntime/test/testdata/test_pad_rce.py | 5 +- 3 files changed, 55 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index ab261bbb8cdb5..f9f9121233ce8 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -347,11 +347,11 @@ void PadBase::FlattenInnerShape(gsl::span input_dims, gsl::span slices, TensorShapeVector& reshaped_dims) { const size_t dims_count = input_dims.size(); size_t inner_axis = dims_count - 1; - size_t inner_size = 1; + SafeInt inner_size = 1; // Find all inner most dimensions that can be flattened. do { - inner_size *= static_cast(input_dims[inner_axis]); + inner_size *= input_dims[inner_axis]; if (inner_axis == 0) break; @@ -378,8 +378,8 @@ void PadBase::ReshapePads(gsl::span src_pad, size_t src_dim_count reshaped_pad.begin() + new_dim_count); // Flatten inner axis. - reshaped_pad[inner_axis] = src_pad[inner_axis] * inner_no_pad_size; - reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size; + reshaped_pad[inner_axis] = SafeInt(src_pad[inner_axis]) * inner_no_pad_size; + reshaped_pad[inner_axis + new_dim_count] = SafeInt(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size; } // special handling for edge case where the input has one or more dims with value of 0 @@ -468,11 +468,11 @@ static Status PadImpl(OpKernelContext* ctx, PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); // Reshape padding - size_t new_dims_count = reshaped_input_dims.size(); - size_t inner_axis = new_dims_count - 1; - size_t inner_no_pad_size = onnxruntime::narrow(output_dims[inner_axis] > 0 - ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] - : 0); + const size_t new_dims_count = reshaped_input_dims.size(); + const size_t inner_axis = new_dims_count - 1; + const size_t inner_no_pad_size = onnxruntime::narrow(output_dims[inner_axis] > 0 + ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] + : 0); PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count); PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad); PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice); @@ -486,13 +486,14 @@ static Status PadImpl(OpKernelContext* ctx, input_extents.reserve(new_dims_count); for (size_t i = 0; i < new_dims_count; i++) { input_starts.push_back(-1 * reshaped_slice[i]); - input_extents.push_back(reshaped_input_dims[i] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]); - reshaped_output_dims[i] += reshaped_pad[i] + reshaped_pad[i + new_dims_count] + + auto extent = SafeInt(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; + input_extents.push_back(extent); + reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } for (size_t i = 0; i < data_rank; i++) { - output_dims[i] += pads[i] + pads[i + data_rank] + slices[i] + slices[i + data_rank]; + output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; } // special case an input with one or more dim values of 0. edge case that is easier to handle @@ -510,11 +511,11 @@ static Status PadImpl(OpKernelContext* ctx, auto* output = reinterpret_cast(output_tensor.MutableDataRaw()); TensorPitches output_pitches(reshaped_output_dims); - size_t alignSkip = 0; // Amount to skip to align to where the next input tensor data needs to be written + SafeInt align_skip = 0; // Amount to skip to align to where the next input tensor data needs to be written // Initial skip, sum up the begin padding on each axis for (size_t i = 0; i < new_dims_count; i++) - alignSkip += SafeInt(reshaped_pad[i]) * output_pitches[i]; + align_skip += SafeInt(reshaped_pad[i]) * output_pitches[i]; ExtentAxisCounters input_counters(input_extents); @@ -524,28 +525,28 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. while (input_counters) { - output += alignSkip; + output += align_skip; { - T* axisStart = output; + T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; - PadAxisConstant(axisStart - prePad, value, onnxruntime::narrow(prePad)); - PadAxisConstant(output, value, onnxruntime::narrow(postPad)); - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + PadAxisConstant(axis_start - pre_pad, value, pre_pad); + PadAxisConstant(output, value, post_pad); + output += post_pad; + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; - PadAxisConstant(axisStart - prePad * inner_pitch, value, SafeInt(prePad) * inner_pitch); - PadAxisConstant(output, value, SafeInt(postPad) * inner_pitch); - output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + const int64_t pre_pad = reshaped_pad[input_counters.Axis()]; + const int64_t post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + PadAxisConstant(axis_start - pre_pad * inner_pitch, value, SafeInt(pre_pad) * inner_pitch); + PadAxisConstant(output, value, SafeInt(post_pad) * inner_pitch); + output += inner_pitch * post_pad; + align_skip += inner_pitch * SafeInt(pre_pad); } } break; @@ -555,7 +556,7 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. while (input_counters) { - output += alignSkip; + output += align_skip; { T* axisStart = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); @@ -572,7 +573,7 @@ static Status PadImpl(OpKernelContext* ctx, PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); } output += postPad; - alignSkip = onnxruntime::narrow(prePad); + align_skip = onnxruntime::narrow(prePad); } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { @@ -583,7 +584,7 @@ static Status PadImpl(OpKernelContext* ctx, PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(prePad)); PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(postPad)); output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + align_skip += inner_pitch * SafeInt(prePad); } } break; @@ -594,7 +595,7 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. while (input_counters) { - output += alignSkip; + output += align_skip; { T* axisStart = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); @@ -644,7 +645,7 @@ static Status PadImpl(OpKernelContext* ctx, } } output += postPad; - alignSkip = onnxruntime::narrow(prePad); + align_skip = onnxruntime::narrow(prePad); } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { @@ -684,7 +685,7 @@ static Status PadImpl(OpKernelContext* ctx, onnxruntime::narrow(postPad)); } output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + align_skip += inner_pitch * SafeInt(prePad); } } break; diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 1d9cd15f53327..97ee556d59501 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -890,11 +890,6 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { } TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - using T = TypeParam; RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -902,7 +897,10 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { T(1), {4, 0}, {}, - "reflect"); + "reflect", + OpTester::ExpectResult::kExpectSuccess, + "", + {kDmlExecutionProvider}); // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({0, 2, 1}, // 3D {}, @@ -912,7 +910,8 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { {}, "reflect", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", {kTensorrtExecutionProvider}); + "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); // DML: Unskip when fixed #41968513 } TEST(PadOpTest, BoolType) { @@ -1089,5 +1088,17 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider}); } +// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 +TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { + using T = float; + RunAllOpsetAllDomainPadTests({4}, + {T(1), T(2), T(3), T(4)}, + {-3, 3}, + T(0), + {4}, + {4, 0, 0, 0}, + "reflect"); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py index ccbee5565bfd9..0a9faac70aab3 100644 --- a/onnxruntime/test/testdata/test_pad_rce.py +++ b/onnxruntime/test/testdata/test_pad_rce.py @@ -16,10 +16,7 @@ def create_pad_model(): mode="constant", # or reflect/edge ) graph = helper.make_graph( - nodes=[pad_node], - name="PadModel", - inputs=[input_data, pads, constant_value], - outputs=[output] + nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output] ) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) From fdfe2b53200a6bc9868b0ae6af5830298bdf0b3b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 18 Dec 2025 17:51:03 -0800 Subject: [PATCH 05/28] Commit the latest --- onnxruntime/core/providers/cpu/tensor/pad.cc | 74 +++++++-- .../core/providers/cpu/tensor/padbase.h | 36 +++++ .../test/providers/cpu/tensor/pad_test.cc | 152 +++++++++++++++--- 3 files changed, 225 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index f9f9121233ce8..92beeb9286001 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -457,7 +457,7 @@ static Status PadImpl(OpKernelContext* ctx, const auto& input_tensor = *ctx->Input(0); const auto& orig_input_shape = input_tensor.Shape(); auto output_dims(orig_input_shape.AsShapeVector()); - size_t data_rank = output_dims.size(); + const size_t data_rank = output_dims.size(); // make copy of raw_pads as it may be mutated below ORT_ENFORCE(data_rank > 0, "Input tensor has no dimensions"); @@ -465,14 +465,18 @@ static Status PadImpl(OpKernelContext* ctx, // Reshape input dims TensorShapeVector reshaped_input_dims; - PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + if (PadBase::ShouldFlattenInnerShape(output_dims, pads, slices)) { + PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + } else { + reshaped_input_dims = output_dims; + } // Reshape padding const size_t new_dims_count = reshaped_input_dims.size(); const size_t inner_axis = new_dims_count - 1; - const size_t inner_no_pad_size = onnxruntime::narrow(output_dims[inner_axis] > 0 - ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] - : 0); + const int64_t inner_no_pad_size = output_dims[inner_axis] > 0 + ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] + : 0; PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count); PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad); PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice); @@ -481,17 +485,22 @@ static Status PadImpl(OpKernelContext* ctx, TensorShapeVector input_starts; TensorShapeVector input_extents; - // Calculate output dimensions, and handle any negative padding + // Calculate reshaped output dimensions, and handle any negative padding input_starts.reserve(new_dims_count); input_extents.reserve(new_dims_count); for (size_t i = 0; i < new_dims_count; i++) { + // Starts for every dimension. If slice is negative, we need to start further in, handled by the SliceIterator input_starts.push_back(-1 * reshaped_slice[i]); - auto extent = SafeInt(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; + // Do not allow negative extents + int64_t extent = std::max(SafeInt(reshaped_input_dims[i]) + + reshaped_slice[i] + reshaped_slice[i + new_dims_count], + 0U); input_extents.push_back(extent); reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } + // Compute true output dimensions for (size_t i = 0; i < data_rank; i++) { output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; } @@ -502,20 +511,53 @@ static Status PadImpl(OpKernelContext* ctx, return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value); } - TensorShape input_shape(reshaped_input_dims); - SliceIterator input(input_tensor, input_shape, input_starts, input_extents, {}); - - // output_shape need to keep original. + // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); auto* output = reinterpret_cast(output_tensor.MutableDataRaw()); + // Early constant-fill: if any input extent is zero, no data to copy + // only padding if any + bool no_data_to_copy = false; + for (size_t i = 0; i < input_extents.size(); ++i) { + if (input_extents[i] == 0) { + no_data_to_copy = true; + break; + } + } + + const SafeInt total_output_elems(output_shape.Size()); + if (no_data_to_copy) { + if (mode == Mode::Constant) { + PadAxisConstant(output, value, total_output_elems); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode), " with zero effective input extent"); + } + TensorPitches output_pitches(reshaped_output_dims); - SafeInt align_skip = 0; // Amount to skip to align to where the next input tensor data needs to be written + // Initial skip, sum up the start padding on each axis + SafeInt align_skip = 0; + for (size_t i = 0; i < new_dims_count; i++) { + const auto inc = SafeInt(reshaped_pad[i]) * output_pitches[i]; + align_skip += inc; + } - // Initial skip, sum up the begin padding on each axis - for (size_t i = 0; i < new_dims_count; i++) - align_skip += SafeInt(reshaped_pad[i]) * output_pitches[i]; + // Validate coverage: pre + copy + post == total + SafeInt copy_elems = 1; + for (size_t i = 0, lim = input_extents.size(); i < lim; ++i) { + // All extents are positive here due to the no_data_to_copy check above + copy_elems *= input_extents[i]; + } + + const size_t prepad_elems = align_skip; + const size_t postpad_elems = SafeInt(total_output_elems) - prepad_elems - copy_elems; + ORT_RETURN_IF_ERROR(PadBase::ValidateTotalElementsCoverage( + total_output_elems, prepad_elems, copy_elems, postpad_elems)); + + TensorShape input_shape(reshaped_input_dims); + SliceIterator input(input_tensor, input_shape, input_starts, input_extents, {}); ExtentAxisCounters input_counters(input_extents); @@ -532,7 +574,7 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[inner_axis]; const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; - PadAxisConstant(axis_start - pre_pad, value, pre_pad); + PadAxisConstant(axis_start - *pre_pad.Ptr(), value, pre_pad); PadAxisConstant(output, value, post_pad); output += post_pad; align_skip = pre_pad; diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h index 43f9cbfc9f9a4..62324f1b48df1 100644 --- a/onnxruntime/core/providers/cpu/tensor/padbase.h +++ b/onnxruntime/core/providers/cpu/tensor/padbase.h @@ -67,6 +67,42 @@ class PadBase { // End provider shared + // Only flatten innermost axes when there is no padding and no slicing on ANY axis. + static bool ShouldFlattenInnerShape(gsl::span input_dims, + gsl::span pads, + gsl::span slices) { + const size_t rank = input_dims.size(); + if (rank == 0) return false; + for (size_t i = 0; i < rank; ++i) { + if (slices[i] != 0 || slices[rank + i] != 0) return false; + } + + const size_t inner = rank - 1; + if (pads[inner] != 0 || pads[inner + rank] != 0 || + slices[inner] != 0 || slices[inner + rank] != 0) { + return false; + } + return true; + } + + // Guard: pre-pad + copy + post-pad must equal total output elements. + static Status ValidateTotalElementsCoverage(size_t total_output_elems, + size_t prepad_elems, + size_t copy_elems, + size_t postpad_elems) { + const size_t checked_sum = + SafeInt(prepad_elems) + + SafeInt(copy_elems) + + SafeInt(postpad_elems); + if (checked_sum != total_output_elems) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + +"Pad coverage invalid: pre=", prepad_elems, + +" copy=", copy_elems, " post=", postpad_elems, + +" total=", total_output_elems); + } + return Status::OK(); + } + /// /// Flatten no padding inner most Axis, so one memcpy cover multiple Axis. /// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 97ee556d59501..386f1e4e37ec0 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -836,11 +836,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { // In order to remove the warning, shape inference methods needs to be fixed. TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - + // TODO: Enable Dml when fixed #41968513 using T = TypeParam; RunAllOpsetAllDomainPadTests({0}, // 1D {}, @@ -850,7 +846,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -860,7 +857,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -878,7 +876,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 2, 0}, // 3D {}, @@ -886,10 +885,22 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { T(1), {2, 4, 0}, {}, - "edge"); + "edge", + OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); } -TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { +static_assert(sizeof("Pad_Reflect_DimWithZeroInput") > 1, "test-name must not be empty"); +template +class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest { + private: + typedef PadOpTest TestFixture; + typedef gtest_TypeParam_ TypeParam; + void TestBody() override; +}; +[[maybe_unused]] static bool gtest_PadOpTest_Pad_Reflect_DimWithZeroInput_registered_ = ::testing::internal::TypeParameterizedTest, gtest_type_params_PadOpTest_>::Register("", ::testing::internal::CodeLocation("D:\\dev\\ort_main\\onnxruntime\\test\\providers\\cpu\\tensor\\pad_test.cc", 892), "PadOpTest", "Pad_Reflect_DimWithZeroInput", 0, ::testing::internal::GenerateNames()); +template +void PadOpTest_Pad_Reflect_DimWithZeroInput_Test::TestBody() { using T = TypeParam; RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -1085,20 +1096,119 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 -TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { - using T = float; - RunAllOpsetAllDomainPadTests({4}, - {T(1), T(2), T(3), T(4)}, - {-3, 3}, - T(0), - {4}, - {4, 0, 0, 0}, - "reflect"); +TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::initializer_list input_shape{2, 18, 4}; + + /* clang-format off */ + const std::vector input_data = { + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + }; + /* clang-format on */ + + // input_data is larger than the shape elements in this test + // constexpr const size_t input_data_size = static_cast(2) * 18 * 4; + // ASSERT_EQ(input_data_size, input_data.size()); + auto input_span = gsl::make_span(input_data.data(), static_cast(2) * 18 * 4); + + const std::initializer_list pads_shape{6}; + std::initializer_list pads = {1, 0x100000, -2, -3, 0, 1}; + ASSERT_EQ(6U, pads.size()); + + // Expected shape is as follows: + // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output + // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 0x10000 + // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3 + // Resulting shape is {0, 0x10000, 3} with 0 at the front. + // How do we handle zero shapes? Currently ONNX spec allows it. + constexpr int64_t dim0 = 2 + 1 - 3; + constexpr int64_t dim1 = 18 + 0x100000 - 0; + constexpr int64_t dim2 = 4 + -2 + 1; + const std::initializer_list output_shape{dim0, dim1, dim2}; + + std::vector output_data; // empty now + + test.AddInput("data", input_shape, input_span); + test.AddInput("pads", pads_shape, pads); + test.AddInput("value", {}, {100.f}); + + // Omit Axis input + test.AddOutput("output", output_shape, output_data); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PadOpTest, ConstantMode_MixedSigns_Small) { + const std::vector input_shape{2, 6, 4}; + std::vector input_data(2 * 6 * 4); + + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast((i % 5) + 1); + } + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 9, 3}; + + std::vector expected_output = { + // a0 = 0 + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 3.f, 4.f, 9.f, + 2.f, 3.f, 9.f, + 1.f, 2.f, 9.f, + 5.f, 1.f, 9.f, + 4.f, 5.f, 9.f, + 3.f, 4.f, 9.f, + + // a0 = 1 (cropped original, fully padded slice) + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f}; + + ASSERT_EQ(2U * 9U * 3U, expected_output.size()); + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("constant_value", {}, {cv}); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.Run(); } +// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 +// TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { +// using T = float; +// RunAllOpsetAllDomainPadTests({4}, +// {T(1), T(2), T(3), T(4)}, +// {-3, 3}, +// T(0), +// {4}, +// {4, 0, 0, 0}, +// "reflect"); +//} + } // namespace test } // namespace onnxruntime From 3e711a514bb32120090f719304cda26d53919dd3 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 19 Dec 2025 10:49:44 -0800 Subject: [PATCH 06/28] Fix up constant --- onnxruntime/core/providers/cpu/tensor/pad.cc | 22 +++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 92beeb9286001..696acd069691b 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -574,8 +574,12 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[inner_axis]; const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; - PadAxisConstant(axis_start - *pre_pad.Ptr(), value, pre_pad); - PadAxisConstant(output, value, post_pad); + if (pre_pad > 0) { + PadAxisConstant(axis_start - static_cast(pre_pad), value, pre_pad); + } + if (post_pad > 0) { + PadAxisConstant(output, value, post_pad); + } output += post_pad; align_skip = pre_pad; } @@ -583,12 +587,16 @@ static Status PadImpl(OpKernelContext* ctx, while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; - const int64_t pre_pad = reshaped_pad[input_counters.Axis()]; - const int64_t post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; - PadAxisConstant(axis_start - pre_pad * inner_pitch, value, SafeInt(pre_pad) * inner_pitch); - PadAxisConstant(output, value, SafeInt(post_pad) * inner_pitch); + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + if (pre_pad > 0) { + PadAxisConstant(axis_start - static_cast(pre_pad * inner_pitch), value, pre_pad * inner_pitch); + } + if (post_pad > 0) { + PadAxisConstant(output, value, post_pad * inner_pitch); + } output += inner_pitch * post_pad; - align_skip += inner_pitch * SafeInt(pre_pad); + align_skip += inner_pitch * pre_pad; } } break; From af3a0160abc9509a58a0a2c983e195c8ebfb60b8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 19 Dec 2025 11:15:20 -0800 Subject: [PATCH 07/28] More refactoring --- onnxruntime/core/providers/cpu/tensor/pad.cc | 111 +++++++++++------- .../test/providers/cpu/tensor/pad_test.cc | 90 +++++++------- 2 files changed, 113 insertions(+), 88 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 696acd069691b..6c3f1c71241c9 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -608,33 +608,48 @@ static Status PadImpl(OpKernelContext* ctx, while (input_counters) { output += align_skip; { - T* axisStart = output; + T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; + SafeInt pre_pad = reshaped_pad[inner_axis]; + SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { - PadAxisConstant(axisStart - prePad, *axisStart, onnxruntime::narrow(prePad)); - PadAxisConstant(output, *(output - 1), onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadAxisConstant(axis_start - static_cast(pre_pad), *axis_start, pre_pad); + } + if (post_pad > 0) { + PadAxisConstant(output, *(output - 1), post_pad); + } } else { // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode. // Also general loop below after handling first pad axis with non-pad axis works fine. - PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis])); - PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); + if (pads[inner_axis] > 0) { + PadAxis(axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + } + if (pads[inner_axis + data_rank] > 0) { + PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); + } } - output += postPad; - align_skip = onnxruntime::narrow(prePad); + output += post_pad; + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; - PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(prePad)); - PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(postPad)); - output += inner_pitch * postPad; - align_skip += inner_pitch * SafeInt(prePad); + T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + if (pre_pad > 0) { + PadAxis(axis_start - static_cast(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch, + pre_pad); + } + if (post_pad > 0) { + PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad); + } + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; @@ -647,25 +662,35 @@ static Status PadImpl(OpKernelContext* ctx, while (input_counters) { output += align_skip; { - T* axisStart = output; + T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { if (mode == Mode::Reflect) { - PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, output - 2, -1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), -1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(output, output - 2, -1 /* inputDelta */, post_pad); + } } else { - PadInnermostAxis(axisStart - prePad, output - prePad, 1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, axisStart, 1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(axis_start - static_cast(pre_pad), + output - static_cast(pre_pad), 1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(output, axis_start, 1 /* inputDelta */, post_pad); + } } } else { // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode. if (mode == Mode::Reflect) { PadAxis( - axisStart - prePad, - axisStart + prePad, + axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, @@ -679,7 +704,7 @@ static Status PadImpl(OpKernelContext* ctx, onnxruntime::narrow(pads[inner_axis + data_rank])); } else { PadAxis( - axisStart - prePad, + axis_start - static_cast(pre_pad), output - pads[inner_axis] * inner_no_pad_size, 1, 0, @@ -687,55 +712,55 @@ static Status PadImpl(OpKernelContext* ctx, onnxruntime::narrow(pads[inner_axis])); PadAxis( output, - axisStart, + axis_start, 1, 0, inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); } } - output += postPad; - align_skip = onnxruntime::narrow(prePad); + output += post_pad; + align_skip = onnxruntime::narrow(pre_pad); } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; + T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (mode == Mode::Reflect) { PadAxis( - axisStart - prePad * inner_pitch, - axisStart + prePad * inner_pitch, + axis_start - static_cast(pre_pad) * inner_pitch, + axis_start + static_cast(pre_pad) * inner_pitch, 1, -inner_pitch * 2, inner_pitch, - onnxruntime::narrow(prePad)); + pre_pad); PadAxis( output, output - 2 * inner_pitch, 1, -inner_pitch * 2, inner_pitch, - onnxruntime::narrow(postPad)); + post_pad); } else { PadAxis( - axisStart - prePad * inner_pitch, - output - prePad * inner_pitch, + axis_start - static_cast(pre_pad) * inner_pitch, + output - static_cast(pre_pad) * inner_pitch, 1, 0, inner_pitch, - onnxruntime::narrow(prePad)); + pre_pad); PadAxis( output, - axisStart, + axis_start, 1, 0, inner_pitch, - onnxruntime::narrow(postPad)); + post_pad); } - output += inner_pitch * postPad; - align_skip += inner_pitch * SafeInt(prePad); + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 386f1e4e37ec0..c82f17012fd47 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1152,51 +1152,51 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -TEST(PadOpTest, ConstantMode_MixedSigns_Small) { - const std::vector input_shape{2, 6, 4}; - std::vector input_data(2 * 6 * 4); - - for (size_t i = 0; i < input_data.size(); ++i) { - input_data[i] = static_cast((i % 5) + 1); - } - - const std::vector pads{1, 3, -2, -1, 0, 1}; - const float cv = 9.0f; - const std::vector expected_shape{2, 9, 3}; - - std::vector expected_output = { - // a0 = 0 - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 3.f, 4.f, 9.f, - 2.f, 3.f, 9.f, - 1.f, 2.f, 9.f, - 5.f, 1.f, 9.f, - 4.f, 5.f, 9.f, - 3.f, 4.f, 9.f, - - // a0 = 1 (cropped original, fully padded slice) - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f}; - - ASSERT_EQ(2U * 9U * 3U, expected_output.size()); - - OpTester test("Pad", 18); - test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); - test.AddInput("constant_value", {}, {cv}); - test.AddOutput("output", expected_shape, expected_output); - test.AddAttribute("mode", "constant"); - test.Run(); -} +// TEST(PadOpTest, ConstantMode_MixedSigns_Small) { +// const std::vector input_shape{2, 6, 4}; +// std::vector input_data(2 * 6 * 4); +// +// for (size_t i = 0; i < input_data.size(); ++i) { +// input_data[i] = static_cast((i % 5) + 1); +// } +// +// const std::vector pads{1, 3, -2, -1, 0, 1}; +// const float cv = 9.0f; +// const std::vector expected_shape{2, 9, 3}; +// +// std::vector expected_output = { +// // a0 = 0 +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 3.f, 4.f, 9.f, +// 2.f, 3.f, 9.f, +// 1.f, 2.f, 9.f, +// 5.f, 1.f, 9.f, +// 4.f, 5.f, 9.f, +// 3.f, 4.f, 9.f, +// +// // a0 = 1 (cropped original, fully padded slice) +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f, +// 9.f, 9.f, 9.f}; +// +// ASSERT_EQ(2U * 9U * 3U, expected_output.size()); +// +// OpTester test("Pad", 18); +// test.AddInput("data", input_shape, input_data); +// test.AddInput("pads", {static_cast(pads.size())}, pads); +// test.AddInput("constant_value", {}, {cv}); +// test.AddOutput("output", expected_shape, expected_output); +// test.AddAttribute("mode", "constant"); +// test.Run(); +// } // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { From e1e8fa9aa7ef428bb757e7edfa1967117f7268a0 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 19 Dec 2025 13:41:10 -0800 Subject: [PATCH 08/28] Add instrumentation --- onnxruntime/core/providers/cpu/tensor/pad.cc | 197 ++++++++++-------- .../test/providers/cpu/tensor/pad_test.cc | 90 ++++---- 2 files changed, 156 insertions(+), 131 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 6c3f1c71241c9..73ceb080e7ffe 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -382,6 +382,28 @@ void PadBase::ReshapePads(gsl::span src_pad, size_t src_dim_count reshaped_pad[inner_axis + new_dim_count] = SafeInt(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size; } +template +struct OutputSink { + void operator()(T* output, T value) const { +#ifdef _DEBUG + if (output < beg || output >= end) { + throw std::out_of_range("Pad OutputSink: Output pointer is out of range"); + } +#endif + *output = value; + } + +#ifdef _DEBUG + OutputSink(T* output, T* output_end) + : beg(output), end(output_end) {} + + T* beg; + T* end; +#else + OutputSink(T* /* output */, T* /* output_end */) {} +#endif +}; + // special handling for edge case where the input has one or more dims with value of 0 template static Status PadInputWithDimValueOfZero(OpKernelContext* ctx, @@ -406,11 +428,11 @@ static Status PadInputWithDimValueOfZero(OpKernelContext* ctx, // This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values) template -static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch, +static void PadAxis(OutputSink& sink, T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch, size_t block_size, size_t block_count) { for (size_t block_index = 0; block_index < block_count; block_index++) { for (size_t i = 0; i < block_size; i++) { - *output++ = *input; + sink(output++, *input); input += input_delta; } input += input_pitch; @@ -420,27 +442,27 @@ static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_ // These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1, // and inputPitch and inputDelta are just a single value added each iteration. template -static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) { +static void PadInnermostAxis(OutputSink& sink, T* output, T* input, ptrdiff_t input_delta, size_t block_count) { for (size_t block_index = 0; block_index < block_count; block_index++) { - *output++ = *input; + sink(output++, *input); input += input_delta; } } // For constant padding, there is no input, just a size to write the constant to template -static void PadAxisConstant(T* output, T constant, size_t size) { +static void PadAxisConstant(OutputSink& sink, T* output, T constant, size_t size) { if (size == 1) { - *output = constant; + sink(output, constant); } else if (size == 2) { - *output = constant; - *(output + 1) = constant; + sink(output, constant); + sink(output + 1, constant); } else { // This would be faster with SSE instructions. // That would mean to have an implementation for each type (uint8, uint32, uint64). T* end = output + size; for (; output != end;) - *output++ = constant; + sink(output++, constant); } } @@ -514,7 +536,11 @@ static Status PadImpl(OpKernelContext* ctx, // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); + + const SafeInt total_output_elems(output_shape.Size()); auto* output = reinterpret_cast(output_tensor.MutableDataRaw()); + auto* output_end = output + static_cast(total_output_elems); + OutputSink sink(output, output_end); // Early constant-fill: if any input extent is zero, no data to copy // only padding if any @@ -526,10 +552,9 @@ static Status PadImpl(OpKernelContext* ctx, } } - const SafeInt total_output_elems(output_shape.Size()); if (no_data_to_copy) { if (mode == Mode::Constant) { - PadAxisConstant(output, value, total_output_elems); + PadAxisConstant(sink, output, value, total_output_elems); return Status::OK(); } return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, @@ -575,10 +600,10 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[inner_axis]; const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (pre_pad > 0) { - PadAxisConstant(axis_start - static_cast(pre_pad), value, pre_pad); + PadAxisConstant(sink, axis_start - static_cast(pre_pad), value, pre_pad); } if (post_pad > 0) { - PadAxisConstant(output, value, post_pad); + PadAxisConstant(sink, output, value, post_pad); } output += post_pad; align_skip = pre_pad; @@ -590,10 +615,10 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (pre_pad > 0) { - PadAxisConstant(axis_start - static_cast(pre_pad * inner_pitch), value, pre_pad * inner_pitch); + PadAxisConstant(sink, axis_start - static_cast(pre_pad * inner_pitch), value, pre_pad * inner_pitch); } if (post_pad > 0) { - PadAxisConstant(output, value, post_pad * inner_pitch); + PadAxisConstant(sink, output, value, post_pad * inner_pitch); } output += inner_pitch * post_pad; align_skip += inner_pitch * pre_pad; @@ -611,24 +636,24 @@ static Status PadImpl(OpKernelContext* ctx, T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - SafeInt pre_pad = reshaped_pad[inner_axis]; - SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { if (pre_pad > 0) { - PadAxisConstant(axis_start - static_cast(pre_pad), *axis_start, pre_pad); + PadAxisConstant(sink, axis_start - static_cast(pre_pad), *axis_start, pre_pad); } if (post_pad > 0) { - PadAxisConstant(output, *(output - 1), post_pad); + PadAxisConstant(sink, output, *(output - 1), post_pad); } } else { // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode. // Also general loop below after handling first pad axis with non-pad axis works fine. if (pads[inner_axis] > 0) { - PadAxis(axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + PadAxis(sink, axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis])); } if (pads[inner_axis + data_rank] > 0) { - PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); } } @@ -642,11 +667,11 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (pre_pad > 0) { - PadAxis(axis_start - static_cast(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch, + PadAxis(sink, axis_start - static_cast(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch, pre_pad); } if (post_pad > 0) { - PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad); + PadAxis(sink, output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad); } output += inner_pitch * post_pad; align_skip += inner_pitch * pre_pad; @@ -670,94 +695,94 @@ static Status PadImpl(OpKernelContext* ctx, if (inner_no_pad_size == 1) { if (mode == Mode::Reflect) { if (pre_pad > 0) { - PadInnermostAxis(axis_start - static_cast(pre_pad), + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), axis_start + static_cast(pre_pad), -1 /* inputDelta */, pre_pad); } if (post_pad > 0) { - PadInnermostAxis(output, output - 2, -1 /* inputDelta */, post_pad); + PadInnermostAxis(sink, output, output - 2, -1 /* inputDelta */, post_pad); } } else { if (pre_pad > 0) { - PadInnermostAxis(axis_start - static_cast(pre_pad), + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), output - static_cast(pre_pad), 1 /* inputDelta */, pre_pad); } if (post_pad > 0) { - PadInnermostAxis(output, axis_start, 1 /* inputDelta */, post_pad); + PadInnermostAxis(sink, output, axis_start, 1 /* inputDelta */, post_pad); } } } else { // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode. if (mode == Mode::Reflect) { - PadAxis( - axis_start - static_cast(pre_pad), - axis_start + static_cast(pre_pad), - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - output - 2 * inner_no_pad_size, - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + output - 2 * inner_no_pad_size, + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } else { - PadAxis( - axis_start - static_cast(pre_pad), - output - pads[inner_axis] * inner_no_pad_size, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - axis_start, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + output - pads[inner_axis] * inner_no_pad_size, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } } output += post_pad; - align_skip = onnxruntime::narrow(pre_pad); + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; - SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; - SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (mode == Mode::Reflect) { - PadAxis( - axis_start - static_cast(pre_pad) * inner_pitch, - axis_start + static_cast(pre_pad) * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - pre_pad); - PadAxis( - output, - output - 2 * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - post_pad); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + axis_start + static_cast(pre_pad) * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + output - 2 * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + post_pad); } else { - PadAxis( - axis_start - static_cast(pre_pad) * inner_pitch, - output - static_cast(pre_pad) * inner_pitch, - 1, - 0, - inner_pitch, - pre_pad); - PadAxis( - output, - axis_start, - 1, - 0, - inner_pitch, - post_pad); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + output - static_cast(pre_pad) * inner_pitch, + 1, + 0, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_pitch, + post_pad); } output += inner_pitch * post_pad; align_skip += inner_pitch * pre_pad; diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index c82f17012fd47..386f1e4e37ec0 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1152,51 +1152,51 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// TEST(PadOpTest, ConstantMode_MixedSigns_Small) { -// const std::vector input_shape{2, 6, 4}; -// std::vector input_data(2 * 6 * 4); -// -// for (size_t i = 0; i < input_data.size(); ++i) { -// input_data[i] = static_cast((i % 5) + 1); -// } -// -// const std::vector pads{1, 3, -2, -1, 0, 1}; -// const float cv = 9.0f; -// const std::vector expected_shape{2, 9, 3}; -// -// std::vector expected_output = { -// // a0 = 0 -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 3.f, 4.f, 9.f, -// 2.f, 3.f, 9.f, -// 1.f, 2.f, 9.f, -// 5.f, 1.f, 9.f, -// 4.f, 5.f, 9.f, -// 3.f, 4.f, 9.f, -// -// // a0 = 1 (cropped original, fully padded slice) -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f, -// 9.f, 9.f, 9.f}; -// -// ASSERT_EQ(2U * 9U * 3U, expected_output.size()); -// -// OpTester test("Pad", 18); -// test.AddInput("data", input_shape, input_data); -// test.AddInput("pads", {static_cast(pads.size())}, pads); -// test.AddInput("constant_value", {}, {cv}); -// test.AddOutput("output", expected_shape, expected_output); -// test.AddAttribute("mode", "constant"); -// test.Run(); -// } +TEST(PadOpTest, ConstantMode_MixedSigns_Small) { + const std::vector input_shape{2, 6, 4}; + std::vector input_data(2 * 6 * 4); + + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast((i % 5) + 1); + } + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 9, 3}; + + std::vector expected_output = { + // a0 = 0 + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 3.f, 4.f, 9.f, + 2.f, 3.f, 9.f, + 1.f, 2.f, 9.f, + 5.f, 1.f, 9.f, + 4.f, 5.f, 9.f, + 3.f, 4.f, 9.f, + + // a0 = 1 (cropped original, fully padded slice) + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f}; + + ASSERT_EQ(2U * 9U * 3U, expected_output.size()); + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("constant_value", {}, {cv}); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.Run(); +} // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { From ec57b378ceae276d3e0c293c45cddb2def2d53f9 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 19 Dec 2025 15:26:09 -0800 Subject: [PATCH 09/28] Add tests --- onnxruntime/core/providers/cpu/tensor/pad.cc | 11 +- .../test/providers/cpu/tensor/pad_test.cc | 124 +++++++++++++++--- 2 files changed, 111 insertions(+), 24 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 73ceb080e7ffe..18c6b00cfa2a1 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -595,17 +595,22 @@ static Status PadImpl(OpKernelContext* ctx, output += align_skip; { T* axis_start = output; - output = input.CopyInnermostAxisSolitaryInnerStep(output); + // Compute the actual number of data elements to copy on the innermost axis (after cropping). + const size_t inner_extent = onnxruntime::narrow(input_extents[inner_axis]); + + // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent. + ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output)); const SafeInt pre_pad = reshaped_pad[inner_axis]; const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (pre_pad > 0) { + /// Pre - pad(innermost) retro - fill remains valid(write before row_start). PadAxisConstant(sink, axis_start - static_cast(pre_pad), value, pre_pad); } if (post_pad > 0) { - PadAxisConstant(sink, output, value, post_pad); + PadAxisConstant(sink, axis_start + inner_extent, value, post_pad); } - output += post_pad; + output = axis_start + inner_extent + static_cast(post_pad); align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 386f1e4e37ec0..3183086139fe8 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -763,10 +763,7 @@ edge // test handling of input with a 0 for a dimension TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold"; - } + // TODO: Unskip Dml when fixed #41968513 using T = TypeParam; RunAllOpsetAllDomainPadTests({0}, // 1D @@ -774,7 +771,9 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { {1, 1}, T(1), {2}, - {T(1), T(1)}); + {T(1), T(1)}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0}, // 1D empty pads {}, @@ -788,35 +787,45 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { {-1, 1}, T(1), {0}, - {}); + {}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 1, 1, 1}, T(1), {4, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 1, 1, 1}, T(1), {2, 4}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 0, 1, 0}, // empty pads for dim 1 T(1), {2, 2}, - {T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1)}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0, 2}, // 3D {}, {0, 1, 0, 0, 1, 0}, T(1), {2, 2, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant", OpTester::ExpectResult::kExpectSuccess, + {kDmlExecutionProvider}); } // Added output shape verification b/w the output shape generated by operator specific ONNX inference and // the output shape generated by operator specific ORT implementation. After adding this verification, @@ -1099,6 +1108,81 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) { + const std::vector input_shape{2, 6, 4}; + std::vector input_data(2 * 6 * 4); + + for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast((i % 5) + 1); + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + // starting from input shape {2,6,4} + // after padding: {2+1+-1,6+3-0,4-2_1} => {2,9,3} + const std::vector expected_shape{2, 9, 3}; + + const std::vector expected_data = { + // sample 0 + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + + // sample 1 + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, + 3.0F, 4.0F, 9.0F, + 2.0F, 3.0F, 9.0F, + 1.0F, 2.0F, 9.0F, + 5.0F, 1.0F, 9.0F, + 4.0F, 5.0F, 9.0F, + 3.0F, 4.0F, 9.0F}; + + OpTester test("Pad", 13); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("constant_value", {}, {cv}); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "constant"); + test.Run(); +} + +TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::vector input_shape = {1, 1, 4, 4}; + + const std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f}; + + // Calculate expected shape: + // dim0: 1 + 0 + 0 = 1 + // dim1: 1 + 0 + 0 = 1 + // dim2: 4 + -4 + 4 = 4 + // dim3: 4 + 0 + 0 = 4 + const std::vector expected_shape = {1, 1, 4, 4}; + const std::vector expected_data = { + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f}; + + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}); + test.AddInput("constant_value", {}, {0.0f}); + test.AddOutput("output", expected_shape, expected_data); + test.Run(); +} + TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { OpTester test("Pad", 18); test.AddAttribute("mode", "constant"); @@ -1164,28 +1248,26 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) { const float cv = 9.0f; const std::vector expected_shape{2, 9, 3}; - std::vector expected_output = { - // a0 = 0 + const std::vector expected_output{ 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, - 3.f, 4.f, 9.f, - 2.f, 3.f, 9.f, - 1.f, 2.f, 9.f, - 5.f, 1.f, 9.f, - 4.f, 5.f, 9.f, - 3.f, 4.f, 9.f, - - // a0 = 1 (cropped original, fully padded slice) 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, 9.f, 9.f, 9.f, - 9.f, 9.f, 9.f}; + 9.f, 9.f, 9.f, + 3.f, 4.f, 9.f, + 2.f, 3.f, 9.f, + 1.f, 2.f, 9.f, + 5.f, 1.f, 9.f, + 4.f, 5.f, 9.f, + 3.f, 4.f, 9.f}; ASSERT_EQ(2U * 9U * 3U, expected_output.size()); From 96a60451ba582eb645e499844dcfe397c6196875 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 22 Dec 2025 14:07:17 -0800 Subject: [PATCH 10/28] All Edge tests fail --- onnxruntime/core/providers/cpu/tensor/pad.cc | 67 +++++++++++++++---- .../test/providers/cpu/tensor/pad_test.cc | 23 +++++++ 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 18c6b00cfa2a1..dfedcc24b83c6 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -641,29 +641,72 @@ static Status PadImpl(OpKernelContext* ctx, T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - const SafeInt pre_pad = reshaped_pad[inner_axis]; - const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + // Edge is invalid if there is no data to duplicate on this axis. + // however, input extents have been validated to be non-zero above. + const size_t inner_extent = onnxruntime::narrow(input_extents[inner_axis]); + + const SafeInt inner_pitch = output_pitches[inner_axis]; + // Row bounds in linear element space: + // axis_start points to the beginning of the copied row (returned above as 'output' after alignSkip). + T* axis_end = axis_start + inner_extent * inner_no_pad_size; // one-past-last element of copied data + + const T* first_elem = axis_start; + const T* last_elem = axis_end - 1; if (inner_no_pad_size == 1) { + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (pre_pad > 0) { - PadAxisConstant(sink, axis_start - static_cast(pre_pad), *axis_start, pre_pad); + PadAxisConstant(sink, axis_start - narrow(pre_pad), *first_elem, pre_pad); + align_skip = align_skip + inner_pitch * pre_pad; } if (post_pad > 0) { - PadAxisConstant(sink, output, *(output - 1), post_pad); + PadAxisConstant(sink, axis_end, *last_elem, post_pad); } } else { // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode. // Also general loop below after handling first pad axis with non-pad axis works fine. - if (pads[inner_axis] > 0) { - PadAxis(sink, axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); + + // Source blocks inside the copied region + T* first_block_src = axis_start; // first block + T* last_block_src = axis_end - narrow(inner_no_pad_size); // last block + + const SafeInt pre_pad = pads[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + const SafeInt block_size = inner_no_pad_size; + + // PadAxis writes 'block_size' elements per block, and after each block sets input += input_pitch. + // Using input_delta=1 to walk within the block, input_pitch=-(inner_no_pad_size) + // to reset input to the block start. + if (pre_pad > 0) { + T* dst_pre = first_block_src - static_cast(pre_pad * block_size); + PadAxis(sink, dst_pre, + first_block_src, + 1, /*input_delta=*/ + -ptrdiff_t(inner_no_pad_size), /* input_pitch */ + block_size, + pre_pad); + + // Roll align_skip forward so the outer loop's `output += align_skip` won't skip these rows again. + align_skip = SafeInt(align_skip) + inner_pitch * pre_pad; } - if (pads[inner_axis + data_rank] > 0) { - PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + + if (post_pad > 0) { + // 2) POST-PAD: duplicate the last block forwards after row_end + T* dst_post = axis_end; // start immediately after the copied data + PadAxis(sink, + dst_post, + last_block_src, + 1, /*input_delta=*/ + -ptrdiff_t(inner_no_pad_size), /* input_pitch */ + block_size, + post_pad); + + // Advance the linear write cursor past the post-pad area + output = dst_post + narrow(post_pad * inner_no_pad_size); + } else { + output = axis_end; } } - output += post_pad; - align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 3183086139fe8..28b6e001e0c44 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1183,6 +1183,29 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { test.Run(); } +// GH Issue: https://github.com/microsoft/onnxruntime/issues/13332 +TEST(PadOpTest, ConstantPadDefaultValueMixedPads_GH_13332) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::vector input_shape{1, 1, 4, 4}; + const std::vector input_data{ + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f}; + + const std::vector pads{0, 0, 4, 1, 0, 0, -4, -1}; + const std::vector output_shape{1, 1, 4, 4}; + const std::vector expected_output(16, 0.0f); + + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", output_shape, expected_output); + test.Run(); +} + +// Internally reported TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { OpTester test("Pad", 18); test.AddAttribute("mode", "constant"); From a2570d0199c71f47eb2723ebb894eeccd711253c Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 22 Dec 2025 14:38:07 -0800 Subject: [PATCH 11/28] Revert "All Edge tests fail" This reverts commit 96a60451ba582eb645e499844dcfe397c6196875. --- onnxruntime/core/providers/cpu/tensor/pad.cc | 67 ++++--------------- .../test/providers/cpu/tensor/pad_test.cc | 23 ------- 2 files changed, 12 insertions(+), 78 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index dfedcc24b83c6..18c6b00cfa2a1 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -641,72 +641,29 @@ static Status PadImpl(OpKernelContext* ctx, T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - // Edge is invalid if there is no data to duplicate on this axis. - // however, input extents have been validated to be non-zero above. - const size_t inner_extent = onnxruntime::narrow(input_extents[inner_axis]); - - const SafeInt inner_pitch = output_pitches[inner_axis]; - // Row bounds in linear element space: - // axis_start points to the beginning of the copied row (returned above as 'output' after alignSkip). - T* axis_end = axis_start + inner_extent * inner_no_pad_size; // one-past-last element of copied data - - const T* first_elem = axis_start; - const T* last_elem = axis_end - 1; + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { - const SafeInt pre_pad = reshaped_pad[inner_axis]; - const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (pre_pad > 0) { - PadAxisConstant(sink, axis_start - narrow(pre_pad), *first_elem, pre_pad); - align_skip = align_skip + inner_pitch * pre_pad; + PadAxisConstant(sink, axis_start - static_cast(pre_pad), *axis_start, pre_pad); } if (post_pad > 0) { - PadAxisConstant(sink, axis_end, *last_elem, post_pad); + PadAxisConstant(sink, output, *(output - 1), post_pad); } } else { // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode. // Also general loop below after handling first pad axis with non-pad axis works fine. - - // Source blocks inside the copied region - T* first_block_src = axis_start; // first block - T* last_block_src = axis_end - narrow(inner_no_pad_size); // last block - - const SafeInt pre_pad = pads[inner_axis]; - const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; - const SafeInt block_size = inner_no_pad_size; - - // PadAxis writes 'block_size' elements per block, and after each block sets input += input_pitch. - // Using input_delta=1 to walk within the block, input_pitch=-(inner_no_pad_size) - // to reset input to the block start. - if (pre_pad > 0) { - T* dst_pre = first_block_src - static_cast(pre_pad * block_size); - PadAxis(sink, dst_pre, - first_block_src, - 1, /*input_delta=*/ - -ptrdiff_t(inner_no_pad_size), /* input_pitch */ - block_size, - pre_pad); - - // Roll align_skip forward so the outer loop's `output += align_skip` won't skip these rows again. - align_skip = SafeInt(align_skip) + inner_pitch * pre_pad; + if (pads[inner_axis] > 0) { + PadAxis(sink, axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); } - - if (post_pad > 0) { - // 2) POST-PAD: duplicate the last block forwards after row_end - T* dst_post = axis_end; // start immediately after the copied data - PadAxis(sink, - dst_post, - last_block_src, - 1, /*input_delta=*/ - -ptrdiff_t(inner_no_pad_size), /* input_pitch */ - block_size, - post_pad); - - // Advance the linear write cursor past the post-pad area - output = dst_post + narrow(post_pad * inner_no_pad_size); - } else { - output = axis_end; + if (pads[inner_axis + data_rank] > 0) { + PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } } + output += post_pad; + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 28b6e001e0c44..3183086139fe8 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1183,29 +1183,6 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { test.Run(); } -// GH Issue: https://github.com/microsoft/onnxruntime/issues/13332 -TEST(PadOpTest, ConstantPadDefaultValueMixedPads_GH_13332) { - OpTester test("Pad", 18); - test.AddAttribute("mode", "constant"); - - const std::vector input_shape{1, 1, 4, 4}; - const std::vector input_data{ - 1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, - 13.0f, 14.0f, 15.0f, 16.0f}; - - const std::vector pads{0, 0, 4, 1, 0, 0, -4, -1}; - const std::vector output_shape{1, 1, 4, 4}; - const std::vector expected_output(16, 0.0f); - - test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); - test.AddOutput("output", output_shape, expected_output); - test.Run(); -} - -// Internally reported TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { OpTester test("Pad", 18); test.AddAttribute("mode", "constant"); From 5a6a9798b6ddd1d9b5b4f03e96120b3c31bf69de Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 22 Dec 2025 15:50:08 -0800 Subject: [PATCH 12/28] Add missing Edge tests --- onnxruntime/core/providers/cpu/tensor/pad.cc | 4 +- .../test/providers/cpu/tensor/pad_test.cc | 136 ++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 18c6b00cfa2a1..d2466e644d70f 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -638,7 +638,9 @@ static Status PadImpl(OpKernelContext* ctx, while (input_counters) { output += align_skip; { + const SafeInt inner_extent = input_extents[inner_axis]; T* axis_start = output; + T* axis_end = axis_start + onnxruntime::narrow(inner_extent); output = input.CopyInnermostAxisSolitaryInnerStep(output); const SafeInt pre_pad = reshaped_pad[inner_axis]; @@ -662,7 +664,7 @@ static Status PadImpl(OpKernelContext* ctx, onnxruntime::narrow(pads[inner_axis + data_rank])); } } - output += post_pad; + output = axis_end + static_cast(post_pad); align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 3183086139fe8..15e44482ec89f 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1280,6 +1280,142 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) { test.Run(); } +TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { + const std::vector input_shape{2, 3, 5}; + + std::vector input_data(2 * 3 * 5); + std::iota(input_data.begin(), input_data.end(), 1.0f); + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 6, 4}; + + const std::vector expected_output{ + // depth 0 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + + // depth 1 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 3.0F, 4.0F, 5.0F, 9.0F, + 8.0F, 9.0F, 10.0F, 9.0F, + 13.0F, 14.0F, 15.0F, 9.0F}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("constant_value", {}, {cv}); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.Run(); +} + +TEST(PadOpTest, EdgeMode_ZeroExtentFails) { + std::vector input_shape = {4}; + // Generate input as above + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + std::vector pads = {-4, 3}; + + const std::vector expected_shape{3}; + const std::vector expected_data = {1.f, 2.f, 3.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "edge"); + test.Run(OpTester::ExpectResult::kExpectFailure); +} + +TEST(PadOpTest, EdgeMode_ExtentOne_Valid) { + const std::vector input_shape{4}; + const std::vector input_data{1.f, 1.f, 1.f, 1.f}; + const std::vector pads{-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_output{1.f, 1.f, 1.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + +// TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { +// // This covers the else branch of inner_no_pad_size != 1 +// const std::vector input_shape{2, 3, 2, 4}; +// std::vector input_data(2 * 3 * 2 * 4); +// std::iota(input_data.begin(), input_data.end(), 1.f); +// +// const std::vector expected_shape{2, 3, 8}; + +TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { + // Shape chosen to force FlattenInnerShape(): + // innermost dims {2,4} -> flattened to 8 + const std::vector input_shape = {2, 3, 2, 4}; + + std::vector input_data(2 * 3 * 2 * 4); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i); + } + + // ONNX pad order: [b0,b1,b2,b3,e0,e1,e2,e3] + // The below shape will cause flattening the last two input dims to 8 + const std::vector pads = { + 0, 0, 0, 0, // begin + 0, 0, 0, 1 // end pad only on last original axis + }; + + // Expected shape: + // flattened axis grows from 8 -> 12 + const std::vector expected_shape = {2, 3, 2, 5}; + + std::vector expected_output = { + // [0][0][0] + 0.f, 1.f, 2.f, 3.f, 3.f, + // [0][0][1] + 4.f, 5.f, 6.f, 7.f, 7.f, + + // [0][1][0] + 8.f, 9.f, 10.f, 11.f, 11.f, + // [0][1][1] + 12.f, 13.f, 14.f, 15.f, 15.f, + + // [0][2][0] + 16.f, 17.f, 18.f, 19.f, 19.f, + // [0][2][1] + 20.f, 21.f, 22.f, 23.f, 23.f, + + // [1][0][0] + 24.f, 25.f, 26.f, 27.f, 27.f, + // [1][0][1] + 28.f, 29.f, 30.f, 31.f, 31.f, + + // [1][1][0] + 32.f, 33.f, 34.f, 35.f, 35.f, + // [1][1][1] + 36.f, 37.f, 38.f, 39.f, 39.f, + + // [1][2][0] + 40.f, 41.f, 42.f, 43.f, 43.f, + // [1][2][1] + 44.f, 45.f, 46.f, 47.f, 47.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 // TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { // using T = float; From 635c6f8f2af5944a74da6302c39fd8abda3e67d8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 22 Dec 2025 18:25:57 -0800 Subject: [PATCH 13/28] Fix Reflect and add tests for Wrap --- onnxruntime/core/providers/cpu/tensor/pad.cc | 12 +++++ .../test/providers/cpu/tensor/pad_test.cc | 50 ++++++++++++------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index d2466e644d70f..d4692054683bd 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -522,6 +522,18 @@ static Status PadImpl(OpKernelContext* ctx, reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } + if (mode == Mode::Reflect) { + for (size_t i = 0; i < new_dims_count; ++i) { + const int64_t extent = input_extents[i]; // length after slicing + const bool reflect_on_axis = + (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing in reflect mode"); + } + } + } + // Compute true output dimensions for (size_t i = 0; i < data_rank; i++) { output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 15e44482ec89f..6c81779dc34d5 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1348,14 +1348,6 @@ TEST(PadOpTest, EdgeMode_ExtentOne_Valid) { test.Run(); } -// TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { -// // This covers the else branch of inner_no_pad_size != 1 -// const std::vector input_shape{2, 3, 2, 4}; -// std::vector input_data(2 * 3 * 2 * 4); -// std::iota(input_data.begin(), input_data.end(), 1.f); -// -// const std::vector expected_shape{2, 3, 8}; - TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { // Shape chosen to force FlattenInnerShape(): // innermost dims {2,4} -> flattened to 8 @@ -1417,16 +1409,38 @@ TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { } // Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 -// TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { -// using T = float; -// RunAllOpsetAllDomainPadTests({4}, -// {T(1), T(2), T(3), T(4)}, -// {-3, 3}, -// T(0), -// {4}, -// {4, 0, 0, 0}, -// "reflect"); -//} +TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_data = {2.f, 3.f, 4.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "reflect"); + test.Run(OpTester::ExpectResult::kExpectFailure, + "Pad reflect requires axis length >= 2 after slicing in reflect mode"); +} + +TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + + const std::vector expected_shape{4}; + // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] + const std::vector expected_data = {4, 4, 4, 4}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "wrap"); + test.Run(); +} } // namespace test } // namespace onnxruntime From 096d5393fe1b2baff7093d5b9a47a4dc8015aea3 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 22 Dec 2025 18:54:20 -0800 Subject: [PATCH 14/28] Test pass on CPU --- onnxruntime/core/providers/cpu/tensor/pad.cc | 29 +++++++++++-------- .../test/providers/cpu/tensor/pad_test.cc | 2 +- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index d4692054683bd..2ebeb25aaeffb 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -522,18 +522,6 @@ static Status PadImpl(OpKernelContext* ctx, reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } - if (mode == Mode::Reflect) { - for (size_t i = 0; i < new_dims_count; ++i) { - const int64_t extent = input_extents[i]; // length after slicing - const bool reflect_on_axis = - (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); - if (reflect_on_axis && extent < 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Pad reflect requires axis length >= 2 after slicing in reflect mode"); - } - } - } - // Compute true output dimensions for (size_t i = 0; i < data_rank; i++) { output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; @@ -545,6 +533,23 @@ static Status PadImpl(OpKernelContext* ctx, return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value); } + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode == Mode::Reflect) { + for (size_t i = 0; i < new_dims_count; ++i) { + const int64_t extent = input_extents[i]; // length after slicing + const bool reflect_on_axis = + (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. Input shape:", + orig_input_shape); + } + } + } + // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 6c81779dc34d5..72ec0f7506401 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1422,7 +1422,7 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "reflect"); test.Run(OpTester::ExpectResult::kExpectFailure, - "Pad reflect requires axis length >= 2 after slicing in reflect mode"); + "Pad reflect requires axis length >= 2 after slicing"); } TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { From 0c521ae5d906aaf70d172cb450c6a14e3c38a849 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 23 Dec 2025 15:15:35 -0800 Subject: [PATCH 15/28] Add some cases handling for CUDA --- onnxruntime/core/providers/cuda/cuda_utils.cu | 1 + onnxruntime/core/providers/cuda/tensor/pad.cc | 62 ++++++++++++++++--- .../test/providers/cpu/tensor/pad_test.cc | 30 ++++----- 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu index 934425656e3c9..59f2deda1805e 100644 --- a/onnxruntime/core/providers/cuda/cuda_utils.cu +++ b/onnxruntime/core/providers/cuda/cuda_utils.cu @@ -81,6 +81,7 @@ template std::unique_ptr> CreateConstantOnes(cudaStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) +SPECIALIZED_FILL(bool) SPECIALIZED_FILL(int16_t) SPECIALIZED_FILL(int32_t) SPECIALIZED_FILL(int64_t) diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index bdd6567d2ef34..e2e8842851c7c 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -94,7 +94,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaT; const auto& input_tensor = *ctx->Input(0); auto const& input_shape = input_tensor.Shape(); - int32_t dimension_count = static_cast(input_shape.NumDimensions()); + const size_t dimension_count = input_shape.NumDimensions(); const PadsVector* p_pads = &pads_; const PadsVector* p_slices = &slices_; @@ -134,15 +134,41 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { TArray input_strides(input_pitches); auto output_dims(input_shape.AsShapeVector()); - ORT_ENFORCE(static_cast(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); + ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); // Calculate output dimensions, and handle any negative padding TArray lower_pads(dimension_count); TArray upper_pads(dimension_count); - for (auto i = 0; i < dimension_count; i++) { - lower_pads[i] = (*p_pads)[i] + (*p_slices)[i]; - upper_pads[i] = (*p_pads)[static_cast(i) + dimension_count] + (*p_slices)[static_cast(i) + dimension_count]; - output_dims[i] += lower_pads[i] + upper_pads[i]; + for (size_t i = 0; i < dimension_count; i++) { + lower_pads[i] = SafeInt((*p_pads)[i]) + (*p_slices)[i]; + upper_pads[i] = SafeInt((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count]; + output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; + } + + TensorShapeVector input_extents; + input_extents.reserve(dimension_count); + for (size_t i = 0; i < dimension_count; i++) { + int64_t extent = std::max(SafeInt(input_dims[i]) + + (*p_slices)[i] + (*p_slices)[i + dimension_count], + 0U); + input_extents.push_back(extent); + } + + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode_ == Mode::Reflect) { + for (size_t i = 0; i < dimension_count; ++i) { + const int64_t extent = input_extents[i]; // length after slicing + const bool reflect_on_axis = + (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0; + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. Input shape:", + input_shape); + } + } } TensorShape output_shape(output_dims); @@ -154,6 +180,28 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { auto& output_tensor = *ctx->Output(0, output_shape); + // Early constant-fill: if any input extent is zero, no data to copy + // only padding if any + bool no_data_to_copy = false; + for (size_t i = 0; i < input_extents.size(); ++i) { + if (input_extents[i] == 0) { + no_data_to_copy = true; + break; + } + } + + if (no_data_to_copy) { + if (mode_ == Mode::Constant) { + cuda::Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableDataRaw()), + value, + input_shape.Size()); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode_), " with zero effective input extent"); + } + + // Case of all pads and slices being zero: just copy input to output if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) && std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) && output_shape.Size() > 0) { @@ -164,7 +212,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - if (IsNCHWInputWithPaddingAlongHAndW(static_cast(dimension_count), lower_pads, upper_pads)) { + if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) { // If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW) // NCHW input diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 72ec0f7506401..a7a54caea2fe8 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1145,8 +1145,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) { OpTester test("Pad", 13); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); - test.AddInput("constant_value", {}, {cv}); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "constant"); test.Run(); @@ -1177,8 +1177,8 @@ TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { 0.f, 0.f, 0.f, 0.f}; test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}); - test.AddInput("constant_value", {}, {0.0f}); + test.AddInput("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}, true); + test.AddInput("constant_value", {}, {0.0f}, true); test.AddOutput("output", expected_shape, expected_data); test.Run(); } @@ -1228,8 +1228,8 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { std::vector output_data; // empty now test.AddInput("data", input_shape, input_span); - test.AddInput("pads", pads_shape, pads); - test.AddInput("value", {}, {100.f}); + test.AddInput("pads", pads_shape, pads, true); + test.AddInput("value", {}, {100.f}, true); // Omit Axis input test.AddOutput("output", output_shape, output_data); @@ -1273,8 +1273,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); - test.AddInput("constant_value", {}, {cv}); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "constant"); test.Run(); @@ -1309,8 +1309,8 @@ TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); - test.AddInput("constant_value", {}, {cv}); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "constant"); test.Run(); @@ -1327,7 +1327,7 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "edge"); test.Run(OpTester::ExpectResult::kExpectFailure); @@ -1342,7 +1342,7 @@ TEST(PadOpTest, EdgeMode_ExtentOne_Valid) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "edge"); test.Run(); @@ -1402,7 +1402,7 @@ TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "edge"); test.Run(); @@ -1418,7 +1418,7 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "reflect"); test.Run(OpTester::ExpectResult::kExpectFailure, @@ -1436,7 +1436,7 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { OpTester test("Pad", 18); test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "wrap"); test.Run(); From b78cfe71a567763bf05d3c8200a23dafe207a802 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 23 Dec 2025 15:34:26 -0800 Subject: [PATCH 16/28] Clamp output dimensions, early exist on zero output --- onnxruntime/core/providers/cuda/tensor/pad.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index e2e8842851c7c..a841d93d1403e 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -142,7 +142,13 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { for (size_t i = 0; i < dimension_count; i++) { lower_pads[i] = SafeInt((*p_pads)[i]) + (*p_slices)[i]; upper_pads[i] = SafeInt((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count]; - output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; + output_dims[i] += std::max(0, SafeInt(lower_pads[i]) + upper_pads[i]); + } + + TensorShape output_shape(output_dims); + if (output_shape.Size() == 0) { + // No elements to output + return Status::OK(); } TensorShapeVector input_extents; @@ -171,8 +177,6 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { } } - TensorShape output_shape(output_dims); - // special case when there is a dim value of 0 in the shape. behavior depends on mode if (input_shape.Size() == 0) { ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); From f3e68f6a6387878798634288fbb3f7722ff5e907 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 23 Dec 2025 16:02:44 -0800 Subject: [PATCH 17/28] Produce outut before early exist --- onnxruntime/core/providers/cuda/tensor/pad.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index a841d93d1403e..f82d19e9732d6 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -142,13 +142,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { for (size_t i = 0; i < dimension_count; i++) { lower_pads[i] = SafeInt((*p_pads)[i]) + (*p_slices)[i]; upper_pads[i] = SafeInt((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count]; - output_dims[i] += std::max(0, SafeInt(lower_pads[i]) + upper_pads[i]); - } - - TensorShape output_shape(output_dims); - if (output_shape.Size() == 0) { - // No elements to output - return Status::OK(); + output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; } TensorShapeVector input_extents; @@ -177,12 +171,18 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { } } + TensorShape output_shape(output_dims); + // special case when there is a dim value of 0 in the shape. behavior depends on mode if (input_shape.Size() == 0) { ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); } auto& output_tensor = *ctx->Output(0, output_shape); + if (output_shape.Size() == 0) { + // No elements to output + return Status::OK(); + } // Early constant-fill: if any input extent is zero, no data to copy // only padding if any From 5ef84e2d5d52876005b7c475a1072bd02d432553 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 23 Dec 2025 16:27:55 -0800 Subject: [PATCH 18/28] Wrap is not supported on CUDA. Wrap test must be ver 18 --- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index a7a54caea2fe8..c1f16317ef8e7 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1434,7 +1434,7 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] const std::vector expected_data = {4, 4, 4, 4}; - OpTester test("Pad", 18); + OpTester test("Pad", 19); // CUDA registers only up to 18 and does not impl wrap mode test.AddInput("data", input_shape, input_data); test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); From 8319d476d91ec85f60208a801827ce9c9dc79b17 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 25 Dec 2025 14:54:06 -0800 Subject: [PATCH 19/28] Still have CUDA failures, not clear why cudaMemset is not working as expected. [ FAILED ] PadOpTest/0.Pad_Constant_DimWithZeroInput, where TypeParam = float [ FAILED ] PadOpTest/1.Pad_Constant_DimWithZeroInput, where TypeParam = double --- onnxruntime/core/providers/cpu/tensor/pad.cc | 78 +++++++++---------- onnxruntime/core/providers/cuda/tensor/pad.cc | 78 ++++++++++--------- onnxruntime/core/providers/cuda/tensor/pad.h | 1 + .../test/providers/cpu/tensor/pad_test.cc | 7 +- 4 files changed, 83 insertions(+), 81 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 2ebeb25aaeffb..b87e0d4b545f5 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -505,11 +505,11 @@ static Status PadImpl(OpKernelContext* ctx, TensorShapeVector reshaped_output_dims = reshaped_input_dims; TensorShapeVector input_starts; - TensorShapeVector input_extents; + TensorShapeVector effective_input_extents; // Calculate reshaped output dimensions, and handle any negative padding input_starts.reserve(new_dims_count); - input_extents.reserve(new_dims_count); + effective_input_extents.reserve(new_dims_count); for (size_t i = 0; i < new_dims_count; i++) { // Starts for every dimension. If slice is negative, we need to start further in, handled by the SliceIterator input_starts.push_back(-1 * reshaped_slice[i]); @@ -517,7 +517,7 @@ static Status PadImpl(OpKernelContext* ctx, int64_t extent = std::max(SafeInt(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count], 0U); - input_extents.push_back(extent); + effective_input_extents.push_back(extent); reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } @@ -527,29 +527,13 @@ static Status PadImpl(OpKernelContext* ctx, output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; } - // special case an input with one or more dim values of 0. edge case that is easier to handle - // separately than to complicate all the code for normal usage. + // If the input is empty, but output shape may not be, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error if (orig_input_shape.Size() == 0) { return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value); } - // Special case for Reflect mode: ensure all extents >= 2 after slicing - // otherwise reflection is not possible. Matches numpy behavior as ONNX only - // implies that this would be wrong as the start and end positions should be distinct - // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. - if (mode == Mode::Reflect) { - for (size_t i = 0; i < new_dims_count; ++i) { - const int64_t extent = input_extents[i]; // length after slicing - const bool reflect_on_axis = - (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); - if (reflect_on_axis && extent < 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Pad reflect requires axis length >= 2 after slicing. Input shape:", - orig_input_shape); - } - } - } - // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); @@ -559,17 +543,12 @@ static Status PadImpl(OpKernelContext* ctx, auto* output_end = output + static_cast(total_output_elems); OutputSink sink(output, output_end); - // Early constant-fill: if any input extent is zero, no data to copy - // only padding if any - bool no_data_to_copy = false; - for (size_t i = 0; i < input_extents.size(); ++i) { - if (input_extents[i] == 0) { - no_data_to_copy = true; - break; - } - } + // Early constant-fill: if any effective input extent is zero (input is not empty), no data to copy + // only padding if any for constant mode, for other modes it is an error + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); - if (no_data_to_copy) { + if (no_effective_data_to_copy) { if (mode == Mode::Constant) { PadAxisConstant(sink, output, value, total_output_elems); return Status::OK(); @@ -578,6 +557,23 @@ static Status PadImpl(OpKernelContext* ctx, "Pad: invalid mode: ", static_cast(mode), " with zero effective input extent"); } + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode == Mode::Reflect) { + for (size_t i = 0; i < new_dims_count; ++i) { + const int64_t extent = effective_input_extents[i]; // length after slicing + const bool reflect_on_axis = + (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. Input shape:", + orig_input_shape); + } + } + } + TensorPitches output_pitches(reshaped_output_dims); // Initial skip, sum up the start padding on each axis SafeInt align_skip = 0; @@ -588,9 +584,9 @@ static Status PadImpl(OpKernelContext* ctx, // Validate coverage: pre + copy + post == total SafeInt copy_elems = 1; - for (size_t i = 0, lim = input_extents.size(); i < lim; ++i) { + for (size_t i = 0, lim = effective_input_extents.size(); i < lim; ++i) { // All extents are positive here due to the no_data_to_copy check above - copy_elems *= input_extents[i]; + copy_elems *= effective_input_extents[i]; } const size_t prepad_elems = align_skip; @@ -599,9 +595,9 @@ static Status PadImpl(OpKernelContext* ctx, total_output_elems, prepad_elems, copy_elems, postpad_elems)); TensorShape input_shape(reshaped_input_dims); - SliceIterator input(input_tensor, input_shape, input_starts, input_extents, {}); + SliceIterator input(input_tensor, input_shape, input_starts, effective_input_extents, {}); - ExtentAxisCounters input_counters(input_extents); + ExtentAxisCounters input_counters(effective_input_extents); switch (mode) { case Mode::Constant: @@ -613,7 +609,7 @@ static Status PadImpl(OpKernelContext* ctx, { T* axis_start = output; // Compute the actual number of data elements to copy on the innermost axis (after cropping). - const size_t inner_extent = onnxruntime::narrow(input_extents[inner_axis]); + const size_t inner_extent = onnxruntime::narrow(effective_input_extents[inner_axis]); // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent. ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output)); @@ -633,7 +629,7 @@ static Status PadImpl(OpKernelContext* ctx, // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (pre_pad > 0) { @@ -655,7 +651,7 @@ static Status PadImpl(OpKernelContext* ctx, while (input_counters) { output += align_skip; { - const SafeInt inner_extent = input_extents[inner_axis]; + const SafeInt inner_extent = effective_input_extents[inner_axis]; T* axis_start = output; T* axis_end = axis_start + onnxruntime::narrow(inner_extent); output = input.CopyInnermostAxisSolitaryInnerStep(output); @@ -687,7 +683,7 @@ static Status PadImpl(OpKernelContext* ctx, // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (pre_pad > 0) { @@ -774,7 +770,7 @@ static Status PadImpl(OpKernelContext* ctx, // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axis_start = output - inner_pitch * input_extents[input_counters.Axis()]; + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; if (mode == Mode::Reflect) { diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index f82d19e9732d6..91d01234c006b 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -145,13 +145,49 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; } - TensorShapeVector input_extents; - input_extents.reserve(dimension_count); + TensorShapeVector effective_input_extents; + effective_input_extents.reserve(dimension_count); for (size_t i = 0; i < dimension_count; i++) { int64_t extent = std::max(SafeInt(input_dims[i]) + (*p_slices)[i] + (*p_slices)[i + dimension_count], 0U); - input_extents.push_back(extent); + effective_input_extents.push_back(extent); + } + + TensorShape output_shape(output_dims); + auto& output_tensor = *ctx->Output(0, output_shape); + + // If the input size is zero, but output shape is not, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error + if (input_shape.Size() == 0) { + ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); + if (mode_ == Mode::Constant) { + CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(), + Stream(ctx))); + cudaStreamSynchronize(Stream(ctx)); + } + // No error for other modes (preserve CPU historical behavior), + // but no output should be expected either + return Status::OK(); + } + + // Early constant-fill: input is not empty as above + // However, if any effective input extent is zero, no data to copy + // only padding if any. + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); + + if (no_effective_data_to_copy) { + if (mode_ == Mode::Constant) { + // Attempt to pad constant mode in case output is not empty + // all other modes are an error + CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(), + Stream(ctx))); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode_), " with zero effective input extent"); } // Special case for Reflect mode: ensure all extents >= 2 after slicing @@ -160,7 +196,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. if (mode_ == Mode::Reflect) { for (size_t i = 0; i < dimension_count; ++i) { - const int64_t extent = input_extents[i]; // length after slicing + const int64_t extent = effective_input_extents[i]; // length after slicing const bool reflect_on_axis = (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0; if (reflect_on_axis && extent < 2) { @@ -171,40 +207,6 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { } } - TensorShape output_shape(output_dims); - - // special case when there is a dim value of 0 in the shape. behavior depends on mode - if (input_shape.Size() == 0) { - ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); - } - - auto& output_tensor = *ctx->Output(0, output_shape); - if (output_shape.Size() == 0) { - // No elements to output - return Status::OK(); - } - - // Early constant-fill: if any input extent is zero, no data to copy - // only padding if any - bool no_data_to_copy = false; - for (size_t i = 0; i < input_extents.size(); ++i) { - if (input_extents[i] == 0) { - no_data_to_copy = true; - break; - } - } - - if (no_data_to_copy) { - if (mode_ == Mode::Constant) { - cuda::Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableDataRaw()), - value, - input_shape.Size()); - return Status::OK(); - } - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "Pad: invalid mode: ", static_cast(mode_), " with zero effective input extent"); - } - // Case of all pads and slices being zero: just copy input to output if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) && std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) && diff --git a/onnxruntime/core/providers/cuda/tensor/pad.h b/onnxruntime/core/providers/cuda/tensor/pad.h index b206a35995e4e..b7e4131795292 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.h +++ b/onnxruntime/core/providers/cuda/tensor/pad.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index c1f16317ef8e7..cc72a17ff57e2 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -67,7 +67,7 @@ static void RunAllOpsetAllDomainPadTests( bool value_is_initializer; }; const std::vector all_test_params{ - {false, false}, + {true, false}, #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__)) // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise @@ -1434,7 +1434,10 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] const std::vector expected_data = {4, 4, 4, 4}; - OpTester test("Pad", 19); // CUDA registers only up to 18 and does not impl wrap mode + // CUDA registers only up to 18 and does not impl wrap mode + // so we force version to 19 to automatically exclude EPs that do not + // implement wrap mode similar to the above tests. + OpTester test("Pad", 19); test.AddInput("data", input_shape, input_data); test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); From f6f15db2ab04f261ff8647113d53661514256792 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 5 Jan 2026 12:36:37 -0800 Subject: [PATCH 20/28] Address Copilot review comments and some bugs --- onnxruntime/core/providers/cpu/tensor/pad.cc | 4 +- .../core/providers/cpu/tensor/padbase.h | 6 +-- onnxruntime/core/providers/cuda/tensor/pad.cc | 17 ++++--- onnxruntime/core/providers/cuda/tensor/pad.h | 1 - .../test/providers/cpu/tensor/pad_test.cc | 50 ++----------------- 5 files changed, 20 insertions(+), 58 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index b87e0d4b545f5..09f0bd276d453 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -516,7 +516,7 @@ static Status PadImpl(OpKernelContext* ctx, // Do not allow negative extents int64_t extent = std::max(SafeInt(reshaped_input_dims[i]) + reshaped_slice[i] + reshaped_slice[i + new_dims_count], - 0U); + 0LL); effective_input_extents.push_back(extent); reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; @@ -617,7 +617,7 @@ static Status PadImpl(OpKernelContext* ctx, const SafeInt pre_pad = reshaped_pad[inner_axis]; const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (pre_pad > 0) { - /// Pre - pad(innermost) retro - fill remains valid(write before row_start). + /// Pre-pad(innermost) retro-fill remains valid(write before row_start). PadAxisConstant(sink, axis_start - static_cast(pre_pad), value, pre_pad); } if (post_pad > 0) { diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h index 62324f1b48df1..e2ab6ff6c8fb1 100644 --- a/onnxruntime/core/providers/cpu/tensor/padbase.h +++ b/onnxruntime/core/providers/cpu/tensor/padbase.h @@ -96,9 +96,9 @@ class PadBase { SafeInt(postpad_elems); if (checked_sum != total_output_elems) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - +"Pad coverage invalid: pre=", prepad_elems, - +" copy=", copy_elems, " post=", postpad_elems, - +" total=", total_output_elems); + "Pad coverage invalid: pre=", prepad_elems, + " copy=", copy_elems, " post=", postpad_elems, + " total=", total_output_elems); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index 91d01234c006b..656890e796a1c 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -150,7 +150,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { for (size_t i = 0; i < dimension_count; i++) { int64_t extent = std::max(SafeInt(input_dims[i]) + (*p_slices)[i] + (*p_slices)[i + dimension_count], - 0U); + 0LL); effective_input_extents.push_back(extent); } @@ -163,9 +163,11 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { if (input_shape.Size() == 0) { ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); if (mode_ == Mode::Constant) { - CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(), - Stream(ctx))); - cudaStreamSynchronize(Stream(ctx)); + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } } // No error for other modes (preserve CPU historical behavior), // but no output should be expected either @@ -182,8 +184,11 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { if (mode_ == Mode::Constant) { // Attempt to pad constant mode in case output is not empty // all other modes are an error - CUDA_CALL_THROW(cudaMemsetAsync(output_tensor.MutableDataRaw(), value, output_tensor.SizeInBytes(), - Stream(ctx))); + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } return Status::OK(); } return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, diff --git a/onnxruntime/core/providers/cuda/tensor/pad.h b/onnxruntime/core/providers/cuda/tensor/pad.h index b7e4131795292..b206a35995e4e 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.h +++ b/onnxruntime/core/providers/cuda/tensor/pad.h @@ -1,5 +1,4 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index cc72a17ff57e2..324934346ba65 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -911,6 +911,7 @@ class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest void PadOpTest_Pad_Reflect_DimWithZeroInput_Test::TestBody() { using T = TypeParam; + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 0, 1, 0}, // allowed if it doesn't pad the empty dim @@ -920,8 +921,9 @@ void PadOpTest_Pad_Reflect_DimWithZeroInput_Test::TestBody() { "reflect", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); // DML: Unskip when fixed #41968513 + {kDmlExecutionProvider}); + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({0, 2, 1}, // 3D {}, {1, 1, 1, 1, 1, 1}, // not allowed if it pads the empty dim @@ -931,7 +933,7 @@ void PadOpTest_Pad_Reflect_DimWithZeroInput_Test::TestBody() { "reflect", OpTester::ExpectResult::kExpectFailure, "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", - {kDmlExecutionProvider, kTensorrtExecutionProvider}); // DML: Unskip when fixed #41968513 + {kDmlExecutionProvider, kTensorrtExecutionProvider}); } TEST(PadOpTest, BoolType) { @@ -1108,50 +1110,6 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -TEST(PadOpTest, ConstantMode_MixedSigns_Small_F32) { - const std::vector input_shape{2, 6, 4}; - std::vector input_data(2 * 6 * 4); - - for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = static_cast((i % 5) + 1); - - const std::vector pads{1, 3, -2, -1, 0, 1}; - const float cv = 9.0f; - // starting from input shape {2,6,4} - // after padding: {2+1+-1,6+3-0,4-2_1} => {2,9,3} - const std::vector expected_shape{2, 9, 3}; - - const std::vector expected_data = { - // sample 0 - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - - // sample 1 - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 9.0F, 9.0F, 9.0F, - 3.0F, 4.0F, 9.0F, - 2.0F, 3.0F, 9.0F, - 1.0F, 2.0F, 9.0F, - 5.0F, 1.0F, 9.0F, - 4.0F, 5.0F, 9.0F, - 3.0F, 4.0F, 9.0F}; - - OpTester test("Pad", 13); - test.AddInput("data", input_shape, input_data); - test.AddInput("pads", {static_cast(pads.size())}, pads, true); - test.AddInput("constant_value", {}, {cv}, true); - test.AddOutput("output", expected_shape, expected_data); - test.AddAttribute("mode", "constant"); - test.Run(); -} - TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { OpTester test("Pad", 18); test.AddAttribute("mode", "constant"); From 7e583dcb1f48d17e09008e2b7a5690e458111555 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 5 Jan 2026 19:02:07 -0800 Subject: [PATCH 21/28] Address CI failures --- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 324934346ba65..a27271465663a 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1288,7 +1288,9 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "edge"); - test.Run(OpTester::ExpectResult::kExpectFailure); + test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, ""); + test.RunWithConfig(); } TEST(PadOpTest, EdgeMode_ExtentOne_Valid) { @@ -1379,8 +1381,10 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "reflect"); - test.Run(OpTester::ExpectResult::kExpectFailure, - "Pad reflect requires axis length >= 2 after slicing"); + test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, + "Pad reflect requires axis length >= 2 after slicing"); + test.RunWithConfig(); } TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { @@ -1400,7 +1404,9 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "wrap"); - test.Run(); + test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, ""); + test.RunWithConfig(); } } // namespace test From 0058aae7ec2f7ef54adec0ca3b0fa969a903ab68 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 6 Jan 2026 11:43:17 -0800 Subject: [PATCH 22/28] Fix accidentally broken test --- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index a27271465663a..f2fb4efefae4d 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1405,7 +1405,6 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "wrap"); test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); - test.Config(OpTester::ExpectResult::kExpectFailure, ""); test.RunWithConfig(); } From 49e7359aa5f251373a1e8df9d341b8037b2074d4 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 6 Jan 2026 13:10:40 -0800 Subject: [PATCH 23/28] Address missing ep exclusion and re-instate original cast --- onnxruntime/core/providers/cpu/tensor/pad.cc | 6 +++--- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 09f0bd276d453..84e22d86dec0c 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -496,9 +496,9 @@ static Status PadImpl(OpKernelContext* ctx, // Reshape padding const size_t new_dims_count = reshaped_input_dims.size(); const size_t inner_axis = new_dims_count - 1; - const int64_t inner_no_pad_size = output_dims[inner_axis] > 0 - ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] - : 0; + const size_t inner_no_pad_size = narrow(output_dims[inner_axis] > 0 + ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] + : 0); PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count); PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad); PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice); diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index f2fb4efefae4d..d38e35771a75d 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1107,7 +1107,7 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider}); } TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { From e836ba8253e5b52d1b34856ef17df0b1654454e2 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 6 Jan 2026 13:51:51 -0800 Subject: [PATCH 24/28] Remove extra files --- onnxruntime/core/providers/cpu/tensor/pad.cc | 2 +- onnxruntime/test/testdata/test_pad_rce.onnx | Bin 179 -> 0 bytes onnxruntime/test/testdata/test_pad_rce.py | 27 ------------------- 3 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 onnxruntime/test/testdata/test_pad_rce.onnx delete mode 100644 onnxruntime/test/testdata/test_pad_rce.py diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 84e22d86dec0c..681494382c37a 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -347,7 +347,7 @@ void PadBase::FlattenInnerShape(gsl::span input_dims, gsl::span slices, TensorShapeVector& reshaped_dims) { const size_t dims_count = input_dims.size(); size_t inner_axis = dims_count - 1; - SafeInt inner_size = 1; + SafeInt inner_size = 1; // Find all inner most dimensions that can be flattened. do { diff --git a/onnxruntime/test/testdata/test_pad_rce.onnx b/onnxruntime/test/testdata/test_pad_rce.onnx deleted file mode 100644 index 10ddd88cc517c800299a461376e6573be2ced17a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 179 zcmdQ<^HomS0)|lvQF5NKDZZ z=3>dsPf1ncfGJwQ$SlMGl<@_M=0u4?jS}MF;^N>HV&h@}!6*TU(Lx+tY#i)DOh6F{ iOl!DUfI1kGB)Fg!3h{CA07W<;W^ge$v2Za6@Bsj<0x1*# diff --git a/onnxruntime/test/testdata/test_pad_rce.py b/onnxruntime/test/testdata/test_pad_rce.py deleted file mode 100644 index 0a9faac70aab3..0000000000000 --- a/onnxruntime/test/testdata/test_pad_rce.py +++ /dev/null @@ -1,27 +0,0 @@ -import onnx -from onnx import TensorProto, helper - - -def create_pad_model(): - input_data = helper.make_tensor_value_info("input", TensorProto.UINT64, [None, None, None]) - pads = helper.make_tensor_value_info("pads", TensorProto.INT64, [None]) - constant_value = helper.make_tensor_value_info("constant_value", TensorProto.UINT64, []) - - output = helper.make_tensor_value_info("output", TensorProto.UINT64, [None, None, None, None]) - - pad_node = helper.make_node( - op_type="Pad", - inputs=["input", "pads", "constant_value"], - outputs=["output"], - mode="constant", # or reflect/edge - ) - graph = helper.make_graph( - nodes=[pad_node], name="PadModel", inputs=[input_data, pads, constant_value], outputs=[output] - ) - - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) - onnx.save(model, "test_pad_rce.onnx") - - -if __name__ == "__main__": - create_pad_model() From 9ce795825db0e4c86af9b91110506875d770ee30 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 6 Jan 2026 16:42:25 -0800 Subject: [PATCH 25/28] Fix TYPED_TEST macro expansion --- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index d38e35771a75d..20e6a8523c0bd 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -899,17 +899,7 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {kDmlExecutionProvider}); } -static_assert(sizeof("Pad_Reflect_DimWithZeroInput") > 1, "test-name must not be empty"); -template -class PadOpTest_Pad_Reflect_DimWithZeroInput_Test : public PadOpTest { - private: - typedef PadOpTest TestFixture; - typedef gtest_TypeParam_ TypeParam; - void TestBody() override; -}; -[[maybe_unused]] static bool gtest_PadOpTest_Pad_Reflect_DimWithZeroInput_registered_ = ::testing::internal::TypeParameterizedTest, gtest_type_params_PadOpTest_>::Register("", ::testing::internal::CodeLocation("D:\\dev\\ort_main\\onnxruntime\\test\\providers\\cpu\\tensor\\pad_test.cc", 892), "PadOpTest", "Pad_Reflect_DimWithZeroInput", 0, ::testing::internal::GenerateNames()); -template -void PadOpTest_Pad_Reflect_DimWithZeroInput_Test::TestBody() { +TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { using T = TypeParam; // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({2, 0}, // 2D From 2c62d9fbd920abec0c85d4ef8d154cf7d794e85d Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 7 Jan 2026 11:26:50 -0800 Subject: [PATCH 26/28] Address review and CI failures --- .../test/providers/cpu/tensor/pad_test.cc | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 20e6a8523c0bd..11cdad7010c84 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -67,7 +67,7 @@ static void RunAllOpsetAllDomainPadTests( bool value_is_initializer; }; const std::vector all_test_params{ - {true, false}, + {false, false}, #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__)) // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise @@ -772,7 +772,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2}, {T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0}, // 1D empty pads @@ -780,7 +780,10 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { {0, 0}, T(1), {0}, - {}); + {}, + "constant", + OpTester::ExpectResult::kExpectSuccess, "", + {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0}, // 1D offsetting pads {}, @@ -788,7 +791,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {0}, {}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D @@ -797,7 +800,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {4, 2}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0, 2}, @@ -806,7 +809,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 4}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({0, 2}, @@ -815,7 +818,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 2}, {T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0, 2}, // 3D @@ -824,7 +827,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 2, 2}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, + "constant", OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); } // Added output shape verification b/w the output shape generated by operator specific ONNX inference and @@ -895,7 +898,7 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {2, 4, 0}, {}, "edge", - OpTester::ExpectResult::kExpectSuccess, + OpTester::ExpectResult::kExpectSuccess, "", {kDmlExecutionProvider}); } @@ -1164,10 +1167,11 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { // Expected shape is as follows: // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output - // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 0x10000 + // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 65554 // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3 - // Resulting shape is {0, 0x10000, 3} with 0 at the front. + // Resulting shape is {0, 65554, 3} with 0 at the front. // How do we handle zero shapes? Currently ONNX spec allows it. + // We choose to produce a empty tensor constexpr int64_t dim0 = 2 + 1 - 3; constexpr int64_t dim1 = 18 + 0x100000 - 0; constexpr int64_t dim2 = 4 + -2 + 1; @@ -1225,7 +1229,8 @@ TEST(PadOpTest, ConstantMode_MixedSigns_Small) { test.AddInput("constant_value", {}, {cv}, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "constant"); - test.Run(); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); } TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { @@ -1261,7 +1266,8 @@ TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { test.AddInput("constant_value", {}, {cv}, true); test.AddOutput("output", expected_shape, expected_output); test.AddAttribute("mode", "constant"); - test.Run(); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); } TEST(PadOpTest, EdgeMode_ZeroExtentFails) { @@ -1278,7 +1284,7 @@ TEST(PadOpTest, EdgeMode_ZeroExtentFails) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "edge"); - test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); test.Config(OpTester::ExpectResult::kExpectFailure, ""); test.RunWithConfig(); } @@ -1371,7 +1377,8 @@ TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "reflect"); - test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); test.Config(OpTester::ExpectResult::kExpectFailure, "Pad reflect requires axis length >= 2 after slicing"); test.RunWithConfig(); @@ -1394,7 +1401,8 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { test.AddInput("pads", {static_cast(pads.size())}, pads, true); test.AddOutput("output", expected_shape, expected_data); test.AddAttribute("mode", "wrap"); - test.ConfigExcludeEps({kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); test.RunWithConfig(); } From 584876c974729793a3d4659da2da776ff7afd13d Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 7 Jan 2026 15:14:59 -0800 Subject: [PATCH 27/28] Address review comments --- onnxruntime/core/providers/cpu/tensor/pad.cc | 2 +- onnxruntime/test/providers/cpu/tensor/pad_test.cc | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 681494382c37a..9588f69697cfb 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -387,7 +387,7 @@ struct OutputSink { void operator()(T* output, T value) const { #ifdef _DEBUG if (output < beg || output >= end) { - throw std::out_of_range("Pad OutputSink: Output pointer is out of range"); + ORT_THROW("Pad OutputSink: Output pointer is out of range"); } #endif *output = value; diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 11cdad7010c84..6babebb1e561f 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -1167,14 +1167,14 @@ TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { // Expected shape is as follows: // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output - // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 65554 + // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 1,048,594 // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3 - // Resulting shape is {0, 65554, 3} with 0 at the front. + // Resulting shape is {0, 1048594, 3} with 0 at the front. // How do we handle zero shapes? Currently ONNX spec allows it. // We choose to produce a empty tensor - constexpr int64_t dim0 = 2 + 1 - 3; - constexpr int64_t dim1 = 18 + 0x100000 - 0; - constexpr int64_t dim2 = 4 + -2 + 1; + constexpr int64_t dim0 = 2LL + 1 - 3; + constexpr int64_t dim1 = 18LL + 0x100000 - 0; + constexpr int64_t dim2 = 4LL + -2 + 1; const std::initializer_list output_shape{dim0, dim1, dim2}; std::vector output_data; // empty now From 4d386d09c9801b3b8ccad3cb5578e544515e893c Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 7 Jan 2026 18:39:39 -0800 Subject: [PATCH 28/28] Skip for DML EP --- .../test/providers/cpu/tensor/pad_test.cc | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 6babebb1e561f..49c9d360f9046 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -764,6 +764,9 @@ edge // test handling of input with a 0 for a dimension TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { // TODO: Unskip Dml when fixed #41968513 + if (DefaultDmlExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold"; + } using T = TypeParam; RunAllOpsetAllDomainPadTests({0}, // 1D @@ -772,8 +775,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2}, {T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D empty pads {}, @@ -781,9 +783,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {0}, {}, - "constant", - OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D offsetting pads {}, @@ -791,8 +791,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {0}, {}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -800,8 +799,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {4, 2}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, @@ -809,8 +807,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 4}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, @@ -818,8 +815,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 2}, {T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); RunAllOpsetAllDomainPadTests({2, 0, 2}, // 3D {}, @@ -827,8 +823,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { T(1), {2, 2, 2}, {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, - "constant", OpTester::ExpectResult::kExpectSuccess, "", - {kDmlExecutionProvider}); + "constant"); } // Added output shape verification b/w the output shape generated by operator specific ONNX inference and // the output shape generated by operator specific ORT implementation. After adding this verification,