From d06ad1101606598d01142e9420fc6f9f929cc0f3 Mon Sep 17 00:00:00 2001 From: Susanta Bhattacharjee Date: Fri, 17 Oct 2025 16:16:37 +0530 Subject: [PATCH 1/3] Optimize copy tensor with padding Tensor layout related properties are calculated once, and those cached values are used during per-element offset calculation. This brings ~200x improvement in wait time between two queries for the PhiSlica model. That means a user has to wait only for 0.36 sec (instead of 74 sec !!!) between two queries. These numbers are from LNL. JIRA: https://jira.devtools.intel.com/browse/CVS-174810 --- .../include/intel_gpu/runtime/layout.hpp | 6 +++ .../intel_gpu/src/plugin/common_utils.cpp | 29 ++++++++--- src/plugins/intel_gpu/src/runtime/layout.cpp | 49 +++++++++++++++++++ 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index aeb0f2182b5fde..e525d722411a67 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp @@ -320,6 +320,10 @@ struct layout { // element == { 0,0,0,0 } means first no-padding (i.e. data) element size_t get_linear_offset(tensor element = tensor(0)) const; + // Get variables needed for computing linear offset for a tensor with padding + void get_linear_offset_params(tensor& start_points, tensor& end_points, int64_t* padded_sizes, + int64_t* axes_map, size_t& map_size); + /// @brief Get aligned linear size calculated as multiplication of all elements. 
size_t get_linear_size() const; @@ -462,6 +466,8 @@ struct layout { } private: + static void get_axes_map(cldnn::format& fmt, int64_t* axes_map, size_t& map_size); + /// The size of the @ref memory (excluding padding) ov::PartialShape size; }; diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp index b1969aabebf0de..90ceb96c998ada 100644 --- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp +++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp @@ -25,16 +25,29 @@ void convert_and_copy_no_pad(const src_t* src, dst_t* dst, size_t size) { dst[i] = static_cast(src[i]); } +#define MAX_NUM_AXES 6 + template void convert_and_copy_padded_source(const src_t* src, dst_t* dst, cldnn::layout layout) { - cldnn::tensor size = layout.get_tensor(); - for (int64_t b = 0; b < size.batch[0]; b++) { - for (int64_t f = 0; f < size.feature[0]; f++) { - for (int64_t w = 0; w < size.spatial[3]; w++) { - for (int64_t z = 0; z < size.spatial[2]; z++) { - for (int64_t y = 0; y < size.spatial[1]; y++) { - for (int64_t x = 0; x < size.spatial[0]; x++) { - *dst++ = static_cast(src[layout.get_linear_offset(cldnn::tensor(b, f, x, y, z, w))]); + cldnn::tensor axes_start_point, axes_end_point; + int64_t padded_sizes[MAX_NUM_AXES], axes_map[MAX_NUM_AXES]; + size_t map_len = MAX_NUM_AXES; + + layout.get_linear_offset_params(axes_start_point, axes_end_point, padded_sizes, axes_map, map_len); + + for (int64_t b = axes_start_point.batch[0]; b < axes_end_point.batch[0]; b++) { + for (int64_t f = axes_start_point.feature[0]; f < axes_end_point.feature[0]; f++) { + for (int64_t w = axes_start_point.spatial[3]; w < axes_end_point.spatial[3]; w++) { + for (int64_t z = axes_start_point.spatial[2]; z < axes_end_point.spatial[2]; z++) { + for (int64_t y = axes_start_point.spatial[1]; y < axes_end_point.spatial[1]; y++) { + for (int64_t x = axes_start_point.spatial[0]; x < axes_end_point.spatial[0]; x++) { + int64_t element_sizes[MAX_NUM_AXES] = 
{b, f, x, y, z, w}; + size_t offset = element_sizes[axes_map[0]]; + + for (size_t i = 1; i < map_len; i++) + offset = offset * padded_sizes[i] + element_sizes[axes_map[i]]; + + *dst++ = static_cast(src[offset]); } } } diff --git a/src/plugins/intel_gpu/src/runtime/layout.cpp b/src/plugins/intel_gpu/src/runtime/layout.cpp index 05ca549151d9fb..1b17ee3dc32c45 100644 --- a/src/plugins/intel_gpu/src/runtime/layout.cpp +++ b/src/plugins/intel_gpu/src/runtime/layout.cpp @@ -335,6 +335,55 @@ std::vector layout::get_pitches() const { return pitches; } +void layout::get_axes_map(cldnn::format& fmt, int64_t* axes_map, size_t& map_size) { + const auto& output_order = fmt.order(); + const auto& internal_order = fmt.internal_order(); + std::vector sizes_map(output_order.size(), 0); + + //output_order has more elements than allocated in axes_map + if (output_order.size() > map_size) { + OPENVINO_THROW("Layout dimension higher than expected" + std::to_string(output_order.size())); + } + + map_size = output_order.size(); + + for (size_t i = 0; i < map_size; i++) { + auto c = output_order[i]; + auto pos = internal_order.find(c); + + if (pos == std::string::npos) + OPENVINO_THROW("Unknown coord type: " + std::to_string(c)); + + axes_map[i] = pos; + } +} + +void layout::get_linear_offset_params(tensor& start_points, tensor& end_points, int64_t* padded_sizes, + int64_t* axes_map, size_t& map_size) { + auto default_fmt = format::get_default_format(format.dimension(), format::is_weights_format(format), format::is_grouped(format)); + + std::vector lower_sizes, upper_sizes; + lower_sizes.assign(data_padding._lower_size.begin(), data_padding._lower_size.begin() + format.dimension()); + upper_sizes.assign(data_padding._upper_size.begin(), data_padding._upper_size.begin() + format.dimension()); + start_points = tensor(default_fmt, lower_sizes, 0); + const auto& u_padd = tensor(default_fmt, upper_sizes, 0); + + auto t = get_tensor(); + end_points = t + start_points; + + 
std::replace(t.raw.begin(), t.raw.end(), 0, 1); + + get_axes_map(format, axes_map, map_size); + const auto& p_sizes = (t + start_points + u_padd).sizes(format); + + if (p_sizes.size() < map_size) { + OPENVINO_THROW("Unsupported padded layout dimension" + std::to_string(p_sizes.size())); + } + + for (size_t i = 0; i < p_sizes.size(); i++) { + padded_sizes[i] = p_sizes[i]; + } +} size_t layout::get_linear_offset(tensor element) const { auto default_fmt = format::get_default_format(format.dimension(), format::is_weights_format(format), format::is_grouped(format)); From 2cc83e01d8503abf28576ea45b4af47b69cb446a Mon Sep 17 00:00:00 2001 From: Susanta Bhattacharjee Date: Tue, 21 Oct 2025 13:25:03 +0530 Subject: [PATCH 2/3] Moved the calculations to format.hpp and common_utils.cpp from layout.cpp. --- .../include/intel_gpu/runtime/format.hpp | 25 +++++++- .../include/intel_gpu/runtime/layout.hpp | 9 +-- .../intel_gpu/src/plugin/common_utils.cpp | 40 +++++++++++-- src/plugins/intel_gpu/src/runtime/layout.cpp | 57 +++---------------- 4 files changed, 70 insertions(+), 61 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp index ae23ab89bbe1f4..a529a1c533378a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp @@ -14,7 +14,7 @@ #include #include #include - +#include "openvino/core/except.hpp" namespace cldnn { /// @addtogroup cpp_api C++ API @@ -302,6 +302,29 @@ struct format { fmt == bfvuwzyx); } + static void get_axes_map(const format& fmt, int64_t* axes_map, size_t& map_size) { + const auto& o_order = fmt.order(); + const auto& i_order = fmt.internal_order(); + std::vector sizes_map(o_order.size(), 0); + + // output_order has more elements than allocated in axes_map + if (o_order.size() > map_size) { + OPENVINO_THROW("Layout dimension higher than expected" + std::to_string(o_order.size())); + } + 
+ map_size = o_order.size(); + + for (size_t i = 0; i < map_size; i++) { + auto c = o_order[i]; + auto pos = i_order.find(c); + + if (pos == std::string::npos) + OPENVINO_THROW("Unknown coord type: " + c); + + axes_map[i] = pos; + } + } + static format get_default_format(size_t rank, bool is_weights = false, bool is_grouped = false); static bool is_default_format(const format& fmt); diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index e525d722411a67..c9126dca4bccae 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp @@ -320,10 +320,6 @@ struct layout { // element == { 0,0,0,0 } means first no-padding (i.e. data) element size_t get_linear_offset(tensor element = tensor(0)) const; - // Get variables needed for computing linear offset for a tensor with padding - void get_linear_offset_params(tensor& start_points, tensor& end_points, int64_t* padded_sizes, - int64_t* axes_map, size_t& map_size); - /// @brief Get aligned linear size calculated as multiplication of all elements. 
size_t get_linear_size() const; @@ -351,6 +347,9 @@ struct layout { } } + cldnn::format get_format() const; + padding get_padding() const; + size_t get_rank() const; size_t get_spatial_rank() const; @@ -466,8 +465,6 @@ struct layout { } private: - static void get_axes_map(cldnn::format& fmt, int64_t* axes_map, size_t& map_size); - /// The size of the @ref memory (excluding padding) ov::PartialShape size; }; diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp index 90ceb96c998ada..e84316297c006c 100644 --- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp +++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp @@ -18,6 +18,38 @@ namespace { +using namespace cldnn; + +#define MAX_NUM_AXES 6 +void get_linear_offset_params(layout& layout, tensor& start_points, tensor& end_points, int64_t* padded_sizes, int64_t* axes_map, size_t& map_size) { + + auto fmt = layout.get_format(); + auto data_padding = layout.get_padding(); + auto default_fmt = format::get_default_format(fmt.dimension(), format::is_weights_format(fmt), format::is_grouped(fmt)); + + std::vector lower_sizes, upper_sizes; + lower_sizes.assign(data_padding._lower_size.begin(), data_padding._lower_size.begin() + fmt.dimension()); + upper_sizes.assign(data_padding._upper_size.begin(), data_padding._upper_size.begin() + fmt.dimension()); + start_points = tensor(default_fmt, lower_sizes, 0); + const auto& u_padd = tensor(default_fmt, upper_sizes, 0); + + auto t = layout.get_tensor(); + end_points = t + start_points; + + std::replace(t.raw.begin(), t.raw.end(), 0, 1); + + format::get_axes_map(fmt, axes_map, map_size); + const auto& p_sizes = (t + start_points + u_padd).sizes(fmt); + + if (p_sizes.size() < map_size) { + OPENVINO_THROW("Unsupported padded layout dimension" + std::to_string(p_sizes.size())); + } + + for (int8_t i = 0; i < p_sizes.size(); i++) { + padded_sizes[i] = p_sizes[i]; + } +} + template void convert_and_copy_no_pad(const src_t* 
src, dst_t* dst, size_t size) { OPENVINO_ASSERT(src && dst, "[GPU] Src or Dst ptr is null"); @@ -25,15 +57,13 @@ void convert_and_copy_no_pad(const src_t* src, dst_t* dst, size_t size) { dst[i] = static_cast(src[i]); } -#define MAX_NUM_AXES 6 - template -void convert_and_copy_padded_source(const src_t* src, dst_t* dst, cldnn::layout layout) { - cldnn::tensor axes_start_point, axes_end_point; +void convert_and_copy_padded_source(const src_t* src, dst_t* dst, layout& layout) { + tensor axes_start_point, axes_end_point; int64_t padded_sizes[MAX_NUM_AXES], axes_map[MAX_NUM_AXES]; size_t map_len = MAX_NUM_AXES; - layout.get_linear_offset_params(axes_start_point, axes_end_point, padded_sizes, axes_map, map_len); + get_linear_offset_params(layout, axes_start_point, axes_end_point, padded_sizes, axes_map, map_len); for (int64_t b = axes_start_point.batch[0]; b < axes_end_point.batch[0]; b++) { for (int64_t f = axes_start_point.feature[0]; f < axes_end_point.feature[0]; f++) { diff --git a/src/plugins/intel_gpu/src/runtime/layout.cpp b/src/plugins/intel_gpu/src/runtime/layout.cpp index 1b17ee3dc32c45..00f211201cddd8 100644 --- a/src/plugins/intel_gpu/src/runtime/layout.cpp +++ b/src/plugins/intel_gpu/src/runtime/layout.cpp @@ -38,6 +38,14 @@ std::vector convert_dimensions(const std::vector& sizes, const } // namespace +format layout::get_format() const { + return format; +} + +padding layout::get_padding() const { + return data_padding; +} + size_t layout::get_rank() const { return format.dimension(); } @@ -335,55 +343,6 @@ std::vector layout::get_pitches() const { return pitches; } -void layout::get_axes_map(cldnn::format& fmt, int64_t* axes_map, size_t& map_size) { - const auto& output_order = fmt.order(); - const auto& internal_order = fmt.internal_order(); - std::vector sizes_map(output_order.size(), 0); - - //output_order has more elements than allocated in axes_map - if (output_order.size() > map_size) { - OPENVINO_THROW("Layout dimension higher than expected" + 
std::to_string(output_order.size())); - } - - map_size = output_order.size(); - - for (size_t i = 0; i < map_size; i++) { - auto c = output_order[i]; - auto pos = internal_order.find(c); - - if (pos == std::string::npos) - OPENVINO_THROW("Unknown coord type: " + std::to_string(c)); - - axes_map[i] = pos; - } -} - -void layout::get_linear_offset_params(tensor& start_points, tensor& end_points, int64_t* padded_sizes, - int64_t* axes_map, size_t& map_size) { - auto default_fmt = format::get_default_format(format.dimension(), format::is_weights_format(format), format::is_grouped(format)); - - std::vector lower_sizes, upper_sizes; - lower_sizes.assign(data_padding._lower_size.begin(), data_padding._lower_size.begin() + format.dimension()); - upper_sizes.assign(data_padding._upper_size.begin(), data_padding._upper_size.begin() + format.dimension()); - start_points = tensor(default_fmt, lower_sizes, 0); - const auto& u_padd = tensor(default_fmt, upper_sizes, 0); - - auto t = get_tensor(); - end_points = t + start_points; - - std::replace(t.raw.begin(), t.raw.end(), 0, 1); - - get_axes_map(format, axes_map, map_size); - const auto& p_sizes = (t + start_points + u_padd).sizes(format); - - if (p_sizes.size() < map_size) { - OPENVINO_THROW("Unsupported padded layout dimension" + std::to_string(p_sizes.size())); - } - - for (size_t i = 0; i < p_sizes.size(); i++) { - padded_sizes[i] = p_sizes[i]; - } -} size_t layout::get_linear_offset(tensor element) const { auto default_fmt = format::get_default_format(format.dimension(), format::is_weights_format(format), format::is_grouped(format)); From f056da3685a92d66a664576265d6816c66671859 Mon Sep 17 00:00:00 2001 From: Susanta Bhattacharjee Date: Tue, 21 Oct 2025 13:25:03 +0530 Subject: [PATCH 3/3] Moved the calculations to format.hpp and common_utils.cpp from layout.cpp. 
--- src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp | 3 +-- src/plugins/intel_gpu/src/plugin/common_utils.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp index a529a1c533378a..6fa808bf75b920 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp @@ -305,7 +305,6 @@ struct format { static void get_axes_map(const format& fmt, int64_t* axes_map, size_t& map_size) { const auto& o_order = fmt.order(); const auto& i_order = fmt.internal_order(); - std::vector sizes_map(o_order.size(), 0); // output_order has more elements than allocated in axes_map if (o_order.size() > map_size) { @@ -319,7 +318,7 @@ struct format { auto pos = i_order.find(c); if (pos == std::string::npos) - OPENVINO_THROW("Unknown coord type: " + c); + OPENVINO_THROW("Unknown coord type: " + std::to_string(c)); axes_map[i] = pos; } diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp index e84316297c006c..17dda48816a9c9 100644 --- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp +++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp @@ -22,7 +22,6 @@ using namespace cldnn; #define MAX_NUM_AXES 6 void get_linear_offset_params(layout& layout, tensor& start_points, tensor& end_points, int64_t* padded_sizes, int64_t* axes_map, size_t& map_size) { - auto fmt = layout.get_format(); auto data_padding = layout.get_padding(); auto default_fmt = format::get_default_format(fmt.dimension(), format::is_weights_format(fmt), format::is_grouped(fmt)); @@ -45,7 +44,7 @@ void get_linear_offset_params(layout& layout, tensor& start_points, tensor& end_ OPENVINO_THROW("Unsupported padded layout dimension" + std::to_string(p_sizes.size())); } - for (int8_t i = 0; i < p_sizes.size(); i++) { + for (size_t i = 0; i < 
p_sizes.size(); i++) { padded_sizes[i] = p_sizes[i]; } }