diff --git a/backends/intel_hpu/kernels/funcs.h b/backends/intel_hpu/kernels/funcs.h index a0d1a59e0c..4b08253268 100644 --- a/backends/intel_hpu/kernels/funcs.h +++ b/backends/intel_hpu/kernels/funcs.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include <numeric> #include "paddle/extension.h" #include "paddle/phi/backends/device_ext.h" @@ -348,7 +349,19 @@ class ConvertTensors { for (auto d : dims) { num_elements *= d; } - int64_t addr_offset = num_elements * DataTypeSize(x.dtype()); + int64_t element_bytes = num_elements * DataTypeSize(x.dtype()); + auto addr_offset = element_bytes; + + // Align addr_offset to the nearest multiple of 0x80 if it's not already + // aligned + if (element_bytes % 0x80 != 0) { + int64_t N = 1; + // Compute the greatest common divisor of element_bytes and 0x80 + int64_t gcd = std::gcd(element_bytes, 0x80); + N = 0x80 / gcd; + addr_offset = element_bytes * N; + num_list = num_list / N; + } if (is_input) { for (int64_t tensor_idx = 0; tensor_idx < num_list; tensor_idx++) {