Skip to content

Commit 1987f7c

Browse files
ZibinGuo and guozibin authored
[XPU] Update XHPC to 20250722. (#74277)
Co-authored-by: guozibin <[email protected]>
1 parent 5faaf3e commit 1987f7c

19 files changed

+133
-87
lines changed

cmake/external/xpu.cmake

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@ set(XPU_XPTI_LIB_NAME "libxpti.so")
2828
set(XPU_XBLAS_LIB_NAME "libxpu_blas.so")
2929
set(XPU_XFA_LIB_NAME "libxpu_flash_attention.so")
3030
set(XPU_XPUDNN_LIB_NAME "libxpu_dnn.so")
31+
set(XPU_XPUDNN_OMP_LIB_NAME "libomp.so")
3132
set(XPU_FFT_LIB_NAME "libcufft.so")
3233
# Avoid deprecated int32 apis:
3334
add_compile_definitions(XPUAPI_NOT_INCLUDE_DEPRECATED)
3435

3536
if(NOT DEFINED XPU_XHPC_BASE_DATE)
36-
set(XPU_XHPC_BASE_DATE "dev/20990602")
37+
set(XPU_XHPC_BASE_DATE "dev/20250722")
3738
endif()
38-
set(XPU_XCCL_BASE_VERSION "3.0.2.7") # For XRE5
39+
set(XPU_XCCL_BASE_VERSION "3.0.3.1") # For XRE5
3940
if(NOT DEFINED XPU_XFT_BASE_VERSION)
4041
set(XPU_XFT_BASE_VERSION "20250507/xpu3")
4142
endif()
@@ -183,6 +184,7 @@ set(XPU_CUDA_RT_LIB "${XPU_LIB_DIR}/${XPU_CUDA_RT_LIB_NAME}")
183184
set(XPU_ML_LIB "${XPU_LIB_DIR}/${XPU_ML_LIB_NAME}")
184185
set(XPU_XFA_LIB "${XPU_LIB_DIR}/${XPU_XFA_LIB_NAME}")
185186
set(XPU_XPUDNN_LIB "${XPU_LIB_DIR}/${XPU_XPUDNN_LIB_NAME}")
187+
set(XPU_XPUDNN_OMP_LIB "${XPU_LIB_DIR}/${XPU_XPUDNN_OMP_LIB_NAME}")
186188

187189
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
188190

@@ -251,8 +253,9 @@ if(WITH_XPU_XRE5)
251253
DOWNLOAD_COMMAND
252254
bash ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_dependence.sh
253255
${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME}
254-
${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 && wget ${XPU_XFT_GET_DEPENCE_URL}
255-
&& bash ${XFT_COMMAND} ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash
256+
${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 ${WITH_MKL}
257+
"${CMAKE_SOURCE_DIR}/build" && wget ${XPU_XFT_GET_DEPENCE_URL} && bash
258+
${XFT_COMMAND} ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash
256259
${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL}
257260
${XPU_XPTI_DIR_NAME} && bash
258261
${CMAKE_SOURCE_DIR}/tools/xpu/get_xpufft_dependence.sh ${XPU_FFT_URL}
@@ -263,6 +266,7 @@ if(WITH_XPU_XRE5)
263266
BUILD_BYPRODUCTS ${XPU_API_LIB}
264267
BUILD_BYPRODUCTS ${XPU_XBLAS_LIB}
265268
BUILD_BYPRODUCTS ${XPU_XPUDNN_LIB}
269+
BUILD_BYPRODUCTS ${XPU_XPUDNN_OMP_LIB}
266270
BUILD_BYPRODUCTS ${XPU_XFA_LIB}
267271
BUILD_BYPRODUCTS ${XPU_RT_LIB}
268272
BUILD_BYPRODUCTS ${XPU_CUDA_RT_LIB}
@@ -360,6 +364,10 @@ if(WITH_XPU_XRE5)
360364
${XPU_XFA_LIB}
361365
${XPU_XPUDNN_LIB}
362366
${XPU_ML_LIB})
367+
368+
if(NOT WITH_MKL)
369+
target_link_libraries(xpulib ${XPU_XPUDNN_OMP_LIB})
370+
endif()
363371
else()
364372
target_link_libraries(xpulib ${XPU_RT_LIB} ${XPU_API_LIB})
365373
endif()

paddle/phi/kernels/fusion/xpu/fast_where_xpu_kernel.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,15 @@ void FastWhereXPUKernel(const Context& dev_ctx,
4545
#ifndef PADDLE_WITH_XPU_PLUGIN
4646
LOG(INFO)
4747
<< "Add -DWITH_XPU_PLUGIN=ON to build xpu::plugin::fast_where(), or use "
48-
"xpu::select() instead, which leads low performance.";
49-
int r = xpu::select<XPUType>(dev_ctx.x_context(),
50-
condition_data,
51-
x_data,
52-
y_data,
53-
out_data,
54-
condition_dims,
55-
x_dims);
56-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "select");
48+
"xpu::where() instead, which leads low performance.";
49+
int r = xpu::where<XPUType>(dev_ctx.x_context(),
50+
condition_data,
51+
x_data,
52+
y_data,
53+
out_data,
54+
condition_dims,
55+
x_dims);
56+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "where");
5757
#else
5858
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
5959
if (condition_dims != x_dims) {

paddle/phi/kernels/xpu/activation_grad_kernel.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -368,21 +368,21 @@ struct XPUSiluGradFunctor : public funcs::BaseActivationFunctor<T> {
368368

369369
if (std::getenv("XPU_PADDLE_ACT_LUT") != nullptr) {
370370
if (!std::is_same<T, ::phi::dtype::bfloat16>::value) {
371-
// use fast_swish_grad if NOT bf16
372-
int r = xpu::fast_swish_grad(
371+
// use fast_silu_grad if NOT bf16
372+
int r = xpu::fast_silu_grad(
373373
dev_ctx.x_context(), x_data, y_grad, x_grad, dx->numel());
374-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fast_swish_grad");
374+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fast_silu_grad");
375375
} else {
376-
// use plain swish_grad
377-
int r = xpu::swish_grad(
376+
// use plain silu_grad
377+
int r = xpu::silu_grad(
378378
dev_ctx.x_context(), x_data, y_grad, x_grad, dx->numel());
379-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
379+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "silu_grad");
380380
}
381381
} else {
382-
// use plain swish_grad
383-
int r = xpu::swish_grad(
382+
// use plain silu_grad
383+
int r = xpu::silu_grad(
384384
dev_ctx.x_context(), x_data, y_grad, x_grad, dx->numel());
385-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish_grad");
385+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "silu_grad");
386386
}
387387
}
388388
};

paddle/phi/kernels/xpu/activation_kernel.cc

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -205,12 +205,13 @@ struct XPURoundFunctor : public funcs::BaseActivationFunctor<T> {
205205
const DenseTensor& x,
206206
DenseTensor* out) const {
207207
using XPUType = typename XPUTypeTrait<T>::Type;
208-
int r = xpu::round<XPUType>(dev_ctx.x_context(),
209-
reinterpret_cast<const XPUType*>(x.data<T>()),
210-
reinterpret_cast<XPUType*>(out->data<T>()),
211-
x.numel(),
212-
decimals);
213-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "round");
208+
int r = xpu::paddle_round<XPUType>(
209+
dev_ctx.x_context(),
210+
reinterpret_cast<const XPUType*>(x.data<T>()),
211+
reinterpret_cast<XPUType*>(out->data<T>()),
212+
x.numel(),
213+
decimals);
214+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_round");
214215
}
215216
};
216217

@@ -344,20 +345,20 @@ struct XPUSiluFunctor : public funcs::BaseActivationFunctor<T> {
344345
if (std::getenv("XPU_PADDLE_ACT_LUT") != nullptr) {
345346
if (!std::is_same<T, ::phi::dtype::bfloat16>::value) {
346347
// use fast_swish if NOT bf16
347-
int r = xpu::fast_swish(
348+
int r = xpu::fast_silu(
348349
xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
349-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fast_swish");
350+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fast_silu");
350351
} else {
351352
// use plain swish
352-
int r = xpu::swish(
353-
xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
354-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
353+
int r =
354+
xpu::silu(xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
355+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "silu");
355356
}
356357
} else {
357358
// use plain swish
358359
int r =
359-
xpu::swish(xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
360-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "swish");
360+
xpu::silu(xpu_context, x_data, y_data, x.numel(), nullptr, nullptr);
361+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "silu");
361362
}
362363
}
363364
};

paddle/phi/kernels/xpu/add_n_kernel.cc

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ void AddNKernel(const Context& dev_ctx,
7676
} else if (ptrs.size() < x.size()) {
7777
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
7878
XPUType* out_t = RAII_GUARD.alloc_l3_or_gm<XPUType>(out->numel());
79-
int r = xpu::sum(dev_ctx.x_context(), ptrs, out_t, out->numel());
80-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
79+
int r = xpu::add_n(dev_ctx.x_context(), ptrs, out_t, out->numel());
80+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_n");
8181

8282
r = xpu::add(dev_ctx.x_context(),
8383
reinterpret_cast<const XPUType*>(out->data<T>()),
@@ -86,12 +86,12 @@ void AddNKernel(const Context& dev_ctx,
8686
out->numel());
8787
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
8888
} else {
89-
int r = xpu::sum(dev_ctx.x_context(),
90-
ptrs,
91-
reinterpret_cast<XPUType*>(out->data<T>()),
92-
out->numel());
89+
int r = xpu::add_n(dev_ctx.x_context(),
90+
ptrs,
91+
reinterpret_cast<XPUType*>(out->data<T>()),
92+
out->numel());
9393

94-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
94+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_n");
9595
}
9696
}
9797

@@ -149,10 +149,10 @@ void AddNArrayKernel(const Context& dev_ctx,
149149

150150
// int sum(Context* xpu_ctx, const std::vector<const T*>& x_list, T*
151151
// y, int64_t len);
152-
int r = xpu::sum(dev_ctx.x_context(),
153-
ptrs,
154-
reinterpret_cast<XPUType*>(out->at(j).data<T>()),
155-
out->at(j).numel());
152+
int r = xpu::add_n(dev_ctx.x_context(),
153+
ptrs,
154+
reinterpret_cast<XPUType*>(out->at(j).data<T>()),
155+
out->at(j).numel());
156156
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum");
157157
}
158158
}

paddle/phi/kernels/xpu/c_softmax_with_cross_entropy_kernel.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,15 +125,15 @@ void FixLossAccordingToIgnoreIndex(const phi::XPUContext& dev_ctx,
125125
// int select(Context* xpu_ctx, const bool* condition, const T* x, const T* y,
126126
// T* z, const std::vector<int64_t>& condition_shape, const
127127
// std::vector<int64_t>& xshape);
128-
ret = xpu::select(
128+
ret = xpu::where(
129129
dev_ctx.x_context(),
130130
reinterpret_cast<const bool*>(bool_tensor_for_mask_label.data<bool>()),
131131
reinterpret_cast<const XPUType*>(zeros_constant.data<T>()),
132132
reinterpret_cast<const XPUType*>(loss->data<T>()),
133133
reinterpret_cast<XPUType*>(loss->data<T>()),
134134
common::vectorize(predicted_logits->dims()),
135135
common::vectorize(predicted_logits->dims()));
136-
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "select");
136+
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "where");
137137
}
138138
template <typename T>
139139
struct CSoftmaxWithCrossEntropyFunctor<phi::XPUContext, T> {

paddle/phi/kernels/xpu/flash_attn_kernel.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,8 @@ void FlashAttnKernelBase(
213213
: 0, // flash_mask_head_num
214214
nullptr, // flashmask_maxmin
215215
is_flashmask ? flashmask_stream : nullptr, // side_stream
216-
0 // fixlen_batch_num
216+
0, // fixlen_batch_num
217+
false // unpadded_lse
217218
);
218219
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mha_varlen_fwd");
219220
if (is_flashmask && flashmask_stream != nullptr) {

paddle/phi/kernels/xpu/index_select_kernel.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void IndexSelectKernel(const Context& dev_ctx,
7373
const int64_t* index_data =
7474
index_ptr ? reinterpret_cast<const int64_t*>(index_ptr)
7575
: index.template data<int64_t>();
76-
r = xpu::paddle_gather<XPUType, int64_t>(
76+
r = xpu::index_select<XPUType, int64_t>(
7777
dev_ctx.x_context(),
7878
reinterpret_cast<const XPUType*>(in_data),
7979
reinterpret_cast<const int64_t*>(index_data),
@@ -84,7 +84,7 @@ void IndexSelectKernel(const Context& dev_ctx,
8484
} else {
8585
const int* index_data = index_ptr ? reinterpret_cast<const int*>(index_ptr)
8686
: index.template data<int>();
87-
r = xpu::paddle_gather<XPUType, int>(
87+
r = xpu::index_select<XPUType, int>(
8888
dev_ctx.x_context(),
8989
reinterpret_cast<const XPUType*>(in_data),
9090
reinterpret_cast<const int*>(index_data),
@@ -93,7 +93,7 @@ void IndexSelectKernel(const Context& dev_ctx,
9393
index_len,
9494
dim);
9595
}
96-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");
96+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "index_select");
9797
}
9898

9999
} // namespace phi

paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,14 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
109109
r = xpu::constant(
110110
dev_ctx.x_context(), broadcast1, x.numel(), static_cast<XPUDataType>(0));
111111
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
112-
r = xpu::select(dev_ctx.x_context(),
113-
equal,
114-
broadcast2,
115-
broadcast1,
116-
x_grad_data,
117-
xdims,
118-
xdims);
119-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "select");
112+
r = xpu::where(dev_ctx.x_context(),
113+
equal,
114+
broadcast2,
115+
broadcast1,
116+
x_grad_data,
117+
xdims,
118+
xdims);
119+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "where");
120120
}
121121

122122
} // namespace phi

paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,14 @@ void ReduceMinGradKernel(const Context& dev_ctx,
105105
// step 3. get x_grad
106106
r = xpu::constant<T>(dev_ctx.x_context(), broadcast1, x.numel(), 0);
107107
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
108-
r = xpu::select<T>(dev_ctx.x_context(),
109-
equal,
110-
broadcast2,
111-
broadcast1,
112-
x_grad_data,
113-
xdims,
114-
xdims);
115-
PADDLE_ENFORCE_XDNN_SUCCESS(r, "select");
108+
r = xpu::where<T>(dev_ctx.x_context(),
109+
equal,
110+
broadcast2,
111+
broadcast1,
112+
x_grad_data,
113+
xdims,
114+
xdims);
115+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "where");
116116
}
117117

118118
} // namespace phi

0 commit comments

Comments (0)