Skip to content

Commit 0436e45

Browse files
Split kernel into libphi.so and libphi_gpu_kernel.so (#64944) (#64958)
* [HACKATHON 6th] split GPU kernels * [HACKATHON 6th] split phi/kernels/gpu only * [HACKATHON 6th] fix setup.py * [HACKATHON 6th] fix setup.py typo :( * [HACKATHON 6th] fix inference error * [HACKATHON 6th] fix inference demo DSO missing error * [HACKATHON 6th] fix fluid eager test * [HACKATHON 6th] fix tests * [HACKATHON 6th] fix static library link dependency * [HACKATHON 6th] fix setup.py.in --------- Co-authored-by: Silver Ling <[email protected]>
1 parent f0d22fc commit 0436e45

File tree

9 files changed

+92
-11
lines changed

9 files changed

+92
-11
lines changed

cmake/inference_lib.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,14 @@ else()
285285
inference_lib_dist
286286
SRCS ${paddle_phi_lib}
287287
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
288+
if(WITH_GPU OR WITH_ROCM)
289+
set(paddle_phi_kernel_gpu_lib
290+
${PADDLE_BINARY_DIR}/paddle/phi/libphi_kernel_gpu.*)
291+
copy(
292+
inference_lib_dist
293+
SRCS ${paddle_phi_kernel_gpu_lib}
294+
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
295+
endif()
288296
endif()
289297
endif()
290298

paddle/fluid/eager/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ set(eager_deps
1313
grad_tensor_holder
1414
custom_operator_node)
1515

16+
if(WITH_GPU OR WITH_ROCM)
17+
set(eager_deps ${eager_deps} phi_kernel_gpu)
18+
endif()
19+
1620
if(NOT (NOT WITH_PYTHON AND ON_INFER))
1721
set(eager_deps ${eager_deps} accumulation_node prim_utils)
1822
endif()

paddle/fluid/inference/api/demo_ci/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,12 @@ if(NOT WIN32)
212212
set(DEPS
213213
${DEPS} ${PADDLE_LIB}/paddle/lib/libphi${CMAKE_SHARED_LIBRARY_SUFFIX}
214214
${PADDLE_LIB}/paddle/lib/libcommon${CMAKE_SHARED_LIBRARY_SUFFIX})
215+
if(WITH_GPU OR WITH_ROCM)
216+
set(DEPS
217+
${DEPS}
218+
${PADDLE_LIB}/paddle/lib/libphi_kernel_gpu${CMAKE_SHARED_LIBRARY_SUFFIX}
219+
)
220+
endif()
215221
endif()
216222
else()
217223
set(DEPS

paddle/phi/CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ set(PHI_SRCS
112112
${infermeta_srcs}
113113
${capi_srcs})
114114

115+
set(PHI_KERNEL_GPU_SRCS ${kernels_gpu_srcs})
116+
115117
if(WITH_SHARED_PHI)
116118
set(PHI_BUILD_TYPE
117119
SHARED
@@ -205,6 +207,40 @@ set(PHI_LIB
205207
"${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
206208
CACHE FILEPATH "PHI Library" FORCE)
207209

210+
# NOTE(silverling): what we are doing here is to build a library `phi_kernel_gpu`
211+
# that contains all GPU kernel implementations. This can allow paddle to be built
212+
# with more CUDA archs and reduce the binary size of `phi` library.
213+
if(WITH_GPU OR WITH_ROCM)
214+
if(WITH_GPU)
215+
nv_library(
216+
phi_kernel_gpu ${PHI_BUILD_TYPE}
217+
SRCS ${PHI_KERNEL_GPU_SRCS}
218+
DEPS ${PHI_DEPS})
219+
elseif(WITH_ROCM)
220+
hip_library(
221+
phi_kernel_gpu ${PHI_BUILD_TYPE}
222+
SRCS ${PHI_KERNEL_GPU_SRCS}
223+
DEPS ${PHI_DEPS})
224+
endif()
225+
226+
# NOTE(silverling): making library `phi` depend on `phi_kernel_gpu` (even though `phi` does not use it)
227+
# will make targets that depend on `phi` also automatically depend on `phi_kernel_gpu`.
228+
# This will make users' lives easier.
229+
target_link_libraries(phi phi_kernel_gpu)
230+
231+
# NOTE(silverling): `phi_kernel_gpu` needs symbols from `phi`.
232+
# When it's a shared library, it will work with no problem.
233+
# But when it's a static library, it must be linked to `phi` at link time explicitly.
234+
if(NOT WITH_SHARED_PHI)
235+
target_link_libraries(phi_kernel_gpu phi)
236+
endif()
237+
238+
string(REPLACE "phi" "phi_kernel_gpu" PHI_KERNEL_GPU_NAME ${PHI_NAME})
239+
set(PHI_KERNEL_GPU_LIB
240+
"${CMAKE_CURRENT_BINARY_DIR}/${PHI_KERNEL_GPU_NAME}"
241+
CACHE FILEPATH "PHI Kernel GPU Library" FORCE)
242+
endif()
243+
208244
if(MKL_FOUND AND WITH_ONEMKL)
209245
target_include_directories(phi PRIVATE ${MKL_INCLUDE})
210246
endif()

paddle/phi/kernels/CMakeLists.txt

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ file(GLOB kernel_primitive_h "primitive/*.h")
2929
file(
3030
GLOB kernel_cu
3131
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
32-
"gpu/*.cu"
33-
"gpu/*.cu.cc"
3432
"gpudnn/*.cu"
3533
"kps/*.cu"
3634
"legacy/kps/*.cu"
@@ -40,18 +38,24 @@ file(
4038
"strings/gpu/*.cu"
4139
"fusion/gpu/*.cu")
4240

41+
file(
42+
GLOB kernel_gpu
43+
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
44+
"gpu/*.cu" "gpu/*.cu.cc")
45+
4346
if(APPLE OR WIN32)
4447
list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
4548
list(REMOVE_ITEM kernel_cu "sparse/gpu/conv_kernel_igemm.cu")
4649
endif()
4750

4851
if(NOT WITH_DGC)
49-
list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
52+
list(REMOVE_ITEM kernel_gpu "gpu/dgc_kernel.cu")
5053
endif()
5154

5255
if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
53-
list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
5456
list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
57+
list(FILTER kernel_gpu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
58+
list(FILTER kernel_gpu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
5559
endif()
5660

5761
if(WITH_CUTLASS)
@@ -216,6 +220,15 @@ if(WITH_ROCM)
216220
list(
217221
REMOVE_ITEM
218222
kernel_cu
223+
"gpudnn/mha_cudnn_frontend.cu"
224+
"fusion/gpu/blha_get_max_len.cu"
225+
"fusion/gpu/block_multi_head_attention_kernel.cu"
226+
"fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
227+
"fusion/gpu/fused_bn_add_activation_kernel.cu"
228+
"fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
229+
list(
230+
REMOVE_ITEM
231+
kernel_gpu
219232
"gpu/affine_grid_grad_kernel.cu"
220233
"gpu/apply_per_channel_scale_kernel.cu"
221234
"gpu/cholesky_solve_kernel.cu"
@@ -228,13 +241,7 @@ if(WITH_ROCM)
228241
"gpu/put_along_axis_grad_kernel.cu"
229242
"gpu/put_along_axis_kernel.cu"
230243
"gpu/qr_kernel.cu"
231-
"gpu/svd_kernel.cu"
232-
"gpudnn/mha_cudnn_frontend.cu"
233-
"fusion/gpu/blha_get_max_len.cu"
234-
"fusion/gpu/block_multi_head_attention_kernel.cu"
235-
"fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
236-
"fusion/gpu/fused_bn_add_activation_kernel.cu"
237-
"fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
244+
"gpu/svd_kernel.cu")
238245
endif()
239246
240247
set(cc_search_pattern
@@ -291,6 +298,8 @@ file(
291298
if(WITH_GPU OR WITH_ROCM)
292299
collect_srcs(kernels_srcs SRCS ${kernel_cu})
293300
kernel_declare("${kernel_cu}")
301+
collect_srcs(kernels_gpu_srcs SRCS ${kernel_gpu})
302+
kernel_declare("${kernel_gpu}")
294303
endif()
295304
296305
if(WITH_XPU)

python/env_dict.py.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ env_dict={
1414
'FLUID_CORE_NAME':'@FLUID_CORE_NAME@',
1515
'PHI_LIB':'@PHI_LIB@',
1616
'PHI_NAME':'@PHI_NAME@',
17+
'PHI_KERNEL_GPU_LIB':'@PHI_KERNEL_GPU_LIB@',
18+
'PHI_KERNEL_GPU_NAME':'@PHI_KERNEL_GPU_NAME@',
1719
'WITH_SHARED_PHI':'@WITH_SHARED_PHI@',
1820
'IR_LIB':'@IR_LIB@',
1921
'IR_NAME':'@IR_NAME@',

python/setup.py.in

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,12 @@ package_data['paddle.libs']= []
710710
if('${WITH_SHARED_PHI}' == 'ON'):
711711
package_data['paddle.libs'] += [('libphi' if os.name != 'nt' else 'phi') + ext_name]
712712
shutil.copy('${PHI_LIB}', libs_path)
713+
if('${PHI_KERNEL_GPU_LIB}'):
714+
package_data['paddle.libs'] += [
715+
('libphi_kernel_gpu' if os.name != 'nt' else 'phi_kernel_gpu')
716+
+ ext_name
717+
]
718+
shutil.copy('${PHI_KERNEL_GPU_LIB}', libs_path)
713719

714720
if('${WITH_SHARED_IR}' == 'ON'):
715721
package_data['paddle.libs'] += [('libpir' if os.name != 'nt' else 'pir') + ext_name]

setup.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,12 @@ def get_package_data_and_package_dir():
10611061
('libphi' if os.name != 'nt' else 'phi') + ext_suffix
10621062
]
10631063
shutil.copy(env_dict.get("PHI_LIB"), libs_path)
1064+
if env_dict.get("PHI_KERNEL_GPU_LIB"):
1065+
package_data['paddle.libs'] += [
1066+
('libphi_kernel_gpu' if os.name != 'nt' else 'phi_kernel_gpu')
1067+
+ ext_suffix
1068+
]
1069+
shutil.copy(env_dict.get("PHI_KERNEL_GPU_LIB"), libs_path)
10641070

10651071
if env_dict.get("WITH_SHARED_IR") == "ON":
10661072
package_data['paddle.libs'] += [

test/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ if(${len} GREATER_EQUAL 1)
227227
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:${paddle_lib}>)
228228
if(WITH_SHARED_PHI)
229229
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:phi>)
230+
if(WITH_GPU OR WITH_ROCM)
231+
target_link_libraries(${test_name}
232+
$<TARGET_LINKER_FILE:phi_kernel_gpu>)
233+
endif()
230234
endif()
231235
if(WITH_SHARED_IR)
232236
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:pir>)

0 commit comments

Comments
 (0)