@@ -37,6 +37,8 @@ include(cblas)
37
37
include (flashattn)
38
38
include (cutlass)
39
39
include (dgc)
40
+ include (warpctc)
41
+ include (warprnnt)
40
42
41
43
set (PLUGIN_VERSION ${PADDLE_VERSION} )
42
44
@@ -308,8 +310,6 @@ file(
308
310
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/hinge_loss_grad_kernel.cu
309
311
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/hinge_loss_kernel.cu
310
312
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gru_grad_kernel.cu
311
- ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
312
- ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/grid_sample_kernel.cu
313
313
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/generate_proposals_kernel.cu
314
314
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu
315
315
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gammaln_kernel.cu
@@ -612,12 +612,9 @@ file(
612
612
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/funcs/math_function.cc
613
613
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/log_softmax_kernel.cu
614
614
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
615
- # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc
616
615
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu
617
616
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/binomial_kernel.cu
618
617
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/bernoulli_kernel.cu
619
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h
620
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu
621
618
${PADDLE_SOURCE_DIR} /paddle/phi/backends/dynload/cufft.cc
622
619
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/box_coder_kernel.cu
623
620
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
@@ -640,31 +637,12 @@ file(
640
637
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gather_tree_kernel.cu
641
638
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/graph_reindex_kernel.cu
642
639
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
643
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu
644
640
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
645
641
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu
646
642
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
647
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu
648
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
649
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu
650
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu
651
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu
652
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu
653
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu
654
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu
655
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu
656
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu
657
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu
658
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu
659
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu
660
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc
661
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc
662
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc
663
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc
664
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc
665
- # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu
643
+ ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
644
+ ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/rms_norm_kernel.cu
666
645
# ############################################################################
667
- # kernels/fusion kernels/selected_rows
668
646
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
669
647
# kernels/kps
670
648
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -696,7 +674,6 @@ file(
696
674
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
697
675
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
698
676
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu
699
- ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
700
677
${PADDLE_SOURCE_DIR} /paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu)
701
678
702
679
file (
@@ -706,15 +683,14 @@ file(
706
683
passes/*.cc
707
684
kernels/*.cc
708
685
kernels/*.cu
686
+ kernels/fusion/*.cc
687
+ kernels/fusion/*.cu
709
688
kernels/gpudnn/*.cc
710
689
kernels/gpudnn/*.cu
711
690
kernels/cuda_kernels/*.cc
712
691
kernels/cuda_kernels/*.cu
713
692
kernels/funcs/blas/*.cc
714
- kernels/ernie_core/*.cu
715
- kernels/ernie_core/rms_norm_kernel_register.cu
716
- kernels/ernie_core/top_p_sampling_kernel_register.cu
717
- kernels/ernie_core/fused_bias_act_kernel_register.cu)
693
+ kernels/ernie_core/*.cu)
718
694
719
695
set (CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS} )
720
696
@@ -723,18 +699,17 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA)
723
699
set (CMAKE_CUCC_COMPILER "cucc" )
724
700
set (CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/" )
725
701
726
- set_source_files_properties (
727
- ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/rms_norm_kernel.cu
728
- PROPERTIES LANGUAGE CUDA)
729
- add_library (
730
- ${TARGET_NAME} SHARED
731
- ${CUSTOM_DEVICE_SRCS}
732
- ${PADDLE_SOURCE_DIR} /paddle/phi/kernels/gpu/rms_norm_kernel.cu)
702
+ add_library (${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS} )
733
703
734
704
target_include_directories (
735
705
${TARGET_NAME}
736
- PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} /kernels
737
- ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR} /third_party/pybind/include
706
+ PRIVATE ${PADDLE_SOURCE_DIR}
707
+ ${CMAKE_SOURCE_DIR}
708
+ ${CMAKE_SOURCE_DIR} /kernels
709
+ ${CUDA_INCLUDE_DIRS}
710
+ ${WARPCTC_INCLUDE_DIR}
711
+ ${WARPRNNT_INCLUDE_DIR}
712
+ ${PADDLE_SOURCE_DIR} /third_party/pybind/include
738
713
${PADDLE_SOURCE_DIR} /paddle/phi/api/include /compat)
739
714
740
715
target_link_libraries (
@@ -747,16 +722,16 @@ target_link_libraries(
747
722
protobuf
748
723
external_error_proto
749
724
dgc
725
+ ${WARPCTC_LIBRARIES}
726
+ ${WARPRNNT_LIBRARIES}
750
727
${PADDLE_CORE_LIB} )
751
- target_link_libraries (${TARGET_NAME} /opt/maca/lib/libmccl.so)
752
- target_link_libraries (${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so)
753
- target_link_libraries (${TARGET_NAME} /opt/maca/lib/libmcpti.so)
754
728
include_directories (BEFORE ${PADDLE_SOURCE_DIR} )
755
729
756
730
target_compile_definitions (
757
731
${TARGET_NAME}
758
732
PUBLIC PADDLE_WITH_CUDA=1
759
733
PADDLE_WITH_CUSTOM_DEVICE=1
734
+ mcblasContext=cublasContext
760
735
GPUContext=CustomContext
761
736
KPSContext=CustomContext
762
737
STREAM_TYPE=cudaStream_t
0 commit comments