diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 532965c8fda..a8d770f3b7b 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -33,6 +33,10 @@ include(external/eigen)
 include(external/xxhash)
 include(external/zlib)
 include(external/protobuf)
+if(WITH_FLAGCX)
+  add_definitions("-DPADDLE_WITH_FLAGCX")
+  include(external/flagcx)
+endif()
 
 set(PLUGIN_VERSION ${PADDLE_VERSION})
 set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto")
@@ -66,8 +70,10 @@ target_include_directories(external_error_proto
 target_link_libraries(external_error_proto PUBLIC protobuf)
 set_target_properties(external_error_proto PROPERTIES POSITION_INDEPENDENT_CODE
                                            ON)
-
 add_custom_target(external_deps DEPENDS eigen3 zlib protobuf)
+if(WITH_FLAGCX)
+  add_custom_target(external_deps DEPENDS flagcx)
+endif()
 
 if(WITH_COREX)
   add_definitions(-DPADDLE_WITH_COREX)
@@ -258,7 +264,9 @@ target_link_libraries(
   protobuf
   external_error_proto
   cuinfer
-  nccl)
+  nccl
+  # change nccl to ${FLAGCX_LIB} if compiling with FlagCX ${FLAGCX_LIB}
+)
 
 include_directories(BEFORE ${PADDLE_SOURCE_DIR})
 
diff --git a/backends/iluvatar_gpu/build_paddle.sh b/backends/iluvatar_gpu/build_paddle.sh
index 57565784d8e..880ac9094b3 100644
--- a/backends/iluvatar_gpu/build_paddle.sh
+++ b/backends/iluvatar_gpu/build_paddle.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,15 @@ export CMAKE_CUDA_ARCHITECTURES=${COREX_ARCH}
 CURRENT_DIR=$(pwd)
 PADDLE_SOURCE_DIR="${CURRENT_DIR}/../../Paddle"
 PATCH_FILE="${CURRENT_DIR}/patches/paddle-corex.patch"
+# set BUILD_WITH_FLAGCX to 1 if we want to use flagcx as communication backend
+BUILD_WITH_FLAGCX=0
+FLAGCX_ROOT="/workspace/FlagCX"
+
+if [ "$BUILD_WITH_FLAGCX" == "1" ]; then
+  WITH_FLAGCX="ON"
+else
+  WITH_FLAGCX="OFF"
+fi
 
 bash clean_paddle.sh
 
@@ -51,9 +60,10 @@ if [[ ! -d "build" ]]; then
 fi
 
 pushd build
-cmake -DPY_VERSION=${PYTHON_VERSION} -DWITH_COREX=ON \
--DWITH_DISTRIBUTE=ON -DWITH_NCCL=ON -DWITH_RCCL=OFF -DCMAKE_BUILD_TYPE=Release \
+cmake -DPY_VERSION=${PYTHON_VERSION} -DWITH_COREX=ON -DPADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR} \
+-DWITH_DISTRIBUTE=ON -DWITH_NCCL=ON -DWITH_FLAGCX=${WITH_FLAGCX} -DWITH_RCCL=OFF -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DON_INFER=ON -DCOREX_VERSION=${COREX_VERSION} -DCOREX_ARCH=${COREX_ARCH} \
+-DFLAGCX_ROOT=${FLAGCX_ROOT} \
 -DCMAKE_CXX_FLAGS='-Wno-error=pessimizing-move -Wno-error=deprecated-copy -Wno-error=init-list-lifetime' \
 -DCMAKE_CUDA_FLAGS='-Xclang -fcuda-allow-variadic-functions -mllvm --skip-double' \
 -DWITH_ARM=OFF -DWITH_DGC=OFF .. 2>&1 | tee compile.log
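Note on the build switch above: WITH_FLAGCX only adds the PADDLE_WITH_FLAGCX compile definition and pulls in the external/flagcx module; the actual choice between NCCL and FlagCX is made per translation unit by the preprocessor, as the runtime.cc changes below show. A minimal sketch of that gate (the helper name is hypothetical, not part of the patch):

```cpp
#include <string>

// Illustration only (not part of the patch): the PADDLE_WITH_FLAGCX definition
// added by add_definitions()/-DWITH_FLAGCX=ON selects the backend at compile
// time. The helper name ActiveXcclBackend is hypothetical.
std::string ActiveXcclBackend() {
#if defined(PADDLE_WITH_FLAGCX)
  return "flagcx";  // FlagCX code paths in runtime.cc are compiled in
#else
  return "nccl";    // default: NCCL-only code paths
#endif
}
```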
diff --git a/backends/iluvatar_gpu/cmake/external/flagcx.cmake b/backends/iluvatar_gpu/cmake/external/flagcx.cmake
new file mode 100644
index 00000000000..020d1a8afc9
--- /dev/null
+++ b/backends/iluvatar_gpu/cmake/external/flagcx.cmake
@@ -0,0 +1,41 @@
+set(CMAKE_FIND_DEBUG_MODE ON)
+# flagcx.cmake
+if(NOT WITH_FLAGCX)
+  return()
+endif()
+
+set(FLAGCX_SOURCE_DIR "${FLAGCX_ROOT}")
+set(FLAGCX_LIB_DIR "${FLAGCX_SOURCE_DIR}/build/lib")
+set(FLAGCX_BINARY_DIR "${PADDLE_SOURCE_DIR}/build/third_party/flagcx")
+set(THIRD_PARTY_DIR "${PADDLE_SOURCE_DIR}/build/third_party")
+
+file(REMOVE_RECURSE ${FLAGCX_BINARY_DIR})
+message(STATUS "removed old flagcx dir")
+message(STATUS "Copying third-party source to build directory")
+execute_process(COMMAND cp -r ${FLAGCX_SOURCE_DIR} ${THIRD_PARTY_DIR}
+                RESULT_VARIABLE COPY_RESULT)
+
+if(NOT COPY_RESULT EQUAL 0)
+  message(FATAL_ERROR "Failed to copy third-party source to build directory")
+endif()
+
+# Create a custom target to build the third-party library
+message(STATUS "Building third-party library with its Makefile")
+
+find_path(
+  FLAGCX_INCLUDE_DIR flagcx.h
+  PATHS ${FLAGCX_SOURCE_DIR}/flagcx/include
+  NO_DEFAULT_PATH)
+
+message(STATUS "FLAGCX_INCLUDE_DIR is ${FLAGCX_INCLUDE_DIR}")
+include_directories(SYSTEM ${FLAGCX_INCLUDE_DIR})
+
+add_library(flagcx INTERFACE)
+find_library(
+  FLAGCX_LIB
+  NAMES flagcx libflagcx
+  PATHS ${FLAGCX_LIB_DIR}
+  DOC "My custom library")
+
+add_dependencies(flagcx FLAGCX_LIB)
+message(STATUS "FLAGCX_LIB is ${FLAGCX_LIB}")
diff --git a/backends/iluvatar_gpu/runtime/runtime.cc b/backends/iluvatar_gpu/runtime/runtime.cc
index 904c978158a..d4eb14aa4ef 100644
--- a/backends/iluvatar_gpu/runtime/runtime.cc
+++ b/backends/iluvatar_gpu/runtime/runtime.cc
@@ -15,6 +15,9 @@
 #include
 #include
 #include
+#if defined(PADDLE_WITH_FLAGCX)
+#include <flagcx.h>
+#endif
 #include
 #include
 #include
@@ -46,6 +49,10 @@ static int global_current_device = 0;
 
 const char *const DeviceType = "iluvatar_gpu";
 const char *const SubDeviceType = "v0.1";
+#if defined(PADDLE_WITH_FLAGCX)
+C_CCLComm globalComm = nullptr;
+flagcxHandlerGroup_t flagcx_handler;
+#endif
 
 namespace phi {
 namespace internal {
@@ -75,6 +82,33 @@ inline ncclDataType_t PDDataTypeToNcclDataType(C_DataType type) {
   return ncclFloat32;
 }
 
+#if defined(PADDLE_WITH_FLAGCX)
+inline flagcxDataType_t PDDataTypeToFlagcxDataType(C_DataType type) {
+  if (type == C_DataType::FLOAT32) {
+    return flagcxFloat;
+  } else if (type == C_DataType::BFLOAT16) {
+    return flagcxBfloat16;
+  } else if (type == C_DataType::UINT8) {
+    return flagcxUint8;
+  } else if (type == C_DataType::UINT32) {
+    return flagcxUint32;
+  } else if (type == C_DataType::UINT64) {
+    return flagcxUint64;
+  } else if (type == C_DataType::INT8) {
+    return flagcxInt8;
+  } else if (type == C_DataType::INT32) {
+    return flagcxInt32;
+  } else if (type == C_DataType::INT64) {
+    return flagcxInt64;
+  } else if (type == C_DataType::FLOAT16) {
+    return flagcxHalf;
+  } else {
+    LOG(ERROR) << "Datatype " << type << " in flagcx is not supported.";
+  }
+  return flagcxFloat;
+}
+#endif
+
 #define NCCL_CHECK(cmd)     \
   do {                      \
     ncclResult_t r = cmd;   \
@@ -86,6 +120,20 @@ inline ncclDataType_t PDDataTypeToNcclDataType(C_DataType type) {
     }                       \
   } while (0)
 
+#if defined(PADDLE_WITH_FLAGCX)
+#define FLAGCX_CHECK(cmd)                                               \
+  do {                                                                  \
+    flagcxResult_t r = cmd;                                             \
+    if (r != flagcxSuccess) {                                           \
+      PADDLE_THROW(                                                     \
+          common::errors::External("Failed, FLAGCX error %s:%d '%s'\n", \
+                                   __FILE__,                            \
+                                   __LINE__,                            \
+                                   flagcxGetErrorString(r)));           \
+    }                                                                   \
+  } while (0)
+#endif
+
 class EigenGpuStreamDevice : public Eigen::StreamInterface {
  public:
  EigenGpuStreamDevice()
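Both NCCL_CHECK and the new FLAGCX_CHECK wrap their bodies in do { ... } while (0) so that each macro expands to a single statement. A standalone sketch of why that matters (CHECKED and the int return codes are hypothetical stand-ins, not the patch's macros):

```cpp
// Hypothetical stand-in for FLAGCX_CHECK, reduced to the part that matters
// here: the do { ... } while (0) wrapper makes the expansion one statement.
#define CHECKED(expr)  \
  do {                 \
    if ((expr) != 0) { \
      return -1;       \
    }                  \
  } while (0)

// CHECKED(x); therefore composes safely with if/else; without the wrapper the
// macro would expand to a block plus a stray semicolon, detaching the else.
int Example(int a, int b) {
  if (a > 0)
    CHECKED(b);
  else
    CHECKED(a);
  return 0;
}
```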
@@ -791,18 +839,51 @@ ncclRedOp_t PDReduceOpToNcclReduceOp(C_CCLReduceOp op) {
   }
 }
 
+#if defined(PADDLE_WITH_FLAGCX)
+flagcxRedOp_t PDReduceOpToFlagcxReduceOp(C_CCLReduceOp op) {
+  if (op == C_CCLReduceOp::MIN) {
+    return flagcxMin;
+  } else if (op == C_CCLReduceOp::MAX) {
+    return flagcxMax;
+  } else if (op == C_CCLReduceOp::SUM) {
+    return flagcxSum;
+  } else if (op == C_CCLReduceOp::PRODUCT) {
+    return flagcxProd;
+  } else if (op == C_CCLReduceOp::AVG) {
+    return flagcxAvg;
+  } else {
+    LOG(ERROR) << "Reduceop " << op << " in flagcx is not supported.";
+  }
+}
+#endif
+
 C_Status XcclGetUniqueIdSize(size_t *size) {
+#if defined(PADDLE_WITH_FLAGCX)
+  *size = sizeof(flagcxUniqueId);
+#else
   *size = sizeof(ncclUniqueId);
+#endif
   return C_SUCCESS;
 }
 
 C_Status XcclGetUniqueId(C_CCLRootId *unique_id) {
+#if defined(PADDLE_WITH_FLAGCX)
+  if (unique_id->sz != sizeof(flagcxUniqueId)) {
+    LOG(ERROR) << "unique_id->sz must be equal sizeof(flagcxUniqueId)";
+    return C_FAILED;
+  }
+  flagcxUniqueId_t flagcxId =
+      reinterpret_cast<flagcxUniqueId_t>(unique_id->data);
+  FLAGCX_CHECK(flagcxGetUniqueId(&flagcxId));
+  unique_id->data = flagcxId;
+#else
   if (unique_id->sz != sizeof(ncclUniqueId)) {
     LOG(ERROR) << "unique_id->sz must be equal sizeof(ncclUniqueId)";
     return C_FAILED;
   }
   NCCL_CHECK(
       ncclGetUniqueId(reinterpret_cast<ncclUniqueId *>(unique_id->data)));
+#endif
   return C_SUCCESS;
 }
 
@@ -811,17 +892,32 @@ C_Status XcclCommInitRank(size_t nranks,
                           C_CCLRootId *unique_id,
                           size_t rank,
                           C_CCLComm *comm) {
+#if defined(PADDLE_WITH_FLAGCX)
+  FLAGCX_CHECK(
+      flagcxCommInitRank(reinterpret_cast<flagcxComm_t *>(comm),
+                         nranks,
+                         reinterpret_cast<flagcxUniqueId_t>(unique_id->data),
+                         rank));
+  globalComm = *comm;
+  VLOG(4) << "[FLAGCX] comm inited: " << reinterpret_cast<flagcxComm_t>(*comm);
+#else
   NCCL_CHECK(
       ncclCommInitRank(reinterpret_cast<ncclComm_t *>(comm),
                        nranks,
                        *(reinterpret_cast<ncclUniqueId *>(unique_id->data)),
                        rank));
   VLOG(4) << "[NCCL] comm inited: " << reinterpret_cast<ncclComm_t>(*comm);
+#endif
   return C_SUCCESS;
 }
 
 C_Status XcclDestroyComm(C_CCLComm comm) {
+#if defined(PADDLE_WITH_FLAGCX)
+  FLAGCX_CHECK(flagcxCommDestroy(reinterpret_cast<flagcxComm_t>(comm)));
+  globalComm = nullptr;
+#else
   NCCL_CHECK(ncclCommDestroy(reinterpret_cast<ncclComm_t>(comm)));
+#endif
   return C_SUCCESS;
 }
 
@@ -832,6 +928,18 @@ C_Status XcclAllReduce(void *send_buf,
                        C_CCLReduceOp op,
                        C_CCLComm comm,
                        C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+
+  FLAGCX_CHECK(
+      flagcxAllReduce(send_buf,
+                      recv_buf,
+                      count,
+                      phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                      PDReduceOpToFlagcxReduceOp(op),
+                      reinterpret_cast<flagcxComm_t>(comm),
+                      reinterpret_cast<flagcxStream_t>(&cudaStream)));
+#else
   NCCL_CHECK(ncclAllReduce(send_buf,
                            recv_buf,
                            count,
@@ -839,6 +947,7 @@ C_Status XcclAllReduce(void *send_buf,
                            PDReduceOpToNcclReduceOp(op),
                            reinterpret_cast<ncclComm_t>(comm),
                            reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
@@ -848,6 +957,17 @@ C_Status XcclBroadcast(void *buf,
                        size_t root,
                        C_CCLComm comm,
                        C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(
+      flagcxBroadcast(static_cast<void *>(buf),
+                      buf,
+                      count,
+                      phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                      root,
+                      reinterpret_cast<flagcxComm_t>(comm),
+                      reinterpret_cast<flagcxStream_t>(&cudaStream)));
+#else
   NCCL_CHECK(ncclBroadcast(static_cast<void *>(buf),
                            buf,
                            count,
@@ -855,6 +975,7 @@ C_Status XcclBroadcast(void *buf,
                            root,
                            reinterpret_cast<ncclComm_t>(comm),
                            reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
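The two mapping helpers introduced above (PDDataTypeToFlagcxDataType and PDReduceOpToFlagcxReduceOp) are pure functions, so they can be sanity-checked without a device or an initialized communicator. A hypothetical googletest-style check, assuming the helpers are visible to the test (as written they are file-local to runtime.cc) and googletest is available:

```cpp
#include <gtest/gtest.h>

// Hypothetical sanity check, not part of the patch: exercises only the
// host-side dtype / reduce-op mappings added by this change.
#if defined(PADDLE_WITH_FLAGCX)
TEST(FlagcxMappingTest, DtypeAndReduceOp) {
  EXPECT_EQ(phi::internal::PDDataTypeToFlagcxDataType(C_DataType::FLOAT16),
            flagcxHalf);
  EXPECT_EQ(phi::internal::PDDataTypeToFlagcxDataType(C_DataType::INT64),
            flagcxInt64);
  EXPECT_EQ(PDReduceOpToFlagcxReduceOp(C_CCLReduceOp::SUM), flagcxSum);
  EXPECT_EQ(PDReduceOpToFlagcxReduceOp(C_CCLReduceOp::AVG), flagcxAvg);
}
#endif
```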
@@ -866,6 +987,18 @@ C_Status XcclReduce(void *send_buf,
                     size_t root,
                     C_CCLComm comm,
                     C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(
+      flagcxReduce(send_buf,
+                   recv_buf,
+                   count,
+                   phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                   PDReduceOpToFlagcxReduceOp(op),
+                   root,
+                   reinterpret_cast<flagcxComm_t>(comm),
+                   reinterpret_cast<flagcxStream_t>(&cudaStream)));
+#else
   NCCL_CHECK(ncclReduce(send_buf,
                         recv_buf,
                         count,
@@ -874,6 +1007,7 @@ C_Status XcclReduce(void *send_buf,
                         root,
                         reinterpret_cast<ncclComm_t>(comm),
                         reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
@@ -883,12 +1017,23 @@ C_Status XcclAllGather(void *send_buf,
                        C_DataType data_type,
                        C_CCLComm comm,
                        C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(
+      flagcxAllGather(send_buf,
+                      recv_buf,
+                      count,
+                      phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                      reinterpret_cast<flagcxComm_t>(comm),
+                      reinterpret_cast<flagcxStream_t>(&cudaStream)));
+#else
   NCCL_CHECK(ncclAllGather(send_buf,
                            recv_buf,
                            count,
                            phi::internal::PDDataTypeToNcclDataType(data_type),
                            reinterpret_cast<ncclComm_t>(comm),
                            reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
@@ -899,6 +1044,17 @@ C_Status XcclReduceScatter(void *send_buf,
                            C_CCLReduceOp op,
                            C_CCLComm comm,
                            C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(
+      flagcxReduceScatter(send_buf,
+                          recv_buf,
+                          count,
+                          phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                          PDReduceOpToFlagcxReduceOp(op),
+                          reinterpret_cast<flagcxComm_t>(comm),
+                          reinterpret_cast<flagcxStream_t>(&cudaStream)));
+#else
   NCCL_CHECK(
       ncclReduceScatter(send_buf,
                         recv_buf,
@@ -907,16 +1063,25 @@ C_Status XcclReduceScatter(void *send_buf,
                         PDReduceOpToNcclReduceOp(op),
                         reinterpret_cast<ncclComm_t>(comm),
                         reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
 C_Status XcclGroupStart() {
+#if defined(PADDLE_WITH_FLAGCX)
+  FLAGCX_CHECK(flagcxGroupStart(reinterpret_cast<flagcxComm_t>(globalComm)));
+#else
   NCCL_CHECK(ncclGroupStart());
+#endif
   return C_SUCCESS;
 }
 
 C_Status XcclGroupEnd() {
+#if defined(PADDLE_WITH_FLAGCX)
+  FLAGCX_CHECK(flagcxGroupEnd(reinterpret_cast<flagcxComm_t>(globalComm)));
+#else
   NCCL_CHECK(ncclGroupEnd());
+#endif
   return C_SUCCESS;
 }
 
@@ -926,12 +1091,24 @@ C_Status XcclSend(void *send_buf,
                   size_t dest_rank,
                   C_CCLComm comm,
                   C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(flagcxSend(send_buf,
+                          count,
+                          phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                          dest_rank,
+                          reinterpret_cast<flagcxComm_t>(comm),
+                          reinterpret_cast<flagcxStream_t>(&cudaStream)));
+  flagcx_handler->devHandle->streamSynchronize(
+      reinterpret_cast<flagcxStream_t>(&cudaStream));
+#else
   NCCL_CHECK(ncclSend(send_buf,
                       count,
                       phi::internal::PDDataTypeToNcclDataType(data_type),
                       dest_rank,
                       reinterpret_cast<ncclComm_t>(comm),
                       reinterpret_cast<cudaStream_t>(stream)));
+#endif
   return C_SUCCESS;
 }
 
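Usage sketch for the point-to-point entry points above (illustrative only; buffers, count, and the comm/stream handles are assumed to come from the other plugin APIs). With PADDLE_WITH_FLAGCX defined, each call also synchronizes the stream through flagcx_handler->devHandle->streamSynchronize, so the exchange below is blocking:

```cpp
// Not part of the patch: how a caller might pair XcclSend/XcclRecv across
// two ranks. The function name and rank layout are hypothetical.
void ExchangeOneBuffer(size_t rank,
                       void *send_buf,
                       void *recv_buf,
                       size_t count,
                       C_CCLComm comm,
                       C_Stream stream) {
  if (rank == 0) {
    XcclSend(send_buf, count, C_DataType::FLOAT32, /*dest_rank=*/1, comm, stream);
    XcclRecv(recv_buf, count, C_DataType::FLOAT32, /*src_rank=*/1, comm, stream);
  } else if (rank == 1) {
    // Opposite order on the peer so the blocking calls cannot deadlock.
    XcclRecv(recv_buf, count, C_DataType::FLOAT32, /*src_rank=*/0, comm, stream);
    XcclSend(send_buf, count, C_DataType::FLOAT32, /*dest_rank=*/0, comm, stream);
  }
}
```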
@@ -941,14 +1118,67 @@ C_Status XcclRecv(void *recv_buf,
                   size_t src_rank,
                   C_CCLComm comm,
                   C_Stream stream) {
+#if defined(PADDLE_WITH_FLAGCX)
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(flagcxRecv(recv_buf,
+                          count,
+                          phi::internal::PDDataTypeToFlagcxDataType(data_type),
+                          src_rank,
+                          reinterpret_cast<flagcxComm_t>(comm),
+                          reinterpret_cast<flagcxStream_t>(&cudaStream)));
+  flagcx_handler->devHandle->streamSynchronize(
+      reinterpret_cast<flagcxStream_t>(&cudaStream));
+#else
   NCCL_CHECK(ncclRecv(recv_buf,
                       count,
                       phi::internal::PDDataTypeToNcclDataType(data_type),
                       src_rank,
                       reinterpret_cast<ncclComm_t>(comm),
                       reinterpret_cast<cudaStream_t>(stream)));
+#endif
+  return C_SUCCESS;
+}
+
+#if defined(PADDLE_WITH_FLAGCX)
+C_Status XcclAllToAll(const void **send_buf,
+                      const size_t *send_count,
+                      const C_DataType *send_dtype,
+                      void **recv_buf,
+                      const size_t *recv_count,
+                      const C_DataType *recv_dtype,
+                      size_t rank,
+                      size_t nranks,
+                      C_CCLComm comm,
+                      C_Stream stream) {
+  flagcxComm_t flagcxComm = reinterpret_cast<flagcxComm_t>(comm);
+  cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
+  FLAGCX_CHECK(flagcxGroupStart(flagcxComm));
+  for (size_t i = 0; i < nranks; i++) {
+    if (send_count[i] > 0) {
+      FLAGCX_CHECK(
+          flagcxSend(const_cast<void *>(send_buf[i]),
+                     send_count[i],
+                     phi::internal::PDDataTypeToFlagcxDataType(send_dtype[i]),
+                     i,
+                     flagcxComm,
+                     reinterpret_cast<flagcxStream_t>(&cudaStream)));
+    }
+    if (recv_count[i] > 0) {
+      FLAGCX_CHECK(
+          flagcxRecv(const_cast<void *>(recv_buf[i]),
+                     recv_count[i],
+                     phi::internal::PDDataTypeToFlagcxDataType(recv_dtype[i]),
+                     i,
+                     flagcxComm,
+                     reinterpret_cast<flagcxStream_t>(&cudaStream)));
+    }
+  }
+  FLAGCX_CHECK(flagcxGroupEnd(flagcxComm));
+  flagcx_handler->devHandle->streamSynchronize(
+      reinterpret_cast<flagcxStream_t>(&cudaStream));
   return C_SUCCESS;
 }
+#endif
 
 void InitPlugin(CustomRuntimeParams *params) {
   PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params);
@@ -959,6 +1189,9 @@ void InitPlugin(CustomRuntimeParams *params) {
          0,
          sizeof(C_DeviceInterface));
 
+#if defined(PADDLE_WITH_FLAGCX)
+  flagcxHandleInit(&flagcx_handler);
+#endif
   params->interface->get_compute_capability = GetComputeCapability;
   params->interface->get_runtime_version = GetRuntimeVersion;
   params->interface->get_driver_version = GetDriverVersion;
@@ -1022,6 +1255,9 @@ void InitPlugin(CustomRuntimeParams *params) {
   params->interface->xccl_reduce = XcclReduce;
   params->interface->xccl_reduce_scatter = XcclReduceScatter;
   params->interface->xccl_send = XcclSend;
+#if defined(PADDLE_WITH_FLAGCX)
+  params->interface->xccl_all_to_all = XcclAllToAll;
+#endif
   params->interface->profiler_collect_trace_data = nullptr;
   params->interface->profiler_initialize = nullptr;
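A rough host-side sketch (not part of the patch) of how the conditional registration could be verified after InitPlugin runs, assuming the loader has already allocated params->interface and filled the version fields checked by PADDLE_CUSTOM_RUNTIME_CHECK_VERSION:

```cpp
#include <cassert>

// Hypothetical check, not part of the patch: the xccl_all_to_all hook is only
// registered when the plugin is built with FlagCX; otherwise it stays null
// because InitPlugin zeroes the whole C_DeviceInterface first.
void CheckXcclAllToAllRegistration(CustomRuntimeParams *params) {
  InitPlugin(params);
#if defined(PADDLE_WITH_FLAGCX)
  assert(params->interface->xccl_all_to_all == XcclAllToAll);
#else
  assert(params->interface->xccl_all_to_all == nullptr);
#endif
}
```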