diff --git a/docs/build.md b/docs/build.md index 3c5c17a89..17f645239 100644 --- a/docs/build.md +++ b/docs/build.md @@ -6,11 +6,6 @@ - Torch 2.4.1 - Clang 19 -#### if for AMD GPU: -- ROCM 6.3.0 -- Torch 2.4.1 with ROCM support - - Dependencies with other versions may also work well, but this is not guaranteed. If you find any problem in installing, please tell us in Issues. @@ -26,10 +21,7 @@ Dependencies with other versions may also work well, but this is not guaranteed. pip3 install black "clang-format==19.1.2" pre-commit ruff yapf==0.43 pip3 install ninja cmake wheel pybind11 cuda-python==12.4 numpy chardet pytest ``` - for AMD GPU, use torch with rocm support and hip-python - ```sh - python3 -m pip install -i https://test.pypi.org/simple hip-python>=6.3.0 - ``` + 4. Apply NVSHMEM fix (Disclaimer: This step is because of NVSHMEM license requirements, it is illegal to release any modified codes or patch.) @@ -84,8 +76,6 @@ Dependencies with other versions may also work well, but this is not guaranteed. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/llvm-project/build/lib ``` - For ROCMSHMEM on AMD GPU, no explicit build required as the building process is integrated with Triton-distributed. - 6. Build Triton-distributed Then you can build Triton-distributed. ```sh @@ -114,20 +104,13 @@ This example runs on a single node with 8 H800 GPUs. ```sh bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/test/nvidia/test_ag_gemm_intra_node.py --case correctness_tma ``` -For AMD CDNA3 GPUs: -```sh -bash ./third_party/distributed/launch_amd.sh ./third_party/distributed/distributed/test/amd/test_ag_gemm_intra_node.py 8192 53248 16384 -``` #### GEMM ReduceScatter example on single node This example runs on a single node with 8 H800 GPUs. 
```sh bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/test/nvidia/test_gemm_rs_multi_node.py 8192 8192 29568 ``` -For AMD CDNA3 GPUs: -```sh -bash ./third_party/distributed/launch_amd.sh ./third_party/distributed/distributed/test/amd/test_gemm_rs_intra_node.py 8192 3584 14336 -``` + #### NVSHMEM example in Triton-distributed ```sh bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/test/nvidia/test_nvshmem_api.py @@ -173,40 +156,4 @@ bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/t # moe rs bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/test/test_moe_reduce_rs_intra_node.py 8192 2048 1536 32 2 bash ./third_party/distributed/launch.sh ./third_party/distributed/distributed/test/test_moe_reduce_rs_intra_node.py 8192 2048 1536 32 2 --check -``` - -## To use Triton-distributed with the AMD backend: -- Starting from the rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4 Docker container -#### Steps: -1. Clone the repo -```sh -git clone https://github.com/ByteDance-Seed/Triton-distributed.git -``` -2. Update submodules -```sh -cd Triton-distributed/ -git submodule update --init --recursive -``` -3. Install dependencies -```sh -sudo apt-get update -y -sudo apt install -y libopenmpi-dev -pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --no-deps -./third_party/rocshmem_bind/build.sh -python3 -m pip install -i https://test.pypi.org/simple hip-python~=6.3.2 (or whatever Rocm version you have) -pip3 install pybind11 -``` -4. 
Build Triton-distributed -```sh -pip3 install -e python --verbose --no-build-isolation -``` -### Test your installation -#### GEMM ReduceScatter example on single node -```sh -bash ./third_party/distributed/launch_amd.sh ./third_party/distributed/distributed/test/amd/test_ag_gemm_intra_node.py 8192 8192 29568 - ``` -and see the following (reduced) output -```sh -torchrun --node_rank=0 --nproc_per_node=8 --nnodes=1 ./third_party/distributed/distributed/test/amd/test_ag_gemm_intra_node.py 8192 8192 29568 -✅ Triton and Torch match -``` +``` \ No newline at end of file diff --git a/docs/build_amd.md b/docs/build_amd.md new file mode 100644 index 000000000..a11d21249 --- /dev/null +++ b/docs/build_amd.md @@ -0,0 +1,54 @@ +# Build Triton-distributed (RocSHMEM) + +## The best practice to use Triton-distributed on AMD GPUs + +- ROCm 6.3.3 +- torch-2.6/2.8 (torch-2.6 has major improvements, compatible with SGLang, vLLM) +- python3.12.8 +- MI300X/MI325X + +Dependencies with other versions may also work well, but this is not guaranteed. If you find any problem in installing, please tell us in Issues. + +## Setup without docker + +1. make sure torch-rocm is installed for ROCm SDK 6.3.3 + ```sh + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3 + ``` +2. install ompi for ROCm SDK + ``` + sudo apt-get update -y && \ + sudo apt install -y libopenmpi-dev + ``` +3. 
install other dependencies + ``` + python3 -m pip install -i https://test.pypi.org/simple hip-python~=6.3.3 # or whatever ROCm version you have + pip3 install pybind11 + ``` + +#### Warning when installing inside an existing docker + +Make sure the following repositories are granted permission to clone submodules + +``` +export TRITON_DIST_HOME=$(readlink -f `pwd`) + +git config --global --add safe.directory $TRITON_DIST_HOME/Triton-distributed/3rdparty/rocshmem +git config --global --add safe.directory $TRITON_DIST_HOME/Triton-distributed/3rdparty/triton +git config --global --add safe.directory $TRITON_DIST_HOME/Triton-distributed + +git submodule update --init --recursive +``` + +## Build + +> python3 python/setup.py build_ext + +## Test + +Currently only a single-node build is supported; multi-node builds will be supported soon. + +- Single node test + ``` + bash ./scripts/launch_amd.sh python/triton_dist/test/amd/test_ag_gemm_intra_node.py 8192 8192 29568 + ``` \ No newline at end of file diff --git a/python/build_helpers.py b/python/build_helpers.py index 8b50d735c..160b757a3 100644 --- a/python/build_helpers.py +++ b/python/build_helpers.py @@ -46,5 +46,8 @@ def copy_apply_patches(): for file in files: source_file = os.path.join(root, file) target_file = os.path.join(target_dir, file) - shutil.copy2(source_file, target_file) + try: + shutil.copy2(source_file, target_file) + except Exception: + shutil.copyfile(source_file, target_file) print(f"Copied {source_file} to {target_file}") diff --git a/shmem/rocshmem_bind/scripts/build_rshm_ipc_single.sh b/shmem/rocshmem_bind/scripts/build_rshm_ipc_single.sh index 515efae72..963013b0e 100755 --- a/shmem/rocshmem_bind/scripts/build_rshm_ipc_single.sh +++ b/shmem/rocshmem_bind/scripts/build_rshm_ipc_single.sh @@ -9,6 +9,8 @@ else install_path=$1 fi +hip_cmake_path="/opt/rocm/lib/cmake/hip;/opt/rocm/lib/cmake/rocprim;/opt/rocm/lib/cmake/rocthrust" + src_path=$(dirname "$(realpath $0)")/../../../3rdparty/rocshmem/ cmake \ @@ -29,6 +31,7 @@ 
cmake \ -DUSE_SINGLE_NODE=ON \ -DUSE_HOST_SIDE_HDP_FLUSH=OFF \ -DBUILD_LOCAL_GPU_TARGET_ONLY=ON \ + -DCMAKE_PREFIX_PATH="$hip_cmake_path" \ $src_path cmake --build . --parallel cmake --install .