diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml
index 0a3fad3d..12c3e5f2 100644
--- a/.github/workflows/publish_devel_image.yml
+++ b/.github/workflows/publish_devel_image.yml
@@ -1,6 +1,17 @@
 name: Publish devel image
 on:
   workflow_dispatch:
+    inputs:
+      push:
+        description: 'Push to Docker Hub'
+        required: false
+        default: 'true'
+
+  # Schedule the workflow to run at 10:00 (UTC) every month.
+  schedule:
+    # Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
+    - cron: '0 10 1 * *'
+
 env:
   # Tells where to store caches.
   CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache
@@ -27,12 +38,28 @@ jobs:
     - name: Create cache directory
       run: mkdir -p $CI_CACHE_DIR/.buildx-cache
 
+    - name: Build devel image for cuda 12.6 (experimental)
+      uses: docker/build-push-action@v5
+      with:
+        context: ./docker
+        file: ./docker/Dockerfile.devel
+        push: ${{ inputs.push || 'true' }}
+        cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+        cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+        build-args: |
+          UBUNTU_VERSION=22.04
+          CUDA_VERSION=12.6
+          GCC_VERSION=12
+        tags: |
+          vectorchai/scalellm_devel:cuda12.6-ubuntu22.04
+          vectorchai/scalellm_devel:cuda12.6
+
     - name: Build devel image for cuda 12.4
       uses: docker/build-push-action@v5
       with:
         context: ./docker
         file: ./docker/Dockerfile.devel
-        push: true
+        push: ${{ inputs.push || 'true' }}
         cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
         cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
         build-args: |
@@ -49,7 +76,7 @@ jobs:
       with:
         context: ./docker
         file: ./docker/Dockerfile.devel
-        push: true
+        push: ${{ inputs.push || 'true' }}
         cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
         cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
         build-args: |
@@ -65,7 +92,7 @@ jobs:
       with:
         context: ./docker
         file: ./docker/Dockerfile.devel
-        push: true
+        push: ${{ inputs.push || 'true' }}
         cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
         cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
         build-args: |
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 682a2eba..3ec0898b 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -21,6 +21,11 @@ on:
 
   workflow_call:
 
+  # Schedule the workflow to run at 9:00 (UTC) every month.
+  schedule:
+    # Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
+    - cron: '0 9 1 * *'
+
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
   contents: read
@@ -58,7 +63,8 @@ jobs:
     if: |
       github.event_name == 'push' ||
       github.event_name == 'workflow_call' ||
-      github.event_name == 'workflow_dispatch'
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule'
     environment:
       name: github-pages
       url: ${{ steps.deployment.outputs.page_url }}
diff --git a/.github/workflows/publish_manylinux_image.yml b/.github/workflows/publish_manylinux_image.yml
index afa6545c..b6ad0f98 100644
--- a/.github/workflows/publish_manylinux_image.yml
+++ b/.github/workflows/publish_manylinux_image.yml
@@ -1,6 +1,17 @@
 name: Publish manylinux image
 on:
   workflow_dispatch:
+    inputs:
+      push:
+        description: 'Push to Docker Hub'
+        required: false
+        default: 'true'
+
+  # Schedule the workflow to run at 9:00 (UTC) every month.
+  schedule:
+    # Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
+    - cron: '0 9 1 * *'
+
 env:
   # Tells where to store caches.
   CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache
@@ -10,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda: ["11.8", "12.1", "12.4"]
+        cuda: ["11.8", "12.1", "12.4", "12.6"]
     runs-on: [self-hosted, linux, build]
     steps:
     - name: Checkout repository
@@ -36,7 +47,7 @@
       with:
         context: ./docker
        file: ./docker/Dockerfile.manylinux
-        push: true
+        push: ${{ inputs.push || 'true' }}
         cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
         cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
         build-args: |
diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh
index 413c08f9..bea67665 100755
--- a/docker/common/install_cuda.sh
+++ b/docker/common/install_cuda.sh
@@ -139,6 +139,39 @@ function install_124 {
     ldconfig
 }
 
+function install_126 {
+    echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+    rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+    # install CUDA 12.6.2 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
+    chmod +x cuda_12.6.2_560.35.03_linux.run
+    ./cuda_12.6.2_560.35.03_linux.run --toolkit --silent
+    rm -f cuda_12.6.2_560.35.03_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_062
+
+    ldconfig
+}
+
 function prune_118 {
     echo "Pruning CUDA 11.8 and cuDNN"
     #####################################################################################
@@ -229,12 +262,46 @@ function prune_124 {
     $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
 
     #####################################################################################
-    # CUDA 12.1 prune visual tools
+    # CUDA 12.4 prune visual tools
     #####################################################################################
     export CUDA_BASE="/usr/local/cuda-12.4/"
     rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }
 
+function prune_126 {
+    echo "Pruning CUDA 12.6"
+    #####################################################################################
+    # CUDA 12.6 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+    if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+        export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.6 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.6/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
@@ -245,6 +312,8 @@ do
         ;;
     12.4) install_124; prune_124
         ;;
+    12.6) install_126; prune_126
+        ;;
     *) echo "bad argument $1"; exit 1
         ;;
     esac
diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh
index c8136682..b2f778bd 100755
--- a/scripts/build_wheel.sh
+++ b/scripts/build_wheel.sh
@@ -23,6 +23,9 @@ export PATH="$HOME/.local/bin:$PATH"
 PYVER="${PYTHON_VERSION//./}"
 export PATH="/opt/python/cp${PYVER}-cp${PYVER}/bin:$PATH"
 
+# update pip
+python -m pip install --upgrade pip
+
 # install PyTorch
 pip install torch==$TORCH_VERSION -i "https://download.pytorch.org/whl/cu${CUDA_VERSION//./}"
 
diff --git a/scripts/run_pytest.sh b/scripts/run_pytest.sh
index 4a47af8f..1f8be1a0 100755
--- a/scripts/run_pytest.sh
+++ b/scripts/run_pytest.sh
@@ -24,6 +24,9 @@ export PATH="$HOME/.local/bin:$PATH"
 PYVER="${PYTHON_VERSION//./}"
 export PATH="/opt/python/cp${PYVER}-cp${PYVER}/bin:$PATH"
 
+# update pip
+python -m pip install --upgrade pip
+
 # install PyTorch
 pip install torch==$TORCH_VERSION -i "https://download.pytorch.org/whl/cu${CUDA_VERSION//./}"
 
@@ -33,6 +36,6 @@ pip install -r requirements-test.txt
 # install scalellm wheel
 pip install dist/*.whl
 
-# run pytest
+# run pytest within the tests directory
 cd tests
 pytest
\ No newline at end of file