Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions .github/workflows/publish_devel_image.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
name: Publish devel image
on:
workflow_dispatch:
inputs:
push:
description: 'Push to Docker Hub'
required: false
default: 'true'

# Schedule the workflow to run at 10:00 (UTC) on the first day of every month.
schedule:
# Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
- cron: '0 10 1 * *'

env:
# Tells where to store caches.
CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache
Expand All @@ -27,12 +38,28 @@ jobs:
- name: Create cache directory
run: mkdir -p $CI_CACHE_DIR/.buildx-cache

# Builds the devel image from docker/Dockerfile.devel for CUDA 12.6 and pushes
# it under the tags listed below unless the `push` input says otherwise.
- name: Build devel image for cuda 12.6 (experimental)
  uses: docker/build-push-action@v5
  with:
    context: ./docker
    file: ./docker/Dockerfile.devel
    # `inputs.push` is only populated for workflow_dispatch runs; any other
    # trigger leaves it empty and falls back to 'true'.
    push: ${{ inputs.push || 'true' }}
    # `with:` values are handed to the action verbatim (no shell expansion),
    # so the cache directory must come from the `env` context; a plain
    # `$CI_CACHE_DIR` would be passed through as a literal string.
    cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache
    cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache
    build-args: |
      UBUNTU_VERSION=22.04
      CUDA_VERSION=12.6
      GCC_VERSION=12
    tags: |
      vectorchai/scalellm_devel:cuda12.6-ubuntu22.04
      vectorchai/scalellm_devel:cuda12.6

- name: Build devel image for cuda 12.4
uses: docker/build-push-action@v5
with:
context: ./docker
file: ./docker/Dockerfile.devel
push: true
push: ${{ inputs.push || 'true' }}
cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
build-args: |
Expand All @@ -49,7 +76,7 @@ jobs:
with:
context: ./docker
file: ./docker/Dockerfile.devel
push: true
push: ${{ inputs.push || 'true' }}
cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
build-args: |
Expand All @@ -65,7 +92,7 @@ jobs:
with:
context: ./docker
file: ./docker/Dockerfile.devel
push: true
push: ${{ inputs.push || 'true' }}
cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
build-args: |
Expand Down
8 changes: 7 additions & 1 deletion .github/workflows/publish_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ on:

workflow_call:

# Schedule the workflow to run at 9:00 (UTC) on the first day of every month.
schedule:
# Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
- cron: '0 9 1 * *'

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
contents: read
Expand Down Expand Up @@ -58,7 +63,8 @@ jobs:
if: |
github.event_name == 'push' ||
github.event_name == 'workflow_call' ||
github.event_name == 'workflow_dispatch'
github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule'
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
Expand Down
15 changes: 13 additions & 2 deletions .github/workflows/publish_manylinux_image.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
name: Publish manylinux image
on:
workflow_dispatch:
inputs:
push:
description: 'Push to Docker Hub'
required: false
default: 'true'

# Schedule the workflow to run at 9:00 (UTC) on the first day of every month.
schedule:
# Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
- cron: '0 9 1 * *'

env:
# Tells where to store caches.
CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache
Expand All @@ -10,7 +21,7 @@ jobs:
strategy:
fail-fast: false
matrix:
cuda: ["11.8", "12.1", "12.4"]
cuda: ["11.8", "12.1", "12.4", "12.6"]
runs-on: [self-hosted, linux, build]
steps:
- name: Checkout repository
Expand All @@ -36,7 +47,7 @@ jobs:
with:
context: ./docker
file: ./docker/Dockerfile.manylinux
push: true
push: ${{ inputs.push || 'true' }}
cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
build-args: |
Expand Down
71 changes: 70 additions & 1 deletion docker/common/install_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,39 @@ function install_124 {
ldconfig
}

# Install the CUDA 12.6.2 toolkit into /usr/local/cuda-12.6 and layer cuDNN,
# NCCL and cuSparseLt on top of it.
#
# Reads:  CUDNN_VERSION (version string embedded in the cuDNN archive name)
#         and NCCL_VERSION (branch/tag of NVIDIA/nccl to clone). Neither is
#         set inside this function.
# Calls:  install_cusparselt_062 (not defined in this function).
# Effects: removes and recreates /usr/local/cuda-12.6, points the
#         /usr/local/cuda symlink at it, copies cuDNN/NCCL headers and
#         libraries into it, and runs ldconfig.
# Returns: non-zero as soon as a working-directory change fails, so that the
#         download/copy/cleanup steps never run from the wrong directory.
function install_126 {
  echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
  # install CUDA 12.6.2 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
  chmod +x cuda_12.6.2_560.35.03_linux.run
  ./cuda_12.6.2_560.35.03_linux.run --toolkit --silent
  rm -f cuda_12.6.2_560.35.03_linux.run
  # Make sure /usr/local/cuda ends up as a symlink to the 12.6 toolkit.
  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  local cudnn_archive="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive"
  # Guard each step explicitly: in a `mkdir ... && cd ...` list a failing
  # mkdir is not caught by `set -e`, and everything below (including the
  # later `cd ..` and `rm -rf`) would then run in the wrong directory.
  mkdir tmp_cudnn || return 1
  cd tmp_cudnn || return 1
  wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_archive}.tar.xz" -O "${cudnn_archive}.tar.xz"
  tar xf "${cudnn_archive}.tar.xz"
  cp -a "${cudnn_archive}"/include/* /usr/local/cuda/include/
  cp -a "${cudnn_archive}"/lib/* /usr/local/cuda/lib64/
  cd .. || return 1
  rm -rf tmp_cudnn

  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b "$NCCL_VERSION" --depth 1 https://github.com/NVIDIA/nccl.git
  # Same reasoning as above: never build or copy from outside the checkout.
  cd nccl || return 1
  make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd .. || return 1
  rm -rf nccl

  install_cusparselt_062

  ldconfig
}

function prune_118 {
echo "Pruning CUDA 11.8 and cuDNN"
#####################################################################################
Expand Down Expand Up @@ -229,12 +262,46 @@ function prune_124 {
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

#####################################################################################
# CUDA 12.1 prune visual tools
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}

# Shrink the CUDA 12.6 install: strip device code for unlisted GPU
# architectures out of the static libraries with nvprune, then delete the
# bundled visual tools.
#
# Reads:   OVERRIDE_GENCODE / OVERRIDE_GENCODE_CUDNN - when non-empty they
#          replace the default -gencode lists below.
# Exports: NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN, CUDA_BASE.
function prune_126 {
  echo "Pruning CUDA 12.6"
  #####################################################################################
  # CUDA 12.6 prune static libs
  #####################################################################################
  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"

  # Default architecture lists; GENCODE_CUDNN additionally keeps sm_61.
  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

  if [[ -n "$OVERRIDE_GENCODE" ]]; then
    export GENCODE=$OVERRIDE_GENCODE
  fi
  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
    export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
  fi

  # all CUDA libs except CuDNN and CuBLAS
  # The pattern is anchored with `$` so only static archives (*.a) are
  # selected; an unanchored "\.a" also matches any other file whose name merely
  # contains ".a". The variables are expanded by this shell before xargs
  # starts, so each spawned `bash -c` receives a fully spelled-out command.
  ls $CUDA_LIB_DIR/ | grep '\.a$' | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
    | xargs -I {} bash -c \
      "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"

  # prune CuDNN and CuBLAS
  # Only the two cuBLAS static archives are rewritten here, using the
  # GENCODE_CUDNN list. $GENCODE_CUDNN is left unquoted on purpose so that it
  # splits into separate -gencode arguments.
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

  #####################################################################################
  # CUDA 12.6 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.6/"
  # Directory names are version-specific; `rm -rf` does not complain about
  # any that are absent.
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}

# idiomatic parameter and option handling in sh
while test $# -gt 0
do
Expand All @@ -245,6 +312,8 @@ do
;;
12.4) install_124; prune_124
;;
12.6) install_126; prune_126
;;
*) echo "bad argument $1"; exit 1
;;
esac
Expand Down
3 changes: 3 additions & 0 deletions scripts/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ export PATH="$HOME/.local/bin:$PATH"
PYVER="${PYTHON_VERSION//./}"
export PATH="/opt/python/cp${PYVER}-cp${PYVER}/bin:$PATH"

# update pip
python -m pip install --upgrade pip

# install PyTorch
pip install torch==$TORCH_VERSION -i "https://download.pytorch.org/whl/cu${CUDA_VERSION//./}"

Expand Down
5 changes: 4 additions & 1 deletion scripts/run_pytest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ export PATH="$HOME/.local/bin:$PATH"
PYVER="${PYTHON_VERSION//./}"
export PATH="/opt/python/cp${PYVER}-cp${PYVER}/bin:$PATH"

# update pip
python -m pip install --upgrade pip

# install PyTorch
pip install torch==$TORCH_VERSION -i "https://download.pytorch.org/whl/cu${CUDA_VERSION//./}"

Expand All @@ -33,6 +36,6 @@ pip install -r requirements-test.txt
# install scalellm wheel
pip install dist/*.whl

# run pytest
# run pytest within the tests directory
cd tests
pytest