Skip to content

Commit bfb66c6

Browse files
authored
Add CI workflow to run tests on AWS (#7753)
This PR migrates CI workflows for unit tests to AWS. v1 tests use 4xL40S and accelerate tests use 1xL40S. @sfc-gh-truwase This looks to be working now. We could disable the Modal tests after this PR is merged, or keep both for a while just in case. --------- Signed-off-by: Masahiro Tanaka <mtanaka@anyscale.com>
1 parent c0e9b2c commit bfb66c6

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (HuggingFace Accelerate Integration)
#
# Runs the same tests as modal-accelerate.yml but on AWS self-hosted runners.
# Tests DeepSpeed integration with HuggingFace Accelerate library.
# Uses 1x NVIDIA L40S GPU (runner label: l40s-1gpu).
# NOTE(review): the original header said "4x ... g6e.12xlarge", which
# contradicts the single-GPU runner label below; corrected to 1x.
################################################################################

name: aws-accelerate

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

  push:
    branches:
      - master

  pull_request:
    # Skip CI for docs-only and inference-v2-only changes.
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    branches:
      - master

# Cancel in-flight runs for the same ref when a new one starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  accelerate-tests:
    name: Accelerate Integration Tests
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-1gpu, aws]

    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # Large shared memory segment for PyTorch DataLoader workers / NCCL.
      options: --gpus all --shm-size "32G"

    env:
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"

    steps:
      # The CUDA base image ships without git/python; install them first.
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      # Pin PyTorch to the CUDA 12.6 wheel index to match the container image.
      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install datasets

      # Surface GPU/CUDA/PyTorch details in the log for debugging CI failures.
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"

      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .
          ds_report
          # Debug: Check captured torch_info values
          python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

      # Accelerate's DeepSpeed tests live in its own repo; clone and install
      # with the [testing] extra to pull in its test dependencies.
      - name: Clone and install Accelerate
        run: |
          git clone https://github.com/huggingface/accelerate
          pip install "./accelerate[testing]"

      - name: Run Accelerate DeepSpeed tests
        run: |
          pytest --verbose ./accelerate/tests/deepspeed
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (PyTorch Latest)
#
# Runs the same tests as modal-torch-latest.yml but on AWS self-hosted runners.
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances (runner label: l40s-4gpu).
################################################################################

name: aws-torch-latest

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

  push:
    branches:
      - master

  pull_request:
    # Skip CI for docs-only and inference-v2-only changes.
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    branches:
      - master

# Cancel in-flight runs for the same ref when a new one starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    name: Unit Tests (V1)
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]

    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # Large shared memory segment for PyTorch DataLoader workers / NCCL.
      options: --gpus all --shm-size "32G"

    env:
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"

    steps:
      # The CUDA base image ships without git/python; install them first.
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      # Pin PyTorch to the CUDA 12.6 wheel index to match the container image.
      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt

      # Surface GPU/CUDA/PyTorch details in the log for debugging CI failures.
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"

      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .
          ds_report
          # Debug: Check captured torch_info values
          python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

      # -n 4: one pytest-xdist worker per GPU; --forked isolates each test in
      # its own process so a CUDA crash cannot take down the whole run.
      - name: Run unit tests
        run: |
          pytest -n 4 --forked --verbose tests/unit/v1/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}

0 commit comments

Comments
 (0)