Skip to content

Commit 961c724

Browse files
authored
Merge branch 'hpcaitech:main' into fix-dp-rank
2 parents 7bdd7d9 + edd65a8 commit 961c724

File tree

107 files changed

+2995
-1014
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+2995
-1014
lines changed

.compatibility

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
2.2.2-12.1.0
21
2.3.0-12.1.0
32
2.4.0-12.4.1
3+
2.5.1-12.4.1

.cuda_ext.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
22
"build": [
33
{
4-
"torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121",
5-
"cuda_image": "hpcaitech/cuda-conda:12.1"
4+
"torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
5+
"cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.1"
66
},
77
{
8-
"torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124",
9-
"cuda_image": "hpcaitech/cuda-conda:12.4"
8+
"torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
9+
"cuda_image": "image-cloud.luchentech.com/hpcaitech/cuda-conda:12.4"
1010
}
1111
]
1212
}

.github/workflows/build_on_pr.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
3535
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
3636
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
37-
runs-on: ubuntu-latest
37+
runs-on: [self-hosted, ubuntu-latest]
3838
concurrency:
3939
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
4040
cancel-in-progress: true
@@ -87,10 +87,10 @@ jobs:
8787
name: Build and Test Colossal-AI
8888
needs: detect
8989
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
90-
runs-on: [self-hosted, gpu]
90+
runs-on: [self-hosted, ubuntu-latest]
9191
container:
92-
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
93-
options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
92+
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
93+
options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch
9494
timeout-minutes: 90
9595
defaults:
9696
run:
@@ -138,6 +138,10 @@ jobs:
138138
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
139139
fi
140140
141+
- name: Install flash-attention
142+
run: |
143+
pip install flash-attn==2.7.4.post1 --no-build-isolation
144+
141145
- name: Install Colossal-AI
142146
run: |
143147
BUILD_EXT=1 pip install -v -e .

.github/workflows/build_on_schedule.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ name: Build on Schedule
33
on:
44
schedule:
55
# run at 00:00 of every Sunday
6-
- cron: "0 0 * * *"
6+
- cron: "0 0 * * 0"
77
workflow_dispatch:
88

99
jobs:
1010
build:
1111
name: Build and Test Colossal-AI
1212
if: github.repository == 'hpcaitech/ColossalAI'
13-
runs-on: [self-hosted, gpu]
13+
runs-on: [self-hosted, ubuntu-latest]
1414
container:
15-
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
15+
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
1616
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
1717
timeout-minutes: 90
1818
steps:
@@ -51,6 +51,10 @@ jobs:
5151
with:
5252
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
5353

54+
- name: Install flash-attention
55+
run: |
56+
pip install flash-attn==2.7.4.post1 --no-build-isolation
57+
5458
- name: Install Colossal-AI
5559
if: steps.check-avai.outputs.avai == 'true'
5660
run: |

.github/workflows/close_inactive.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
jobs:
88
close-issues:
99
if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
10-
runs-on: ubuntu-latest
10+
runs-on: [self-hosted, ubuntu-latest]
1111
permissions:
1212
issues: write
1313
pull-requests: write

.github/workflows/compatiblity_test_on_dispatch.yml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ on:
1515
jobs:
1616
matrix_preparation:
1717
name: Prepare Container List
18-
runs-on: ubuntu-latest
18+
runs-on: [self-hosted, ubuntu-latest]
1919
outputs:
2020
matrix: ${{ steps.set-matrix.outputs.matrix }}
2121
steps:
@@ -31,7 +31,7 @@ jobs:
3131
do
3232
for cv in $CUDA_VERSIONS
3333
do
34-
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
34+
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tv}-${cv}\"")
3535
done
3636
done
3737
@@ -44,7 +44,7 @@ jobs:
4444
name: Test for PyTorch Compatibility
4545
needs: matrix_preparation
4646
if: github.repository == 'hpcaitech/ColossalAI'
47-
runs-on: [self-hosted, 8-gpu]
47+
runs-on: [self-hosted, ubuntu-latest]
4848
strategy:
4949
fail-fast: false
5050
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -73,7 +73,18 @@ jobs:
7373
7474
- name: Unit Testing
7575
run: |
76-
PYTHONPATH=$PWD pytest --durations=0 tests
76+
PYTHONPATH=$PWD pytest \
77+
-m "not largedist" \
78+
--durations=0 \
79+
--ignore tests/test_analyzer \
80+
--ignore tests/test_auto_parallel \
81+
--ignore tests/test_fx \
82+
--ignore tests/test_autochunk \
83+
--ignore tests/test_gptq \
84+
--ignore tests/test_infer_ops \
85+
--ignore tests/test_legacy \
86+
--ignore tests/test_smoothquant \
87+
tests/
7788
env:
7889
DATA: /data/scratch/cifar-10
7990
LD_LIBRARY_PATH: /github/home/.tensornvme/lib

.github/workflows/compatiblity_test_on_pr.yml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
jobs:
1010
matrix_preparation:
1111
name: Prepare Container List
12-
runs-on: ubuntu-latest
12+
runs-on: [self-hosted, ubuntu-latest]
1313
outputs:
1414
matrix: ${{ steps.set-matrix.outputs.matrix }}
1515
concurrency:
@@ -23,7 +23,7 @@ jobs:
2323
DOCKER_IMAGE=()
2424
2525
while read tag; do
26-
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
26+
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"")
2727
done <.compatibility
2828
2929
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
@@ -35,7 +35,7 @@ jobs:
3535
name: Test for PyTorch Compatibility
3636
needs: matrix_preparation
3737
if: github.repository == 'hpcaitech/ColossalAI'
38-
runs-on: [self-hosted, 8-gpu]
38+
runs-on: [self-hosted, ubuntu-latest]
3939
strategy:
4040
fail-fast: false
4141
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -67,7 +67,18 @@ jobs:
6767
6868
- name: Unit Testing
6969
run: |
70-
PYTHONPATH=$PWD pytest --durations=0 tests
70+
PYTHONPATH=$PWD pytest \
71+
-m "not largedist" \
72+
--durations=0 \
73+
--ignore tests/test_analyzer \
74+
--ignore tests/test_auto_parallel \
75+
--ignore tests/test_fx \
76+
--ignore tests/test_autochunk \
77+
--ignore tests/test_gptq \
78+
--ignore tests/test_infer_ops \
79+
--ignore tests/test_legacy \
80+
--ignore tests/test_smoothquant \
81+
tests/
7182
env:
7283
DATA: /data/scratch/cifar-10
7384
LD_LIBRARY_PATH: /github/home/.tensornvme/lib

.github/workflows/compatiblity_test_on_schedule.yml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
jobs:
1010
matrix_preparation:
1111
name: Prepare Container List
12-
runs-on: ubuntu-latest
12+
runs-on: [self-hosted, ubuntu-latest]
1313
outputs:
1414
matrix: ${{ steps.set-matrix.outputs.matrix }}
1515
steps:
@@ -20,7 +20,7 @@ jobs:
2020
DOCKER_IMAGE=()
2121
2222
while read tag; do
23-
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"")
23+
DOCKER_IMAGE+=("\"image-cloud.luchentech.com/hpcaitech/pytorch-cuda:${tag}\"")
2424
done <.compatibility
2525
2626
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
@@ -32,7 +32,7 @@ jobs:
3232
name: Test for PyTorch Compatibility
3333
needs: matrix_preparation
3434
if: github.repository == 'hpcaitech/ColossalAI'
35-
runs-on: [self-hosted, 8-gpu]
35+
runs-on: [self-hosted, ubuntu-latest]
3636
strategy:
3737
fail-fast: false
3838
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -61,7 +61,18 @@ jobs:
6161
6262
- name: Unit Testing
6363
run: |
64-
PYTHONPATH=$PWD pytest --durations=0 tests
64+
PYTHONPATH=$PWD pytest \
65+
-m "not largedist" \
66+
--durations=0 \
67+
--ignore tests/test_analyzer \
68+
--ignore tests/test_auto_parallel \
69+
--ignore tests/test_fx \
70+
--ignore tests/test_autochunk \
71+
--ignore tests/test_gptq \
72+
--ignore tests/test_infer_ops \
73+
--ignore tests/test_legacy \
74+
--ignore tests/test_smoothquant \
75+
tests/
6576
env:
6677
DATA: /data/scratch/cifar-10
6778
LD_LIBRARY_PATH: /github/home/.tensornvme/lib

.github/workflows/cuda_ext_check_before_merge.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
matrix_preparation:
1111
name: Prepare Container List
1212
if: github.repository == 'hpcaitech/ColossalAI'
13-
runs-on: ubuntu-latest
13+
runs-on: [self-hosted, ubuntu-latest]
1414
outputs:
1515
matrix: ${{ steps.set-matrix.outputs.matrix }}
1616
steps:
@@ -24,7 +24,7 @@ jobs:
2424
build:
2525
name: Release bdist wheels
2626
needs: matrix_preparation
27-
runs-on: [self-hosted, gpu]
27+
runs-on: [self-hosted, ubuntu-latest]
2828
strategy:
2929
fail-fast: false
3030
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}

.github/workflows/doc_build_on_schedule_after_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
build-doc:
1212
name: Trigger Documentation Build Workflow
1313
if: github.repository == 'hpcaitech/ColossalAI'
14-
runs-on: ubuntu-latest
14+
runs-on: [self-hosted, ubuntu-latest]
1515
steps:
1616
- name: trigger workflow in ColossalAI-Documentation
1717
run: |

0 commit comments

Comments
 (0)