Skip to content

Commit b755104

Browse files
authored
Merge branch 'master' into loadams/reenable-py311-312
2 parents 6b7dd7e + b4e74a9 commit b755104

File tree

244 files changed

+12256
-1670
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

244 files changed

+12256
-1670
lines changed

.github/workflows/modal-accelerate.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: modal-accelerate
33
# This CI is running on modal.com's GPUs.
44
#
55
# It's set up here on github actions and then the cloned repo is sent to modal and everything
6-
# happens on their hw - see deepspeed/modal_ci/accelerate.py for where the actual vm is loaded, updated and the tests are
6+
# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
77
# run.
88
#
99
# Both files are annotated to explain what's important and how one might change or update things if needed.
@@ -20,18 +20,23 @@ on:
2020
branches:
2121
- master
2222

23-
pull_request:
23+
# Switch to `pull_request` when a PR needs to change the CI job's Python script;
24+
# otherwise GitHub uses the master version of the CI files and ignores the PR's modifications.
25+
# Alternatively, use the modal CLI to test this job from your own host, which requires setting up
26+
# modal secrets.
27+
# pull_request:
28+
pull_request_target:
2429
paths-ignore:
2530
- 'docs/**'
2631
- 'blogs/**'
2732
- 'deepspeed/inference/v2/**'
2833
- 'tests/unit/inference/v2/**'
29-
types: [draft, opened, ready_for_review, synchronize]
34+
types: [review_requested, ready_for_review, synchronize]
3035
branches:
3136
- master
3237

3338
concurrency:
34-
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
39+
group: ${{ github.workflow }}-${{ github.ref }}
3540
cancel-in-progress: true
3641

3742
jobs:

.github/workflows/modal-torch-latest.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: modal-torch-latest
33
# This CI is running on modal.com's GPUs.
44
#
55
# It's set up here on github actions and then the cloned repo is sent to modal and everything
6-
# happens on their hw - see deepspeed/modal_ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
6+
# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
77
# run.
88
#
99
# Both files are annotated to explain what's important and how one might change or update things if needed.
@@ -16,22 +16,23 @@ name: modal-torch-latest
1616

1717
on:
1818
workflow_dispatch:
19+
1920
push:
2021
branches:
2122
- master
2223

23-
pull_request:
24+
pull_request_target:
2425
paths-ignore:
2526
- 'docs/**'
2627
- 'blogs/**'
2728
- 'deepspeed/inference/v2/**'
2829
- 'tests/unit/inference/v2/**'
29-
types: [draft, opened, ready_for_review, synchronize]
30+
types: [review_requested, ready_for_review, synchronize]
3031
branches:
3132
- master
3233

3334
concurrency:
34-
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
35+
group: ${{ github.workflow }}-${{ github.ref }}
3536
cancel-in-progress: true
3637

3738
jobs:
Lines changed: 53 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,53 @@
1-
name: nv-lightning-v100
2-
3-
on:
4-
workflow_dispatch:
5-
pull_request:
6-
paths-ignore:
7-
- 'docs/**'
8-
- 'blogs/**'
9-
- 'deepspeed/inference/v2/**'
10-
- 'tests/unit/inference/v2/**'
11-
merge_group:
12-
branches: [ master ]
13-
schedule:
14-
- cron: "0 0 * * *"
15-
16-
concurrency:
17-
group: ${{ github.workflow }}-${{ github.ref }}
18-
cancel-in-progress: true
19-
20-
jobs:
21-
unit-tests:
22-
runs-on: [self-hosted, nvidia, cu124, v100]
23-
24-
steps:
25-
- uses: actions/checkout@v4
26-
27-
- id: setup-venv
28-
uses: ./.github/workflows/setup-venv
29-
30-
- name: Install pytorch
31-
run: |
32-
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
33-
python -c "import torch; print('torch:', torch.__version__, torch)"
34-
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
35-
36-
- name: Install deepspeed
37-
run: |
38-
pip install .[dev,autotuning]
39-
ds_report
40-
41-
- name: Python environment
42-
run: |
43-
pip list
44-
45-
- name: PyTorch Lightning Tests
46-
run: |
47-
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
48-
pip install pytorch-lightning
49-
pip install "protobuf<4.21.0"
50-
cd tests
51-
pytest $PYTEST_OPTS lightning/
1+
# name: nv-lightning-v100
2+
3+
# disabled because the v100s are no longer available - this needs to be ported to modal, dropping the v100 requirement
4+
5+
# on:
6+
# workflow_dispatch:
7+
# pull_request:
8+
# paths-ignore:
9+
# - 'docs/**'
10+
# - 'blogs/**'
11+
# - 'deepspeed/inference/v2/**'
12+
# - 'tests/unit/inference/v2/**'
13+
# merge_group:
14+
# branches: [ master ]
15+
# schedule:
16+
# - cron: "0 0 * * *"
17+
18+
# concurrency:
19+
# group: ${{ github.workflow }}-${{ github.ref }}
20+
# cancel-in-progress: true
21+
22+
# jobs:
23+
# unit-tests:
24+
# runs-on: [self-hosted, nvidia, cu124, v100]
25+
26+
# steps:
27+
# - uses: actions/checkout@v4
28+
29+
# - id: setup-venv
30+
# uses: ./.github/workflows/setup-venv
31+
32+
# - name: Install pytorch
33+
# run: |
34+
# pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
35+
# python -c "import torch; print('torch:', torch.__version__, torch)"
36+
# python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
37+
38+
# - name: Install deepspeed
39+
# run: |
40+
# pip install .[dev,autotuning]
41+
# ds_report
42+
43+
# - name: Python environment
44+
# run: |
45+
# pip list
46+
47+
# - name: PyTorch Lightning Tests
48+
# run: |
49+
# unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
50+
# pip install pytorch-lightning
51+
# pip install "protobuf<4.21.0"
52+
# cd tests
53+
# pytest $PYTEST_OPTS lightning/

.github/workflows/xpu-max1100.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
pip install --upgrade pip
5252
pip install py-cpuinfo
5353
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
54-
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us
54+
pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us --trusted-host pytorch-extension.intel.com
5555
pip install .[dev,autotuning]
5656
5757
- name: Check container state
@@ -72,12 +72,7 @@ jobs:
7272
export I_MPI_SHM=off
7373
pytest --verbose accelerator/*
7474
pytest --verbose autotuning/*
75-
pytest --verbose checkpoint/test_reshape_checkpoint.py
76-
pytest --verbose checkpoint/test_moe_checkpoint.py
77-
pytest --verbose checkpoint/test_shared_weights.py
78-
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
7975
pytest --verbose model_parallelism/*
80-
pytest --verbose moe/test_moe_tp.py
8176
pytest --verbose monitor/*
8277
pytest --verbose utils/*
8378
pytest --verbose runtime/test_ds_config_model.py
@@ -87,7 +82,4 @@ jobs:
8782
pytest --verbose runtime/zero/test_zeropp.py
8883
pytest --verbose runtime/test_autocast.py
8984
pytest --verbose runtime/test_data.py
90-
pytest --verbose runtime/test_runtime_utils.py
91-
pytest --verbose runtime/activation_checkpointing/*
92-
pytest --verbose runtime/utils/*
9385
pytest --verbose runtime/zero/test_zero_dynamic_class.py

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,6 @@ tests/unit/saved_checkpoint/
6060
*.hip
6161
*.cuh
6262
*hip_layers.h
63+
64+
# virtual env directory for format
65+
venv

COMMITTERS.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
| Name | GitHub ID | Affiliation |
44
|--- | ---- | --- |
5-
| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft |
5+
| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Snowflake |
66
| Logan Adams | [loadams](https://github.com/loadams) | Microsoft |
7-
| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft |
7+
| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Anyscale |
88
| Jeff Rasley | [jeffra](https://github.com/jeffra) | Snowflake |
99
| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC |
1010
| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD |
1111
| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory |
1212
| Zhipeng Wang | [PKUWZP](https://github.com/PKUWZP) | LinkedIn |
13+
| Guokai Ma | [delock](https://github.com/delock) | Intel |

0 commit comments

Comments
 (0)