deepspeedai
diff --git a/‎.github/workflows/modal-accelerate.yml‎
Lines changed: 9 additions & 4 deletions b/‎.github/workflows/modal-accelerate.yml‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎.github/workflows/modal-torch-latest.yml‎
Lines changed: 5 additions & 4 deletions b/‎.github/workflows/modal-torch-latest.yml‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎.github/workflows/nv-lightning-v100.yml‎
Lines changed: 53 additions & 51 deletions b/‎.github/workflows/nv-lightning-v100.yml‎
Lines changed: 53 additions & 51 deletions
diff --git a/‎.github/workflows/xpu-max1100.yml‎
Lines changed: 1 addition & 9 deletions b/‎.github/workflows/xpu-max1100.yml‎
Lines changed: 1 addition & 9 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎COMMITTERS.md‎
Lines changed: 3 additions & 2 deletions b/‎COMMITTERS.md‎
Lines changed: 3 additions & 2 deletions
@@ -3,7 +3,7 @@ name: modal-accelerate
 # This CI is running on modal.com's GPUs.
 #
 # It's set up here on github actions and then the cloned repo is sent to modal and everything
-# happens on their hw - see deepspeed/modal_ci/accelerate.py for where the actual vm is loaded, updated and the tests are
+# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
 # run.
 #
 # Both files are annotated to what's important and how one might change or update things if needed.
@@ -20,18 +20,23 @@ on:
     branches:
       - master
 
-  pull_request:
+  # Temporarily switch to `pull_request` when a PR needs to change this CI job's python script:
+  # with `pull_request_target` GH uses the master version of the CI files and ignores the
+  # modifications in the PR. The alternative is to run this job from your own host with the
+  # modal cli, which requires setting up modal secrets.
+  # pull_request:
+  pull_request_target:
     paths-ignore:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
       - 'tests/unit/inference/v2/**'
-    types: [draft, opened, ready_for_review, synchronize]
+    types: [review_requested, ready_for_review, synchronize]
     branches:
       - master
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  # per-PR: github.ref is the base branch under pull_request_target
   cancel-in-progress: true
 
 jobs:
 
@@ -3,7 +3,7 @@ name: modal-torch-latest
 # This CI is running on modal.com's GPUs.
 #
 # It's set up here on github actions and then the cloned repo is sent to modal and everything
-# happens on their hw - see deepspeed/modal_ci/torch_latest.py  for where the actual vm is loaded, updated and the tests are
+# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
 # run.
 #
 # Both files are annotated to what's important and how one might change or update things if needed.
@@ -16,22 +16,23 @@ name: modal-torch-latest
 
 on:
   workflow_dispatch:
+
   push:
     branches:
       - master
 
-  pull_request:
+  pull_request_target:
     paths-ignore:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
       - 'tests/unit/inference/v2/**'
-    types: [draft, opened, ready_for_review, synchronize]
+    types: [review_requested, ready_for_review, synchronize]
     branches:
       - master
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  # per-PR: github.ref is the base branch under pull_request_target
   cancel-in-progress: true
 
 jobs:
 
@@ -1,51 +1,53 @@
-name: nv-lightning-v100
-
-on:
-  workflow_dispatch:
-  pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
-      - 'deepspeed/inference/v2/**'
-      - 'tests/unit/inference/v2/**'
-  merge_group:
-    branches: [ master ]
-  schedule:
-    - cron: "0 0 * * *"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  unit-tests:
-    runs-on: [self-hosted, nvidia, cu124, v100]
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - id: setup-venv
-        uses: ./.github/workflows/setup-venv
-
-      - name: Install pytorch
-        run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
-          python -c "import torch; print('torch:', torch.__version__, torch)"
-          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
-      - name: Install deepspeed
-        run: |
-          pip install .[dev,autotuning]
-          ds_report
-
-      - name: Python environment
-        run: |
-          pip list
-
-      - name: PyTorch Lightning Tests
-        run: |
-          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          pip install pytorch-lightning
-          pip install "protobuf<4.21.0"
-          cd tests
-          pytest $PYTEST_OPTS lightning/
+# name: nv-lightning-v100
+
+# Disabled because the v100 runners are no longer available - this job needs to be ported to modal, dropping the v100 dependency.
+
+# on:
+#   workflow_dispatch:
+#   pull_request:
+#     paths-ignore:
+#       - 'docs/**'
+#       - 'blogs/**'
+#       - 'deepspeed/inference/v2/**'
+#       - 'tests/unit/inference/v2/**'
+#   merge_group:
+#     branches: [ master ]
+#   schedule:
+#     - cron: "0 0 * * *"
+
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.ref }}
+#   cancel-in-progress: true
+
+# jobs:
+#   unit-tests:
+#     runs-on: [self-hosted, nvidia, cu124, v100]
+
+#     steps:
+#       - uses: actions/checkout@v4
+
+#       - id: setup-venv
+#         uses: ./.github/workflows/setup-venv
+
+#       - name: Install pytorch
+#         run: |
+#           pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
+#           python -c "import torch; print('torch:', torch.__version__, torch)"
+#           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+#       - name: Install deepspeed
+#         run: |
+#           pip install .[dev,autotuning]
+#           ds_report
+
+#       - name: Python environment
+#         run: |
+#           pip list
+
+#       - name: PyTorch Lightning Tests
+#         run: |
+#           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+#           pip install pytorch-lightning
+#           pip install "protobuf<4.21.0"
+#           cd tests
+#           pytest $PYTEST_OPTS lightning/
@@ -51,7 +51,7 @@ jobs:
         pip install --upgrade pip
         pip install py-cpuinfo
         pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
-        pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us
+        pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us --trusted-host pytorch-extension.intel.com
         pip install .[dev,autotuning]
 
     - name: Check container state
@@ -72,12 +72,7 @@ jobs:
         export I_MPI_SHM=off
         pytest --verbose accelerator/*
         pytest --verbose autotuning/*
-        pytest --verbose checkpoint/test_reshape_checkpoint.py
-        pytest --verbose checkpoint/test_moe_checkpoint.py
-        pytest --verbose checkpoint/test_shared_weights.py
-        pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
         pytest --verbose model_parallelism/*
-        pytest --verbose moe/test_moe_tp.py
         pytest --verbose monitor/*
         pytest --verbose utils/*
         pytest --verbose runtime/test_ds_config_model.py
@@ -87,7 +82,4 @@ jobs:
         pytest --verbose runtime/zero/test_zeropp.py
         pytest --verbose runtime/test_autocast.py
         pytest --verbose runtime/test_data.py
-        pytest --verbose runtime/test_runtime_utils.py
-        pytest --verbose runtime/activation_checkpointing/*
-        pytest --verbose runtime/utils/*
         pytest --verbose runtime/zero/test_zero_dynamic_class.py
@@ -60,3 +60,6 @@ tests/unit/saved_checkpoint/
 *.hip
 *.cuh
 *hip_layers.h
+
+# virtual env directory for format
+venv
@@ -2,11 +2,12 @@
 
 | Name | GitHub ID | Affiliation
 |--- | ---- | --- |
-| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase)     | Microsoft |
+| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase)     | SnowFlake |
 | Logan Adams     | [loadams](https://github.com/loadams)      | Microsoft |
-| Masahiro Tanaka | [tohtana](https://github.com/tohtana)      | Microsoft |
+| Masahiro Tanaka | [tohtana](https://github.com/tohtana)      | Anyscale |
 | Jeff Rasley     | [jeffra](https://github.com/jeffra)       | SnowFlake  |
 | Minjia Zhang    | [minjiazhang](https://github.com/minjiazhang)  | UIUC  |
 | Ashwin Aji      | [ashwinma](https://github.com/ashwinma)        | AMD   |
 | Sam Foreman     | [saforem2](https://github.com/saforem2)        | Argonne National Laboratory |
 | Zhipeng Wang    | [PKUWZP](https://github.com/PKUWZP)       | LinkedIn  |
+| Guokai Ma       | [delock](https://github.com/delock)       | Intel  |