# Web page text captured together with this workflow file (not part of the workflow):
# Skip to content

# add lb default

# add lb default #1276

# Workflow file for this run

################################################################################
# This file is auto-generated from the .j2 file via generate_github_workflows.py. Do not edit manually.
################################################################################
name: PR Test

# Triggers: pull requests against main (on new commits or when a label is
# added) and manual dispatch with an optional `infinite_run` switch.
on:
  # Do not run CI on push to reduce CI time
  # push:
  #   branches: [main]
  pull_request:
    branches:
      - main
    types:
      - synchronize
      - labeled
  workflow_dispatch:
    inputs:
      infinite_run:
        description: 'Run training infinitely'
        required: false
        type: boolean
        default: false

# Runs are grouped per workflow and pull request number (falling back to the
# ref when there is no pull request); a newer run in the same group cancels the
# one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
e2e-test-short:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-short'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-short'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 4
          "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"
        - "num_gpus": 4
          "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-fsdp:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-fsdp'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 4
          "test_file": "test_qwen3_4B_fsdp_true_on_policy.py --colocated"
        - "num_gpus": 8
          "test_file": "test_qwen3_vl_4B_fsdp.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_0.6B_fsdp_distributed.py"
        - "num_gpus": 4
          "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-megatron:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-megatron'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script under
      # tests/ to launch. Some entries also set use_deepep / use_fp8_rollout /
      # enable_eval, which feed the SLIME_TEST_* variables in `env`.
      info:
        - "num_gpus": 8
          "test_file": "test_quick_start_glm4_9B.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_30B_A3B.py"
          "use_deepep": "1"
          "use_fp8_rollout": "1"
        - "enable_eval": "0"
          "num_gpus": 8
          "test_file": "test_qwen3_30B_A3B_r3.py"
          "use_deepep": "1"
          "use_fp8_rollout": "1"
        - "enable_eval": "0"
          "num_gpus": 8
          "test_file": "test_qwen3_30B_A3B_r3.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ppo.py"
        - "num_gpus": 8
          "test_file": "test_moonlight_16B_A3B.py"
        - "enable_eval": "0"
          "num_gpus": 8
          "test_file": "test_moonlight_16B_A3B_r3.py"
        - "num_gpus": 8
          "test_file": "test_mimo_7B_mtp_only_grad.py"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-precision:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-precision'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 8
          "test_file": "test_qwen3_0.6B_parallel_check.py"
        - "num_gpus": 4
          "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-ckpt:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-ckpt'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ckpt.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ckpt.py --async-save"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-long:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-long'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 2
          "test_file": "test_qwen2.5_0.5B_gsm8k.py"
        - "num_gpus": 2
          "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_0.6B_fsdp_distributed.py"
  container:
    image: slimerl/slime:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
e2e-test-image:
  # Runs on manual dispatch, or on a pull request labelled 'run-ci-image'.
  if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image'))
  runs-on: self-hosted
  strategy:
    # A failing matrix entry does not cancel the other entries.
    fail-fast: false
    matrix:
      # Each entry: GPU count passed to gpu_lock_exec.py, and the script
      # (with any arguments) under tests/ to launch.
      info:
        - "num_gpus": 4
          "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"
        - "num_gpus": 4
          "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_vl_4B_fsdp.py"
        - "num_gpus": 2
          "test_file": "test_qwen3_0.6B_fsdp_distributed.py"
        - "num_gpus": 8
          "test_file": "test_quick_start_glm4_9B.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_30B_A3B.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ppo.py"
        - "num_gpus": 8
          "test_file": "test_moonlight_16B_A3B.py"
        - "num_gpus": 8
          "test_file": "test_mimo_7B_mtp_only_grad.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_0.6B_parallel_check.py"
        - "num_gpus": 4
          "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ckpt.py"
        - "num_gpus": 8
          "test_file": "test_qwen3_4B_ckpt.py --async-save"
        - "num_gpus": 2
          "test_file": "test_qwen2.5_0.5B_gsm8k.py"
        - "num_gpus": 2
          "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"
  container:
    # Note: this job uses the slime-test image, unlike the other jobs in this file.
    image: slimerl/slime-test:latest
    # Docker options: GPU/IPC/memory settings, proxy variables forwarded into
    # the container, and bind mounts for CI data, models and datasets.
    options: >
      --gpus all
      --ipc=host
      --shm-size=16g
      --ulimit memlock=-1
      --ulimit stack=67108864
      --memory=0
      --memory-swap=0
      -e http_proxy=$http_proxy
      -e https_proxy=$https_proxy
      -e HTTP_PROXY=$HTTP_PROXY
      -e HTTPS_PROXY=$HTTPS_PROXY
      -v /mnt/nvme0n1/slime_ci:/data/slime_ci
      -v /mnt/nvme0n1/slime_ci/models:/root/models
      -v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
  defaults:
    run:
      working-directory: ${{ github.workspace }}
  env:
    GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # 'false' unless this is a manual dispatch that supplied `infinite_run`.
    SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
    # Optional per-entry matrix keys; the literal after `||` applies when an
    # entry does not define the key.
    SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
    SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
    SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Install
      shell: bash
      run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages

    - name: Execute
      shell: bash
      run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}