Skip to content

Commit 9d6c7e7

Browse files
committed
init commit
0 parents  commit 9d6c7e7

File tree

503 files changed

+64866
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

503 files changed

+64866
-0
lines changed

.github/dependabot.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
## Enabled the dependabot to check the dependencies of the project
## Dependabot will open pull requests to update dependencies automatically

version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: weekly

.github/workflows/checkpoints.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
name: checkpoints

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/checkpoints.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - "verl/trainer/config/*.yaml"
      - .github/workflows/checkpoints.yml
      - "tests/e2e/*.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  e2e_gsm8k_megatron:
    runs-on: [self-hosted, l20-0]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install -e .[test]
      - name: Prepare gsm8k dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py
      - name: Running Checkpoint Integration Test (Qwen Megatron)
        run: |
          ray stop --force
          export PYTHONPATH=$PYTHONPATH:/opt/nvidia/Megatron-LM
          bash tests/checkpoint/run_qwen_megatron_ckpt.sh
      - name: Running Checkpoint Integration Test (Deepseek Megatron)
        run: |
          ray stop --force
          export PYTHONPATH=$PYTHONPATH:/opt/nvidia/Megatron-LM
          bash tests/checkpoint/run_deepseek_megatron_ckpt.sh

.github/workflows/dataset.yml

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
name: dataset

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/dataset.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/dataset.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  ray:
    runs-on: [self-hosted, l20-1]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip install hf_transfer
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
          pip install cupy-cuda12x
      - name: Running dataset tests
        run: |
          [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
          pytest -s -x tests/verl/utils/dataset/test_rl_dataset.py
          pytest -s -x tests/verl/utils/dataset/test_sft_dataset.py
          # pytest -s -x tests/verl/utils/dataset/test_rm_dataset.py
      - name: Running ray test using cupy (move it to L20 when dockerfile ready)
        run: |
          cd tests/ray
          pytest -s -x test_rvdz.py

.github/workflows/e2e_ascend.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
name: e2e_ascend

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_ascend.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_ascend.yml

permissions:
  contents: read

jobs:
  test:
    name: verl Ascend test (self-host)
    runs-on: [self-hosted, npu-0]
    timeout-minutes: 5 # Increase this timeout value as needed
    env:
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
        - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
        # Use self-host cache speed up pip and model download
        # - /home/action/actions-runner/_work/cache:/github/home/.cache/
      options: >-
        --device /dev/davinci0
        --device /dev/davinci_manager
        --device /dev/devmm_svm
        --device /dev/hisi_hdc
        --privileged
        --network "host"
    steps:
      - name: Check npu and CANN info
        run: |
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
          npu-smi info
      - name: Checkout volcengine/verl repo
        uses: actions/checkout@v4
      - name: Run test
        run: |
          lscpu
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
name: e2e_digit_completion

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_digit_completion.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - "verl/trainer/config/*.yaml"
      - .github/workflows/e2e_digit_completion.yml
      - "tests/e2e/*.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  e2e_digit_completion:
    runs-on: [self-hosted, l20-0]
    timeout-minutes: 20 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install -e .[test]
      # Fixed typo in step name: "completon" -> "completion"
      - name: Running digit completion e2e training tests on 8 L20 GPUs
        run: |
          ray stop --force
          bash tests/e2e/run_ray_trainer.sh
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
name: e2e_digit_completion_fire

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/e2e_digit_completion_fire.yml
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/e2e_digit_completion_fire.yml
      - "tests/e2e/*.sh"

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  e2e_digit_completion:
    runs-on: [self-hosted, l20-0]
    timeout-minutes: 20 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install -e .[test]
      # Fixed typo in step name: "completon" -> "completion"
      - name: Running digit completion e2e training tests on 8 L20 GPUs
        run: |
          ray stop --force
          bash tests/e2e/run_ray_trainer_fire_sampling.sh

.github/workflows/e2e_grpo.yml

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
name: e2e_grpo

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_grpo.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - "verl/trainer/config/*.yaml"
      - .github/workflows/e2e_grpo.yml
      - "tests/e2e/*.sh"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  # NOTE(review): job id `e2e_gsm8k_megatron` appears copy-pasted from
  # checkpoints.yml and does not mention GRPO; renaming it could break
  # required-status-check settings, so it is left unchanged here.
  e2e_gsm8k_megatron:
    runs-on: [self-hosted, l20-0]
    timeout-minutes: 60 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
    container:
      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install -e .[test]
      - name: Prepare gsm8k dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GRPO gsm8k e2e training tests with FSDP on 8 L20 GPUs (Deepseek)
        run: |
          ray stop --force
          bash tests/e2e/run_deepseek_grpo.sh
      - name: Running GRPO gsm8k e2e training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
        run: |
          ray stop --force
          bash tests/e2e/run_deepseek_grpo_megatron.sh
      - name: Running GRPO gsm8k e2e training tests with FSDP on 8 L20 GPUs (Qwen)
        run: |
          ray stop --force
          bash tests/e2e/run_qwen_grpo.sh
      - name: Running GRPO gsm8k e2e training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
        run: |
          ray stop --force
          bash tests/e2e/run_qwen_grpo_megatron.sh

0 commit comments

Comments
 (0)