Skip to content

Commit a60ade4

Browse files
authored
Merge branch 'main' into main_tq_submodule
2 parents 9013472 + 4da0d3d commit a60ade4

File tree

124 files changed

+11204
-324
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+11204
-324
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# # Tests layout
2+
3+
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
4+
# - `tests/trainer` for testing functionality related to `verl/trainer`
5+
# - `tests/models` for testing functionality related to `verl/models`
6+
# - ...
7+
8+
# There are a few folders with `special_` prefix, created for special purposes:
9+
# - `special_distributed`: unit tests that must run with multiple GPUs
10+
# - `special_e2e`: end-to-end tests with training/generation scripts
11+
# - `special_npu`: tests for NPUs
12+
# - `special_sanity`: a suite of quick sanity tests
13+
# - `special_standalone`: a set of test that are designed to run in dedicated environments
14+
15+
# Accelerators for tests
16+
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
17+
# - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment.
18+
19+
# # Workflow layout
20+
21+
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
22+
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `check-pr-title,yml`, `pre-commit.yml`, `doc.yml`
23+
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
24+
# 3. End-to-end tests: `e2e_*.yml`
25+
# 4. Unit tests
26+
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
27+
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
28+
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
29+
# - new workflow yaml is added to `.github/workflows`
30+
# - new tests are added to workflow mentioned in 2.
31+
32+
33+
name: e2e_fully_async_policy
34+
35+
on:
36+
# Trigger the workflow on push or pull request,
37+
# but only for the main branch
38+
# For push, for now only anti-patterns are specified so it is more conservative
39+
# and achieves higher coverage.
40+
push:
41+
branches:
42+
- main
43+
- v0.*
44+
paths:
45+
- "**/*.py"
46+
- "!**/*.md"
47+
- "!**/*.sh"
48+
# Other entrypoints
49+
- "!examples/*trainer*"
50+
- "!tests/**"
51+
- "!verl/trainer/main_*.py"
52+
- "!verl/trainer/fsdp_sft_trainer.py"
53+
- "!recipe/**"
54+
- "recipe/fully_async_policy"
55+
pull_request:
56+
branches:
57+
- main
58+
- v0.*
59+
paths:
60+
- "**/*.py"
61+
- "!**/*.md"
62+
- "!**/*.sh"
63+
# Other entrypoints
64+
- "!examples/**"
65+
- "!tests/**"
66+
- "!verl/trainer/main_*.py"
67+
- "!verl/trainer/fsdp_sft_trainer.py"
68+
# Other recipes
69+
- "!recipe/**"
70+
# Home
71+
- "recipe/fully_async_policy"
72+
# Entrypoints
73+
- ".github/workflows/e2e_fully_async_policy.yml"
74+
- "examples/data_preprocess/gsm8k.py"
75+
- "tests/special_e2e/run_fully_async_policy.sh"
76+
77+
# Cancel jobs on the same ref if a new one is triggered
78+
concurrency:
79+
group: ${{ github.workflow }}-${{ github.ref }}
80+
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
81+
82+
# Declare permissions just read content.
83+
permissions:
84+
contents: read
85+
86+
env:
87+
IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
88+
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
89+
TRANSFORMERS_VERSION: "4.56.2"
90+
91+
jobs:
92+
setup:
93+
if: github.repository_owner == 'volcengine'
94+
runs-on: ubuntu-latest
95+
outputs:
96+
runner-label: ${{ steps.create-runner.outputs.runner-label }}
97+
mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
98+
steps:
99+
- uses: actions/checkout@v4
100+
- id: create-runner
101+
uses: volcengine/vemlp-github-runner@v1
102+
with:
103+
mode: "create"
104+
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
105+
mlp-image: "${{ env.IMAGE }}"
106+
107+
# Test FSDP2 strategy
108+
e2e_fully_async_policy_fsdp2:
109+
needs: setup
110+
runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
111+
timeout-minutes: 10 # Increase timeout for async training
112+
env:
113+
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
114+
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
115+
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
116+
HF_ENDPOINT: "https://hf-mirror.com"
117+
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
118+
ACTOR_STRATEGY: "fsdp2"
119+
steps:
120+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
121+
with:
122+
fetch-depth: 0
123+
- name: Install the current repository
124+
run: |
125+
pip3 install --no-deps -e .[test,gpu]
126+
pip3 install transformers==$TRANSFORMERS_VERSION
127+
- name: Prepare GSM8K dataset
128+
run: |
129+
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
130+
- name: Running the E2E test with fully_async_policy algorithm (FSDP2)
131+
run: |
132+
ray stop --force
133+
bash tests/special_e2e/run_fully_async_policy.sh
134+
135+
cleanup:
136+
runs-on: ubuntu-latest
137+
needs:
138+
[
139+
setup,
140+
e2e_fully_async_policy_fsdp2
141+
]
142+
if: always()
143+
steps:
144+
- id: destroy-runner
145+
uses: volcengine/vemlp-github-runner@v1
146+
with:
147+
mode: "destroy"
148+
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
149+
mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"

.github/workflows/e2e_sft.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ jobs:
9191
e2e_sft:
9292
needs: setup
9393
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
94-
timeout-minutes: 25 # Increase this timeout value as needed
94+
timeout-minutes: 30 # Increase this timeout value as needed
9595
env:
9696
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
9797
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}

.github/workflows/model.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ jobs:
208208
209209
- name: Running mcore engine tests on 8 L20 GPUs
210210
run: |
211+
ray stop --force
211212
pytest -s -x tests/models/test_engine.py
212213
213214
cleanup:

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The
238238
- [Vision-SR1](https://github.com/zli12321/Vision-SR1): Self-Rewarding Vision-Language Model via Reasoning Decomposition ![GitHub Repo stars](https://img.shields.io/github/stars/zli12321/Vision-SR1)
239239
- [SimpleVLA-RL](https://github.com/PRIME-RL/SimpleVLA-RL): SimpleVLA-RL: A Simple yet Effective Vision-Language Action Model for Reinforcement Learning ![GitHub Repo stars](https://img.shields.io/github/stars/PRIME-RL/SimpleVLA-RL)
240240
- [Table-R1](https://github.com/Table-R1/Table-R1): Table-R1: Inference-Time Scaling for Table Reasoning ![GitHub Repo stars](https://img.shields.io/github/stars/Table-R1/Table-R1)
241+
- [Revisual-R1](https://github.com/CSfufu/Revisual-R1): Revisual-R1: Advancing Multimodal Reasoning From Optimized Cold Start to Staged Reinforcement Learning ![GitHub Repo stars](https://img.shields.io/github/stars/CSfufu/Revisual-R1)
242+
- [ARES](https://github.com/shawn0728/ARES): ARES: Multimodal Adaptive Reasoning via Difficulty-Aware Token-Level Entropy Shaping ![GitHub Repo stars](https://img.shields.io/github/stars/shawn0728/ARES)
243+
- [Meta-Bandit-LLM](https://github.com/sanxing-chen/meta-bandit-llm): Meta-Bandit-LLM: Long-horizon multiturn interactive training for meta-bandit agents ![GitHub Repo stars](https://img.shields.io/github/stars/sanxing-chen/meta-bandit-llm)
241244

242245
and many more awesome work listed in [recipe](recipe/README.md).
243246

docker/Dockerfile.rocm7

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# default base image
22
ARG REMOTE_VLLM="1"
33
ARG COMMON_WORKDIR=/app
4-
ARG BASE_IMAGE=rocm/vllm-dev:base_rocm7_0930_rc1_20250916_tuned_20250917
4+
ARG BASE_IMAGE=rocm/vllm-dev:base
55

66
FROM ${BASE_IMAGE} AS base
77

@@ -104,7 +104,7 @@ ARG COMMON_WORKDIR
104104
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
105105
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
106106

107-
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
107+
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
108108
ENV TOKENIZERS_PARALLELISM=false
109109

110110
# ENV that can improve safe tensor loading, and end-to-end time
@@ -115,6 +115,8 @@ ENV HIP_FORCE_DEV_KERNARG=1
115115

116116
# -----------------------
117117
# Install verl
118+
ARG VERL_REPO=https://github.com/volcengine/verl.git
119+
ARG VERL_BRANCH=main
118120
RUN pip install "tensordict==0.6.2" --no-deps && \
119121
pip install accelerate \
120122
codetiming \
@@ -133,8 +135,9 @@ RUN pip install "tensordict==0.6.2" --no-deps && \
133135
pybind11
134136

135137
WORKDIR /workspace/
136-
RUN git clone https://github.com/volcengine/verl.git && \
138+
RUN git clone ${VERL_REPO} && \
137139
cd verl && \
140+
git checkout ${VERL_BRANCH} && \
138141
pip install -e .
139142

140143
CMD ["/bin/bash"]

docker/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ For vLLM with FSDP, please refer to [hiyouga/verl](https://hub.docker.com/r/hiyo
3636

3737
For SGLang with FSDP, please refer to [ocss884/verl-sglang](https://hub.docker.com/r/ocss884/verl-sglang) repository and the latest version is ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post5`` which is provided by SGLang RL Group.
3838

39+
For latest vLLM with Megatron, please refer to [iseekyan/verl](https://hub.docker.com/r/iseekyan/verl) repository and the latest version is ``iseekyan/verl:nemo.gptoss_vllm0.11.0``.
40+
3941
See files under ``docker/`` for NGC-based image or if you want to build your own.
4042

4143
Note that for AWS instances with an EFA network interface (SageMaker AI Pod), you need to install the EFA driver as shown in ``docker/Dockerfile.extenstion.awsefa``
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM nvcr.io/nvidia/nemo:25.07.gpt_oss
2+
3+
RUN git clone -b v0.11.0 --depth 1 https://github.com/vllm-project/vllm.git /opt/vllm
4+
5+
RUN pip install setuptools_scm
6+
7+
RUN cd /opt/vllm && pip install --no-deps --no-build-isolation --no-cache-dir -e .
8+
9+
RUN pip install cbor2 setproctitle blake3 openai_harmony pybase64 msgspec partial_json_parser py-cpuinfo diskcache gguf
10+
11+
RUN pip install --upgrade transformers tokenizers
12+
13+
RUN pip install codetiming tensordict mathruler pylatexenc
14+
15+
RUN pip3 install --no-cache-dir mbridge

0 commit comments

Comments
 (0)