4 changes: 2 additions & 2 deletions .github/workflows/docker/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   trinity-node-1:
-    image: trinity-rft-unittest:20250924
+    image: trinity-rft-unittest:20251030
     pull_policy: never
     command: sh -c "pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block"
     environment:
@@ -29,7 +29,7 @@ services:
               capabilities: [gpu]

   trinity-node-2:
-    image: trinity-rft-unittest:20250924
+    image: trinity-rft-unittest:20251030
     pull_policy: never
     command: sh -c "pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block"
     environment:
9 changes: 9 additions & 0 deletions .github/workflows/unittest.yaml
@@ -97,6 +97,15 @@ jobs:
             fi
           fi

+      - name: Convert report.json time to ms
+        working-directory: trinity-${{ github.run_id }}
+        if: env.tests_run == 'true' || failure()
+        run: |
+          REPORT=report.json
+          if [ -f "$REPORT" ]; then
+            jq '(.results.tests[] | .duration, .start, .stop) |= (. * 1000) | (.results.summary.start, .results.summary.stop) |= (. * 1000)' "$REPORT" > "$REPORT.tmp" && mv "$REPORT.tmp" "$REPORT"
+          fi
+
       - name: Clean checkpoint dir
         working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
         if: always()
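Note: the jq filter rescales every per-test duration/start/stop and the summary start/stop from seconds to milliseconds, in place. A minimal Python sketch of the same transform, on a hypothetical two-field report (real pytest JSON reports carry more fields):

    import json

    # Hypothetical minimal report.json payload.
    report = {
        "results": {
            "summary": {"start": 1.2, "stop": 3.4},
            "tests": [{"name": "t1", "duration": 0.5, "start": 1.2, "stop": 1.7}],
        }
    }

    # Seconds -> milliseconds, mirroring the jq filter above.
    for test in report["results"]["tests"]:
        for key in ("duration", "start", "stop"):
            test[key] *= 1000
    for key in ("start", "stop"):
        report["results"]["summary"][key] *= 1000

    print(json.dumps(report, indent=2))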
1 change: 0 additions & 1 deletion benchmark/config/countdown-template.yaml
@@ -54,7 +54,6 @@ explorer:
   rollout_model:
     engine_num: 2
     tensor_parallel_size: 1
-    use_v1: true
     enforce_eager: true
     enable_prefix_caching: false
     enable_chunked_prefill: false
1 change: 0 additions & 1 deletion benchmark/config/gsm8k-template.yaml
@@ -59,7 +59,6 @@ explorer:
   rollout_model:
     engine_num: 2
     tensor_parallel_size: 1
-    use_v1: true
     enforce_eager: false
     enable_prefix_caching: false
     enable_chunked_prefill: false
17 changes: 15 additions & 2 deletions docs/sphinx_doc/source/conf.py
@@ -5,6 +5,8 @@

 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+import subprocess
+
 from trinity import __version__ as version

 project = "Trinity-RFT"
@@ -58,11 +60,22 @@
 apidoc_excluded_paths = ["tests", "build"]
 apidoc_separate_modules = True


 # Multiversion configs
-smv_tag_whitelist = r"^v\d+\.\d+\.\d+$"  # match v1.0.0 pattern
+def get_recent_tags(n: int) -> list:
+    """Retrieve the most recent n git tags."""
+    try:
+        tags = subprocess.check_output(
+            ["git", "tag", "--sort=-creatordate"], universal_newlines=True
+        ).splitlines()
+        return tags[:n]
+    except subprocess.CalledProcessError:
+        return []
+
+
+smv_tag_whitelist = r"^(" + "|".join(get_recent_tags(4)) + r")$"
 smv_branch_whitelist = r"^(main)$"  # included branches
 smv_remote_whitelist = None
 smv_released_pattern = r"^tags/.*$"

 smv_prefer_remote_refs = False
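Note: the whitelist is now built from concrete tag names rather than a version pattern. A quick sanity check with hypothetical tags (the real list comes from `git tag --sort=-creatordate`; tag names are joined into the regex unescaped, so a literal `.` in a tag also matches any character):

    import re

    # Hypothetical four most recent tags.
    tags = ["v0.3.1", "v0.3.0", "v0.2.1", "v0.2.0"]
    smv_tag_whitelist = r"^(" + "|".join(tags) + r")$"  # ^(v0.3.1|v0.3.0|v0.2.1|v0.2.0)$

    assert re.match(smv_tag_whitelist, "v0.3.0")
    assert not re.match(smv_tag_whitelist, "v0.1.0")

If get_recent_tags() returns an empty list (e.g., a checkout without tags), the pattern degenerates to ^()$, which matches no tag name.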
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -23,7 +23,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "verl==0.5.0",
     "ray[default]>=2.48.0",
-    "vllm>=0.9.1,<=0.10.2",
+    "vllm>=0.9.1,<=0.11.0",
     "tensordict",
     "wandb",
     "omegaconf",
@@ -73,7 +73,7 @@ dev = [
 ]
 megatron = [
     "megatron-core[mlm]==0.13.1",
-    "transformer_engine[pytorch]==2.6.0.post1",
+    "transformer_engine[pytorch]==2.8.0",
     "mbridge>=0.13.0",
 ]
8 changes: 4 additions & 4 deletions scripts/docker/Dockerfile
@@ -5,7 +5,7 @@
 # docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft:latest


-FROM nvcr.io/nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04

 WORKDIR /workspace

@@ -20,13 +20,13 @@ RUN apt update && apt install -y \


 # For Aliyun users: update pip mirror to aliyun to speed up pip install
-RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
+#     && pip config set install.trusted-host mirrors.cloud.aliyuncs.com

 # copy the Trinity-RFT dir into the workspace
 COPY . .

-RUN pip install --upgrade pip && pip install -e .[dev] && pip install flash-attn
+RUN pip install --upgrade pip && pip install -e .[dev] && pip install flash_attn==2.8.1 --no-build-isolation

 # Set Env variables
17 changes: 9 additions & 8 deletions scripts/docker_for_megatron/Dockerfile
@@ -5,13 +5,10 @@
 # docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft-megatron:latest


-FROM nvcr.io/nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04

 WORKDIR /workspace

-# copy the Trinity-RFT dir into the workspace
-COPY . .
-
 RUN apt update && apt install -y \
     build-essential \
     curl git wget vim tmux net-tools \
@@ -22,17 +19,21 @@ RUN apt update && apt install -y \
     && ln -sf /usr/bin/pip3 /usr/bin/pip

 # For Aliyun users: update pip mirror to aliyun to speed up pip install
-RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
+#     && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+
+# copy the Trinity-RFT dir into the workspace
+COPY . .

 # Install Trinity-RFT with Megatron
 RUN pip install --upgrade pip \
     && pip install -e .[dev] \
+    && pip install flash_attn==2.8.1 --no-build-isolation \
     && pip install -e .[megatron] \
-    && pip install flash-attn==2.8.1 \
     && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
     --config-settings "--build-option=--cpp_ext" \
     --config-settings "--build-option=--cuda_ext" \
-    --resume-retries 999 git+https://github.com/NVIDIA/apex.git
+    --resume-retries 20 git+https://github.com/NVIDIA/apex.git

 # Set Env variables
3 changes: 0 additions & 3 deletions tests/common/vllm_test.py
@@ -276,7 +276,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True

@@ -368,7 +367,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True

@@ -578,7 +576,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True
         # added for toolcalls
1 change: 0 additions & 1 deletion tests/template/config.yaml
@@ -42,7 +42,6 @@ explorer:
     enforce_eager: true
     dtype: bfloat16
     seed: 42
-    use_v1: true
 trainer:
   trainer_type: verl
   save_interval: 100
7 changes: 4 additions & 3 deletions trinity/common/models/api/vllm_patch.py
@@ -347,10 +347,10 @@ async def run_api_server_in_ray_actor(
     reasoning_parser: Optional[str] = None,
 ):
     vllm_version = get_vllm_version()
-    if vllm_version < parse_version("0.8.5") or vllm_version > parse_version("0.10.2"):
+    if vllm_version < parse_version("0.8.5") or vllm_version > parse_version("0.11.0"):
         raise ValueError(
             f"Unsupported vllm version: {vllm.__version__}. "
-            "This patch requires vllm version >= 0.8.5, <= 0.10.2."
+            "This patch requires vllm version >= 0.8.5, <= 0.11.0."
         )

     parser = FlexibleArgumentParser(description="Run the OpenAI API server.")
@@ -371,5 +371,6 @@ async def run_api_server_in_ray_actor(
     if reasoning_parser:
         cli_args.extend(["--reasoning-parser", reasoning_parser])
     args = parser.parse_args(cli_args)
-    print(args)
+    if vllm_version >= parse_version("0.11.0"):
+        args.structured_outputs_config.reasoning_parser = reasoning_parser
     await run_server_in_ray(args, async_llm)
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def __init__(
engine_args.enable_log_requests = False
else:
engine_args.disable_log_requests = True
if get_vllm_version() >= parse_version("0.11.0"):
engine_args.reasoning_parser = config.reasoning_parser
self.async_llm = vllm.AsyncLLMEngine.from_engine_args(engine_args)
self.processor = None
self.tokenizer = None
Expand All @@ -107,12 +109,7 @@ def __init__(

async def _initialize_tokenizer(self):
if self.tokenizer is None:
if self.enable_lora:
self.tokenizer = await self.async_llm.get_tokenizer(
lora_request=self.get_lora_request()
)
else:
self.tokenizer = await self.async_llm.get_tokenizer()
self.tokenizer = await self.async_llm.get_tokenizer()
self.tokenizer.truncation_side = "left"

def _initialize_processor(self):
Expand Down
2 changes: 1 addition & 1 deletion trinity/explorer/scheduler.py
@@ -52,7 +52,7 @@ def _create_runner(self):
         return (
             ray.remote(WorkflowRunner)
             .options(
-                num_cpus=1,
+                num_cpus=0,
                 namespace=self.namespace,
                 scheduling_strategy="SPREAD",
                 runtime_env={
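Note (an assumption about intent, not stated in the diff): Ray treats num_cpus as a logical scheduling reservation only, so num_cpus=0 stops each WorkflowRunner actor from holding a CPU slot, allowing more runners to be placed than the cluster has cores; the SPREAD strategy still distributes them across nodes. A minimal sketch with a hypothetical Probe actor:

    import ray

    ray.init()

    # With num_cpus=0, Ray reserves no CPU slot per actor, so the number of
    # actors is not capped by the cluster's logical CPU count.
    @ray.remote(num_cpus=0)
    class Probe:
        def ping(self) -> str:
            return "ok"

    probes = [Probe.options(scheduling_strategy="SPREAD").remote() for _ in range(64)]
    print(ray.get([p.ping.remote() for p in probes])[:3])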
6 changes: 3 additions & 3 deletions trinity/trainer/trainer.py
@@ -153,15 +153,15 @@ def need_save(self) -> bool:

     async def sync_weight(self) -> Dict:
         """Sync the model weight."""
-        self.logger.info(f"Trainer synchronizing weights at step {self.train_step_num} starting..")
+        self.logger.info(f"Trainer sync_weights at step {self.train_step_num} started.")
         metrics = {}
         with Timer(metrics, "time/sync_weight"):
             if self.config.synchronizer.sync_method == SyncMethod.NCCL:
                 result = await self.synchronizer.ready_to_nccl_sync.remote(
                     "trainer", self.train_step_num
                 )
                 if result is None:
-                    self.logger.error("Trainer synchronizing weights failed.")
+                    self.logger.error("Trainer sync_weights failed.")
                 else:
                     self.engine.sync_weight()
                     self.last_trainer_sync_step = self.train_step_num
@@ -171,7 +171,7 @@ async def sync_weight(self) -> Dict:
             self.engine.upload_state_dict()
         self.last_sync_step = self.train_step_num
         await self.synchronizer.set_trainer_status.remote(RunningStatus.RUNNING)
-        self.logger.info(f"Trainer synchronizing weights at step {self.train_step_num} end.")
+        self.logger.info(f"Trainer sync_weights at step {self.train_step_num} finished.")
         return metrics

     def _log_experiences(self, samples: List[Dict]) -> None: