
Commit bde1a68

mrm-196
authored
feat: Add recipe to reproduce Tulu-3 DPO model (#804)
1 parent eb50202 commit bde1a68

7 files changed: 181 additions, 9 deletions

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
defaults: "../../dpo.yaml"

cluster:
  num_nodes: 1
  gpus_per_node: 8

policy:
  model_name: "allenai/Llama-3.1-Tulu-3-8B-SFT"
  tokenizer:
    name: "allenai/Llama-3.1-Tulu-3-8B-SFT"
  train_micro_batch_size: 1
  train_global_batch_size: 128
  max_total_sequence_length: 2048
  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-7
      weight_decay: 0.0
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 1.0e-6
        end_factor: 1.0
        total_iters: 211
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 1.0
        end_factor: 0.0
        total_iters: 1899
    - milestones: [211]

data:
  dataset_name: "Tulu3Preference"

dpo:
  max_num_steps: 2110
  val_period: -1
  val_at_start: false
  preference_average_log_probs: True
  reference_policy_kl_penalty: 5
  val_micro_batch_size: ${policy.train_micro_batch_size}
  val_global_batch_size: ${policy.train_global_batch_size}

checkpointing:
  metric_name: null
  save_period: 250

logger:
  wandb_enabled: True
  wandb:
    name: "dpo-tulu3-8b"

examples/run_dpo.py

Lines changed: 16 additions & 9 deletions
@@ -176,13 +176,19 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig):
 
     if data_config["dataset_name"] == "HelpSteer3":
         data = hf_datasets.HelpSteer3Dataset()
+        train_dataset = data.formatted_ds["train"]
+        val_dataset = data.formatted_ds["validation"]
+    elif data_config["dataset_name"] == "Tulu3Preference":
+        data = hf_datasets.Tulu3PreferenceDataset()
+        train_dataset = data.formatted_ds["train"]
+        val_dataset = None
     else:
         data = hf_datasets.DPODataset(
             train_data_path=data_config["train_data_path"],
             val_data_path=data_config["val_data_path"],
         )
-    train_dataset = data.formatted_ds["train"]
-    val_dataset = data.formatted_ds["validation"]
+        train_dataset = data.formatted_ds["train"]
+        val_dataset = data.formatted_ds["validation"]
 
     dpo_task_spec = data.task_spec
 
@@ -195,13 +201,14 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig):
         max_seq_length=data_config["max_input_seq_length"],
     )
 
-    val_dataset = AllTaskProcessedDataset(
-        val_dataset,
-        tokenizer,
-        dpo_task_spec,
-        dpo_preprocessor,
-        max_seq_length=data_config["max_input_seq_length"],
-    )
+    if val_dataset:
+        val_dataset = AllTaskProcessedDataset(
+            val_dataset,
+            tokenizer,
+            dpo_task_spec,
+            dpo_preprocessor,
+            max_seq_length=data_config["max_input_seq_length"],
+        )
 
     return train_dataset, val_dataset, tokenizer, dpo_task_spec
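
Because the Tulu 3 preference mixture provides only a train split, val_dataset stays None for this dataset, and the recipe config above sets dpo.val_period: -1 and val_at_start: false, so no validation pass is requested. A minimal sketch of that pattern (an illustration, not NeMo RL's actual scheduling code):

# Hedged sketch: validation only runs when a split exists and a positive
# period is configured; with val_dataset=None and val_period=-1 it never fires.
def should_validate(step: int, val_period: int, val_dataset) -> bool:
    return val_dataset is not None and val_period > 0 and step % val_period == 0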

nemo_rl/data/hf_datasets/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
     PromptResponseDataset,
 )
 from nemo_rl.data.hf_datasets.squad import SquadDataset
+from nemo_rl.data.hf_datasets.tulu3 import Tulu3PreferenceDataset
 
 __all__ = [
     "DPODataset",
@@ -32,6 +33,7 @@
     "OpenMathInstruct2Dataset",
     "PromptResponseDataset",
     "SquadDataset",
+    "Tulu3PreferenceDataset",
     "COMMON_CHAT_TEMPLATES",
     "CLEVRCoGenTDataset",
 ]

nemo_rl/data/hf_datasets/tulu3.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from typing import Any

from datasets import load_dataset

from nemo_rl.data.interfaces import TaskDataSpec


def format_tulu3_preference(data: dict[str, Any]) -> dict[str, str | dict[str, str]]:
    chosen_conversation = data["chosen"]
    rejected_conversation = data["rejected"]

    context = chosen_conversation[:-1]

    # We assume that, apart from the final assistant response, the chosen and
    # rejected conversations share an identical message history. Validate this.
    assert json.dumps(context, ensure_ascii=False) == json.dumps(
        rejected_conversation[:-1], ensure_ascii=False
    ), (
        f"Context mismatch.\n\nchosen: {chosen_conversation}\n\nrejected: {rejected_conversation}"
    )

    # We assume that the last message is always from the assistant. Validate this.
    assert chosen_conversation[-1]["role"] == "assistant", (
        f"The last chosen response ({chosen_conversation[-1]}) is not from the assistant!"
    )
    assert rejected_conversation[-1]["role"] == "assistant", (
        f"The last rejected response ({rejected_conversation[-1]}) is not from the assistant!"
    )

    chosen_response = chosen_conversation[-1]["content"]
    rejected_response = rejected_conversation[-1]["content"]

    return {
        "prompt": context,
        "chosen_response": chosen_response,
        "rejected_response": rejected_response,
    }


class Tulu3PreferenceDataset:
    """Tulu3 preference dataset for DPO training."""

    def __init__(self) -> None:
        ds = load_dataset(
            path="allenai/llama-3.1-tulu-3-8b-preference-mixture",
            trust_remote_code=True,
        )
        self.formatted_ds = ds.map(format_tulu3_preference)

        self.task_spec = TaskDataSpec(
            task_name="Tulu3Preference",
        )
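
For reference, a small usage sketch of the new dataset class, assuming the Hugging Face download succeeds and the mixture exposes a "train" split (as run_dpo.py above expects):

# Hedged usage sketch of Tulu3PreferenceDataset; field names mirror the
# output of format_tulu3_preference above.
from nemo_rl.data.hf_datasets import Tulu3PreferenceDataset

ds = Tulu3PreferenceDataset()
example = ds.formatted_ds["train"][0]
print(example["prompt"])             # shared conversation context (list of messages)
print(example["chosen_response"])    # preferred assistant reply
print(example["rejected_response"])  # dispreferred assistant reply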

pyrefly.toml

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ project-includes = [
     "nemo_rl/data/hf_datasets/openmathinstruct2.py",
     "nemo_rl/data/hf_datasets/prompt_response_dataset.py",
     "nemo_rl/data/hf_datasets/squad.py",
+    "nemo_rl/data/hf_datasets/tulu3.py",
     "nemo_rl/data/multimodal_utils.py",
     "nemo_rl/data/interfaces.py",
     "nemo_rl/data/packing/__init__.py",
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=1
STEPS_PER_RUN=150
MAX_STEPS=150
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=45
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_dpo.py \
    --config $CONFIG_PATH \
    dpo.max_num_steps=$MAX_STEPS \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir=$CKPT_DIR \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run metric checks if the target step was reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'data["train/sft_loss"]["1"] < 0.00001' \
        'data["train/sft_loss"]["150"] < 0.00001' \
        'data["train/preference_loss"]["1"] > 0.6930' \
        'data["train/preference_loss"]["1"] < 0.6932' \
        'data["train/preference_loss"]["150"] < 0.68'
fi
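
The jq expression above pulls the highest step logged for train/loss and gates the metric checks on it. A rough Python equivalent of that gating logic (assuming, as the script implies, that the JSON file maps metric names to {step: value} dictionaries; "metrics.json" stands in for $JSON_METRICS):

# Hedged sketch of the gating logic above, in Python instead of jq/bash.
import json

with open("metrics.json") as f:  # stand-in for $JSON_METRICS
    data = json.load(f)

max_logged_step = max(int(step) for step in data["train/loss"])
if max_logged_step >= 150:  # MAX_STEPS in the script
    assert data["train/sft_loss"]["1"] < 0.00001
    assert data["train/sft_loss"]["150"] < 0.00001
    assert 0.6930 < data["train/preference_loss"]["1"] < 0.6932
    assert data["train/preference_loss"]["150"] < 0.68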

tests/test_suites/release.txt

Lines changed: 1 addition & 0 deletions
@@ -25,3 +25,4 @@ tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh
 # Long 8b convergence
 tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1.v2.sh
 tests/test_suites/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.sh
+tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh
