
Commit aaf4266

jveronvialard, odelalleau, and terrykong authored
feat: adding support for Bradley-Terry reward model training (NVIDIA-NeMo#609)
Signed-off-by: Julien Veron Vialard <jveronvialar@nvidia.com>
Signed-off-by: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Signed-off-by: Julien Veron Vialard <50602890+jveronvialard@users.noreply.github.com>
Co-authored-by: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Co-authored-by: Terry Kong <terrycurtiskong@gmail.com>
1 parent 1f6aa5d commit aaf4266

File tree: 14 files changed, +1479 −50 lines changed


.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -205,6 +205,7 @@ jobs:
 time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
 time uv run --no-sync bash ./tests/functional/grpo_non_colocated.sh
 time uv run --no-sync bash ./tests/functional/dpo.sh
+time uv run --no-sync bash ./tests/functional/rm.sh
 time uv run --no-sync bash ./tests/functional/eval.sh
 time uv run --no-sync bash ./tests/functional/eval_async.sh
 time uv run --no-sync bash ./tests/functional/test_mcore_extra_installed_correctly.sh
```

README.md

Lines changed: 47 additions & 1 deletion
````diff
@@ -27,6 +27,9 @@
 - [DPO](#dpo)
   - [DPO Single Node](#dpo-single-node)
   - [DPO Multi-node](#dpo-multi-node)
+- [RM](#rm)
+  - [RM Single Node](#rm-single-node)
+  - [RM Multi-node](#rm-multi-node)
 - [Evaluation](#evaluation)
   - [Convert Model Format (Optional)](#convert-model-format-optional)
   - [Run Evaluation](#run-evaluation)
@@ -338,7 +341,50 @@ For distributed DPO training across multiple nodes, modify the following script
 NUM_ACTOR_NODES=2
 
 COMMAND="uv run ./examples/run_dpo.py --config examples/configs/dpo.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 dpo.val_global_batch_size=32 checkpointing.checkpoint_dir='results/dpo_llama81_2nodes' logger.wandb_enabled=True logger.wandb.name='dpo-llama1b'" \
-RAY_DEDUP_LOGS=0 \
+CONTAINER=YOUR_CONTAINER \
+MOUNTS="$PWD:$PWD" \
+sbatch \
+    --nodes=${NUM_ACTOR_NODES} \
+    --account=YOUR_ACCOUNT \
+    --job-name=YOUR_JOBNAME \
+    --partition=YOUR_PARTITION \
+    --time=4:0:0 \
+    --gres=gpu:8 \
+    ray.sub
+```
+
+## RM
+
+We provide a sample RM experiment that uses the [HelpSteer3 dataset](https://huggingface.co/datasets/nvidia/HelpSteer3) for preference-based training.
+
+### RM Single Node
+
+The default RM experiment is configured to run on a single GPU. To launch the experiment:
+
+```sh
+uv run python examples/run_rm.py
+```
+
+This trains an RM based on `meta-llama/Llama-3.2-1B-Instruct` on one GPU.
+
+If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, update the cluster configuration:
+
+```sh
+uv run python examples/run_rm.py cluster.gpus_per_node=8
+```
+
+Refer to the [RM documentation](docs/guides/rm.md) for more information.
+
+### RM Multi-node
+
+For distributed RM training across multiple nodes, modify the following script for your use case:
+
+```sh
+# Run from the root of NeMo RL repo
+## number of nodes to use for your job
+NUM_ACTOR_NODES=2
+
+COMMAND="uv run ./examples/run_rm.py --config examples/configs/rm.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/rm_llama1b_2nodes' logger.wandb_enabled=True logger.wandb.name='rm-llama1b-2nodes'" \
 CONTAINER=YOUR_CONTAINER \
 MOUNTS="$PWD:$PWD" \
 sbatch \
````
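The RM experiment above trains on HelpSteer3 preference annotations. Conceptually, each annotated pair must be reduced to a (chosen, rejected) ordering before Bradley-Terry training. A minimal sketch of that step follows; the field names (`score`, `response_a`, `response_b`) are illustrative placeholders, not the actual HelpSteer3 schema, and this is not NeMo RL's preprocessing code.

```python
def to_preference_pair(record):
    """Order one preference-annotated record into a (chosen, rejected) pair.

    Hypothetical schema: `score` > 0 means the annotator preferred
    `response_a`, `score` < 0 means they preferred `response_b`, and a
    tie (score == 0) carries no pairwise training signal, so we skip it.
    """
    if record["score"] == 0:
        return None  # ties contribute nothing to a Bradley-Terry loss
    if record["score"] > 0:
        return record["response_a"], record["response_b"]
    return record["response_b"], record["response_a"]
```

A pipeline would typically apply this per example and drop the `None` ties before batching chosen/rejected sequences.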

docs/guides/rm.md

Lines changed: 24 additions & 0 deletions
````markdown
# Reward Model Training in NeMo RL

This document explains how to train reward models (RMs) within NeMo RL. Currently, only Bradley-Terry reward models are supported, and only on the DTensor backend. Megatron backend support is tracked [here](https://github.com/NVIDIA-NeMo/RL/issues/720).

## Launch a Training Job

The script [examples/run_rm.py](../../examples/run_rm.py) is used to train a Bradley-Terry reward model. This script can be launched either locally or via Slurm. For details on how to set up Ray and launch a job using Slurm, refer to the [cluster documentation](../cluster.md).

Be sure to launch the job using `uv`. The command to launch a training job is as follows:

```bash
uv run examples/run_rm.py

# Can also add overrides on the CLI, like changing the config or the model
uv run examples/run_rm.py --config examples/configs/rm.yaml policy.model_name=Qwen/Qwen2.5-1.5B
```

The default YAML config shares the same base template as the SFT config but includes a new `reward_model_cfg` section with `enabled: true` to load the model as a reward model. You can find an example RM config file at [examples/configs/rm.yaml](../../examples/configs/rm.yaml).

**Reminder**: Set your `HF_HOME`, `WANDB_API_KEY`, and `HF_DATASETS_CACHE` (if needed). Make sure to log in using `huggingface-cli` if you're working with Llama models.

## Datasets

By default, NeMo RL supports the `HelpSteer3` dataset. This dataset is downloaded from Hugging Face and preprocessed on the fly, so there's no need to provide a path to any datasets on disk.
````
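The guide above refers to Bradley-Terry reward models without stating the objective. As background: a Bradley-Terry RM models P(chosen ≻ rejected) = sigmoid(r_chosen − r_rejected), so each preference pair contributes a loss of −log sigmoid(r_chosen − r_rejected). A framework-free sketch of that loss (for intuition only; this is not NeMo RL's actual implementation, and the function name is illustrative):

```python
import math

def bradley_terry_loss(chosen_rewards, rejected_rewards):
    """Mean negative log-likelihood of the Bradley-Terry preference model.

    For each pair, P(chosen > rejected) = sigmoid(r_c - r_r), so the
    per-pair loss is -log(sigmoid(r_c - r_r)) = log(1 + exp(-(r_c - r_r))).
    """
    losses = []
    for r_c, r_r in zip(chosen_rewards, rejected_rewards):
        x = r_c - r_r
        # Numerically stable -log(sigmoid(x)) for either sign of x
        loss = math.log1p(math.exp(-x)) if x >= 0 else -x + math.log1p(math.exp(x))
        losses.append(loss)
    return sum(losses) / len(losses)
```

With equal rewards the loss is log 2, and it shrinks as the chosen reward pulls ahead of the rejected one, which is the gradient signal that teaches the model to rank preferred responses higher.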

docs/index.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@ guides/sft.md
 guides/dpo.md
 guides/grpo.md
 guides/grpo-deepscaler.md
+guides/rm.md
 guides/eval.md
 guides/deepseek.md
 model-quirks.md
```

examples/configs/rm.yaml

Lines changed: 146 additions & 0 deletions
```yaml
# Bradley-Terry (BT) Reward Model Training Configuration
rm:
  ## total number of steps to train will equal
  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
  max_num_epochs: 1
  max_num_steps: -1 # by default, train for 1 epoch

  val_period: 16
  val_batches: -1
  val_global_batch_size: 32
  val_micro_batch_size: 1
  val_at_start: false
  seed: 42

checkpointing:
  enabled: true
  checkpoint_dir: "results/rm"
  metric_name: "val_loss"
  higher_is_better: false
  keep_top_k: 3
  save_period: ${rm.val_period}

policy:
  model_name: "meta-llama/Llama-3.2-1B-Instruct"
  tokenizer:
    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
    # We don't use the "default" chat template because the Llama tokenizer inserts the current
    # date in the system prompt, which could make the reward model's output date-dependent.
    chat_template: "{{- bos_token }}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = '' %}\n{%- endif %}\n\n{#- System message #}\n{{- '<|start_header_id|>system<|end_header_id|>\n\n' }}\n{{- system_message }}\n{{- '<|eot_id|>' }}\n\n{%- for message in messages %}\n    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}"
  train_global_batch_size: 128
  train_micro_batch_size: 1
  max_total_sequence_length: 8192
  precision: "bfloat16"
  fsdp_offload_enabled: false
  activation_checkpointing_enabled: false

  reward_model_cfg:
    enabled: true # loads model as a Reward Model (do not change)
    reward_model_type: "bradley_terry" # only "bradley_terry" is currently supported

  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null

  dynamic_batching:
    enabled: false

  sequence_packing:
    enabled: false

  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 2.0e-6
      weight_decay: 0.1
      betas: [0.9, 0.98]
      eps: 1e-5
      # when using DTensor, we need to set `foreach` and `fused` to false
      foreach: false
      fused: false

  ## ignored since enabled=false, but needed for testing purposes
  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 1
    activation_checkpointing: false
    tensor_model_parallel_size: 2
    pipeline_model_parallel_size: 2
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    sequence_parallel: false

    optimizer:
      optimizer: "adam"
      lr: 2.0e-6
      min_lr: 1.9999e-6
      weight_decay: 0.1
      bf16: false
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.98
      adam_eps: 1e-5

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      clip_grad: ${policy.max_grad_norm}

    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: null
      lr_warmup_iters: 50
      lr_warmup_init: 1.9999e-6

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: false
      average_in_collective: true
      data_parallel_sharding_strategy: "optim_grads_params"

data:
  max_input_seq_length: ${policy.max_total_sequence_length}
  dataset_name: "HelpSteer3"

logger:
  log_dir: "logs" # Base directory for all logs
  wandb_enabled: true # Make sure you do a `wandb login [Your API key]` before running
  tensorboard_enabled: true
  mlflow_enabled: false
  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "rm-dev"
    name: "rm-dev-${data.dataset_name}"
  tensorboard:
    log_dir: "tb_logs-rm-dev-${data.dataset_name}"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
  gpus_per_node: 1
  num_nodes: 1
```
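The config above relies on OmegaConf-style interpolations such as `${policy.model_name}` and `${rm.val_period}`, which resolve to the value found at the referenced dotted path when the config is loaded. To make that mechanism concrete, here is a toy resolver over plain dicts (a sketch for intuition only, not OmegaConf itself; it handles only whole-value `${...}` references, not nested or chained ones):

```python
import re

def resolve(cfg, root=None):
    """Resolve ${dotted.path} interpolations in a nested dict.

    Mimics, in miniature, how references like ${policy.model_name}
    in the YAML above are replaced by the value at that path.
    """
    root = cfg if root is None else root

    def lookup(path):
        node = root
        for key in path.split("."):
            node = node[key]  # walk the dotted path from the root
        return node

    out = {}
    for key, value in cfg.items():
        if isinstance(value, dict):
            out[key] = resolve(value, root)  # recurse, keeping the same root
        elif isinstance(value, str):
            match = re.fullmatch(r"\$\{([^}]+)\}", value)
            out[key] = lookup(match.group(1)) if match else value
        else:
            out[key] = value
    return out
```

For example, `checkpointing.save_period: ${rm.val_period}` resolves to the integer 16, so checkpoints are saved every validation period without repeating the number in two places.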
