-
Notifications
You must be signed in to change notification settings - Fork 328
Expand file tree
/
Copy pathrm.yaml
More file actions
175 lines (152 loc) · 6.35 KB
/
rm.yaml
File metadata and controls
175 lines (152 loc) · 6.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Bradley-Terry (BT) Reward Model Training Configuration
rm:
  ## total number of steps to train will equal
  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
  max_num_epochs: 1
  max_num_steps: -1 # by default, train for 1 epoch
  val_period: 16
  val_batches: -1
  val_global_batch_size: 32
  val_micro_batch_size: 1
  val_at_start: false
  seed: 42

checkpointing:
  enabled: true
  checkpoint_dir: "results/rm"
  metric_name: "val_loss"
  higher_is_better: false
  keep_top_k: 3
  save_period: ${rm.val_period}
  checkpoint_must_save_by: null

policy:
  model_name: "meta-llama/Llama-3.2-1B-Instruct"
  tokenizer:
    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
    # We don't use the "default" chat template because the Llama tokenizer inserts the current
    # date in the system prompt, which could make the reward model's output date-dependent.
    chat_template: "{{- bos_token }}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = '' %}\n{%- endif %}\n\n{#- System message #}\n{{- '<|start_header_id|>system<|end_header_id|>\n\n' }}\n{{- system_message }}\n{{- '<|eot_id|>' }}\n\n{%- for message in messages %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}"
  train_global_batch_size: 128
  train_micro_batch_size: 1
  max_total_sequence_length: 8192
  precision: "bfloat16"
  activation_checkpointing_enabled: false

  reward_model_cfg:
    enabled: true # loads model as a Reward Model (do not change)
    reward_model_type: "bradley_terry" # only "bradley_terry" is currently supported

  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null

  dynamic_batching:
    enabled: false

  sequence_packing:
    enabled: false

  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 2.0e-6
      weight_decay: 0.1
      betas: [0.9, 0.98]
      eps: 1.0e-5
      # when using Dtensor, we need to set `foreach` and `fused` to false
      foreach: false
      fused: false

  ## ignored since enabled=false, but needed for testing purposes
  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 1
    activation_checkpointing: false
    tensor_model_parallel_size: 2
    pipeline_model_parallel_size: 2
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    sequence_parallel: false

    optimizer:
      optimizer: "adam"
      lr: 2.0e-6
      min_lr: 1.9999e-6
      weight_decay: 0.1
      bf16: false
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.98
      adam_eps: 1.0e-5

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      clip_grad: ${policy.max_grad_norm}

    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 1000
      lr_warmup_iters: 50
      lr_warmup_init: 1.9999e-6

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: false
      average_in_collective: true
      data_parallel_sharding_strategy: "optim_grads_params"

data:
  max_input_seq_length: ${policy.max_total_sequence_length}
  shuffle: true
  dataset_name: HelpSteer3
  # You can use custom preference datasets for training and validation. For example:
  # 1. PreferenceDataset
  #    data:
  #      dataset_name: PreferenceDataset
  #      train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
  #      val_data_paths:
  #        <NameOfValidationDataset1>: <PathToValidationDataset1>
  #        ...
  #      train_split: <TrainSplit>, default is None # used for HuggingFace datasets
  #      val_split: <ValSplit>, default is None # used for HuggingFace datasets
  # 2. BinaryPreferenceDataset
  #    data:
  #      dataset_name: BinaryPreferenceDataset
  #      train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
  #      val_data_path: <PathToValidationDataset>
  #      prompt_key: <PromptKey>, default is "prompt"
  #      chosen_key: <ChosenKey>, default is "chosen"
  #      rejected_key: <RejectedKey>, default is "rejected"
  #      train_split: <TrainSplit>, default is None # used for HuggingFace datasets
  #      val_split: <ValSplit>, default is None # used for HuggingFace datasets
  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/rm.md#datasets for more details.
  # If you are doing checkpointing, `metric_name` should reflect the metric and validation set to be tracked. For example:
  #   checkpointing:
  #     metric_name: "validation-<NameOfValidationDataset1>_loss"
  #   ...

logger:
  log_dir: "logs" # Base directory for all logs
  wandb_enabled: true # Make sure you do a `wandb login [Your API key]` before running
  tensorboard_enabled: true
  mlflow_enabled: false
  swanlab_enabled: false # Disable SwanLab logging
  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "rm-dev"
    name: "rm-dev-${data.dataset_name}"
  tensorboard:
    log_dir: "tb_logs-rm-dev-${data.dataset_name}"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
  gpus_per_node: 1
  num_nodes: 1