NVIDIA-NeMo
diff --git a/‎3rdparty/Megatron-Bridge-workspace/Megatron-Bridge‎ b/‎3rdparty/Megatron-Bridge-workspace/Megatron-Bridge‎
diff --git a/‎3rdparty/Megatron-Bridge-workspace/setup.py‎
Lines changed: 1 addition & 1 deletion b/‎3rdparty/Megatron-Bridge-workspace/setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml‎
Lines changed: 0 additions & 9 deletions b/‎examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-megatrontp2.v1.yaml‎
Lines changed: 25 additions & 0 deletions b/‎examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-megatrontp2.v1.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎…-instruct-clevr-1n2g-dtensor2tp1.v1.yaml‎ ‎…-clevr-1n2g-dtensor2tp1.v1.yaml.disabled‎examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml renamed to examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml.disabled b/‎…-instruct-clevr-1n2g-dtensor2tp1.v1.yaml‎ ‎…-clevr-1n2g-dtensor2tp1.v1.yaml.disabled‎examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml renamed to examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml.disabled
diff --git a/‎examples/configs/vlm_grpo_3B.yaml‎
Lines changed: 68 additions & 0 deletions b/‎examples/configs/vlm_grpo_3B.yaml‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎examples/configs/vlm_grpo_3B_megatron.yaml‎
Lines changed: 200 additions & 0 deletions b/‎examples/configs/vlm_grpo_3B_megatron.yaml‎
Lines changed: 200 additions & 0 deletions
diff --git a/‎examples/run_vlm_grpo.py‎
Lines changed: 18 additions & 8 deletions b/‎examples/run_vlm_grpo.py‎
Lines changed: 18 additions & 8 deletions
@@ -33,7 +33,7 @@
     "packaging",
     "tensorboard>=2.19.0",
     "torch",
-    "transformers>=4.51.3",
+    "transformers>=4.55.0",
     "typing-extensions",
     "rich",
     "wandb>=0.19.10",
 
@@ -3,12 +3,3 @@ checkpointing:
   checkpoint_dir: results/clevr_grpo
 policy:
   max_total_sequence_length: 3072
-env:
-  refcoco:
-    reward_functions:
-    - name: format
-      weight: 0.1
-    - name: bbox_giou
-      weight: 0.9
-      kwargs:
-        giou_penalty_thres: 1.0
@@ -0,0 +1,25 @@
+defaults: ../../vlm_grpo_3B.yaml  # NOTE(review): assumed to be the base config that the keys below override - confirm loader semantics
+checkpointing:
+  checkpoint_dir: results/clevr_grpo
+policy:
+  max_total_sequence_length: 3072
+  dtensor_cfg:
+    enabled: false  # DTensor is switched off; megatron_cfg.enabled is set to true below
+  dynamic_batching:
+    enabled: false
+  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  optimizer: null  # NOTE(review): presumably unused because megatron_cfg.optimizer carries the optimizer settings - confirm
+  megatron_cfg:
+    enabled: true  # NOTE(review): recipe name says "megatrontp2" but tensor_model_parallel_size is not overridden here (base vlm_grpo_3B.yaml value is 1) - confirm
+    empty_unused_memory_level: 1
+    optimizer:
+      lr: 5.0e-07
+      min_lr: 5.0e-08
+    scheduler:
+      lr_warmup_iters: 50
+      lr_warmup_init: 5.0e-08
+    distributed_data_parallel_config:
+      overlap_grad_reduce: false  # base vlm_grpo_3B.yaml sets this to true
+logger:
+  wandb:
+    name: vlm-grpo-3b-megatron
@@ -58,6 +58,70 @@ policy:
     context_parallel_size: 1
     custom_parallel_plan: null
 
+  megatron_cfg:
+    enabled: false
+    empty_unused_memory_level: 0
+    activation_checkpointing: false
+    converter_type: "Qwen2ForCausalLM"
+    tensor_model_parallel_size: 1
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+    sequence_parallel: false
+    freeze_moe_router: true
+    moe_router_dtype: "fp64"
+    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
+    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
+    moe_permute_fusion: false
+    # gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: True
+    defer_fp32_logits: null
+
+    optimizer:
+      optimizer: "adam"
+      lr: 5.0e-6
+      min_lr: 5.0e-7
+      weight_decay: 0.01
+      bf16: true
+      fp16: false
+      params_dtype: "float32"
+
+      #adam
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1e-8
+
+      #sgd
+      sgd_momentum: 0.9
+
+      #distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      clip_grad: ${policy.max_grad_norm}
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "constant"
+      lr_decay_iters: 1000
+      lr_warmup_iters: 13
+      lr_warmup_init: 5.0e-7
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      average_in_collective: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+
   # dynamic_batching improves performance by ensuring logprob and training microbatches
   # have a sufficient number of tokens to maximize GPU utilization. Specifically, variable length
   # responses are sorted by sequence length and bucketed into microbatches with a total
@@ -76,6 +140,10 @@ policy:
 
   sequence_packing:
     enabled: False
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
 
   optimizer:
     name: "torch.optim.AdamW"
 
@@ -0,0 +1,200 @@
+grpo:
+  num_prompts_per_step: 8
+  num_generations_per_prompt: 16
+  max_rollout_turns: 1
+  max_num_epochs: 1
+  max_num_steps: 1000000
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10
+  val_at_start: false
+  overlong_filtering: false
+  max_val_samples: 256
+  val_batch_size: 256
+  seed: 42
+  async_grpo:
+    enabled: false
+    max_trajectory_age_steps: 1
+loss_fn:
+  reference_policy_kl_penalty: 0.01
+  ratio_clip_min: 0.2
+  ratio_clip_max: 0.2
+  ratio_clip_c: null
+  use_on_policy_kl_approximation: false
+  use_importance_sampling_correction: false
+  token_level_loss: true
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/clevr_grpo_${policy.model_name}
+  metric_name: val_reward
+  higher_is_better: true
+  keep_top_k: 3
+  save_period: 10
+  checkpoint_must_save_by: null
+policy:
+  model_name: Qwen/Qwen2.5-VL-3B-Instruct
+  tokenizer:
+    name: ${policy.model_name}
+  train_global_batch_size: 128
+  train_micro_batch_size: 1
+  generation_batch_size: 32
+  logprob_batch_size: 4
+  max_total_sequence_length: 2048
+  precision: bfloat16
+  dtensor_cfg:
+    _v2: true
+    enabled: false
+    cpu_offload: false
+    sequence_parallel: false
+    activation_checkpointing: false
+    tensor_parallel_size: 1
+    context_parallel_size: 1
+    custom_parallel_plan: null
+  dynamic_batching:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    sequence_length_round: 64
+  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
+  max_grad_norm: 1.0
+  sequence_packing:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: modified_first_fit_decreasing
+    sequence_length_round: 64
+  optimizer: null
+  scheduler:
+  - name: torch.optim.lr_scheduler.LinearLR
+    kwargs:
+      start_factor: 0.1
+      end_factor: 1.0
+      total_iters: 50
+  - name: torch.optim.lr_scheduler.ConstantLR
+    kwargs:
+      factor: 1.0
+      total_iters: 10000000000
+  - milestones:
+    - 50
+  generation:
+    backend: vllm
+    max_new_tokens: 1024
+    temperature: 1.0
+    top_p: 1.0
+    top_k: null
+    stop_token_ids: null
+    stop_strings: null
+    vllm_cfg:
+      async_engine: false
+      precision: ${policy.precision}
+      tensor_parallel_size: 1
+      pipeline_parallel_size: 1
+      expert_parallel_size: 1
+      gpu_memory_utilization: 0.6
+      max_model_len: ${policy.max_total_sequence_length}
+      enforce_eager: false
+      enable_expert_parallel: false
+    colocated:
+      enabled: true
+      resources:
+        gpus_per_node: null
+        num_nodes: null
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 0
+    activation_checkpointing: false
+    converter_type: Qwen2ForCausalLM
+    tensor_model_parallel_size: 1
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+    sequence_parallel: false
+    freeze_moe_router: true
+    moe_router_dtype: fp64
+    moe_router_load_balancing_type: none
+    moe_router_bias_update_rate: 0.0
+    moe_permute_fusion: false
+    apply_rope_fusion: true
+    optimizer:
+      optimizer: adam
+      lr: 2.0e-07
+      min_lr: 2.0e-07
+      weight_decay: 0.01
+      bf16: true
+      fp16: false
+      params_dtype: float32
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1.0e-08
+      sgd_momentum: 0.9
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+      clip_grad: ${policy.max_grad_norm}
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: constant
+      lr_decay_style: constant
+      lr_decay_iters: 1000
+      lr_warmup_iters: 50
+      lr_warmup_init: 2.0e-08
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: false
+      overlap_param_gather: true
+      average_in_collective: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: optim_grads_params
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}
+  prompt_file: examples/prompts/clevr_cogent_cot.txt
+  system_prompt_file: null
+  dataset_name: clevr-cogent
+  split: trainA
+  shuffle: true
+env:
+  clevr-cogent:
+    num_workers: 8
+    reward_functions:
+    - name: format
+      weight: 0.2
+    - name: exact_alnum
+      weight: 0.8
+  geometry3k:
+    num_workers: 8
+    reward_functions:
+    - name: format
+      weight: 0.1
+    - name: math_expr
+      weight: 0.9
+  refcoco:
+    num_workers: 8
+    reward_functions:
+    - name: format
+      weight: 0.1
+    - name: bbox_giou
+      weight: 0.9
+      kwargs:
+        giou_penalty_thres: 0.5
+logger:
+  log_dir: logs
+  num_val_samples_to_print: 0
+  wandb_enabled: false
+  tensorboard_enabled: true
+  swanlab_enabled: false
+  mlflow_enabled: false
+  monitor_gpus: false
+  wandb:
+    project: grpo-dev
+    name: vlm-grpo-3b-megatron
+  tensorboard: {}
+  gpu_monitoring:
+    collection_interval: 10
+    flush_interval: 10
+cluster:
+  gpus_per_node: 2
+  num_nodes: 1
@@ -194,16 +194,29 @@ def hf_data_processor(
 
     length = sum(len(m["token_ids"]) for m in message_log)
     loss_multiplier = 1.0
-    if length > max_seq_length:
+    if length >= max_seq_length:
+        # Treat truncated messages as text only
+        vllm_kwargs = {
+            "vllm_content": None,
+            "vllm_images": [],
+        }
+
         # shrink every message to a few tokens; loss_multiplier is zeroed below so the sample is masked out
         for chat_message in message_log:
             chat_message["token_ids"] = chat_message["token_ids"][
                 : min(4, max_seq_length // len(message_log))
             ]
+            for key, value in chat_message.items():
+                if isinstance(value, PackedTensor):
+                    chat_message[key] = PackedTensor.empty_like(value)
         loss_multiplier = 0.0
-        raise NotImplementedError(
-            "Sequence length is too long, please use a shorter sequence length"
-        )
+    else:
+        # Sequence fits: hand the vLLM backend the formatted dialog for the entire
+        # conversation together with the list of images it needs for serving.
+        vllm_kwargs = {
+            "vllm_content": string_formatted_dialog,
+            "vllm_images": images,
+        }
 
     output: DatumSpec = {
         "message_log": message_log,
@@ -212,10 +225,7 @@ def hf_data_processor(
         "loss_multiplier": loss_multiplier,
         "idx": idx,
         "task_name": task_data_spec.task_name,
-        # get the prompt content! (use this for vllm-backend that needs formatted dialog and list of images) for the entire conversation
-        # add images for vllm serving
-        "vllm_content": string_formatted_dialog,
-        "vllm_images": images,
+        **vllm_kwargs,
     }
     return output