
Commit 17ea9ab

zpqiushuo-nvidia and shuo-nvidia authored

feat: add on policy distillation algorithm (#1006)

Signed-off-by: shuo_nvidia <shuoyang@nvidia.com>
Signed-off-by: alexchiu <qiuzhaopeng@foxmail.com>
Signed-off-by: Zhaopeng Qiu <alexq@nvidia.com>
Signed-off-by: Zhaopeng Qiu <qiuzhaopeng@foxmail.com>
Signed-off-by: shuo-nvidia <shuoyang@nvidia.com>
Co-authored-by: shuo_nvidia <shuoyang@nvidia.com>
1 parent b445a3a commit 17ea9ab

File tree

36 files changed: +4490 -15 lines changed


README.md

Lines changed: 47 additions & 1 deletion
@@ -3,6 +3,8 @@
 [![CICD NeMo RL](https://github.com/NVIDIA-NeMo/RL/actions/workflows/cicd-main.yml/badge.svg?branch=main&event=schedule)](https://github.com/NVIDIA-NeMo/RL/actions/workflows/cicd-main.yml)

 ## 📣 News
+* [9/25/2025] On-policy Distillation (Qwen3-style)
+  * Student generates on-policy sequences and aligns logits to a larger teacher via KL, achieving near-larger-model quality at lower cost than RL. See [On-policy Distillation](#on-policy-distillation).
 * [7/25/2025] [Release v0.3.0!](https://github.com/NVIDIA-NeMo/RL/releases/tag/v0.3.0)
   * 📝 [v0.3.0 Blog Post](https://nvidia-nemo.github.io/blog/2025/07/21/nemo-rl-v0.3/)
   * 📊 View the release run metrics on [Google Colab](https://colab.research.google.com/drive/15kpesCV1m_C5UQFStssTEjaN2RsBMeZ0?usp=sharing) to get a head start on your experimentation.
@@ -59,7 +61,7 @@ For detailed information on backend selection, configuration, and examples, see
 - 🔜 **Megatron Bridge Integration** - Integrate Megatron Bridge to enable training features from Megatron Core.
 - 🔜 **NeMo Automodel Integration** - Integrate NeMo Automodel to power our DTensor path.
 - 🔜 **New Models** - gpt-oss.
-- 🔜 **Expand Algorithms** - DAPO, GSPO.
+- 🔜 **Expand Algorithms** - DAPO, GSPO, On-policy Distillation.
 - 🔜 **GB200** - Add container support for GB200.
 - **Distributed Training** - Ray-based infrastructure.
 - **Environment Support and Isolation** - Support for multi-environment training and dependency isolation between components.
@@ -83,6 +85,7 @@ For detailed information on backend selection, configuration, and examples, see
 |Algorithms|Single Node|Multi-node|
 |-|-|-|
 |[GRPO](#grpo)|[GRPO Single Node](#grpo-single-node)|[GRPO Multi-node](#grpo-multi-node): [GRPO Qwen2.5-32B](#grpo-qwen25-32b), [GRPO Multi-Turn](#grpo-multi-turn)|
+|[On-policy Distillation](#on-policy-distillation)|[Distillation Single Node](#on-policy-distillation-single-node)|[Distillation Multi-node](#on-policy-distillation-multi-node)|
 |[Supervised Fine-Tuning (SFT)](#supervised-fine-tuning-sft)|[SFT Single Node](#sft-single-node)|[SFT Multi-node](#sft-multi-node)|
 |[DPO](#dpo)|[DPO Single Node](#dpo-single-node)|[DPO Multi-node](#dpo-multi-node)|
 |[RM](#rm)|[RM Single Node](#rm-single-node)|[RM Multi-node](#rm-multi-node)|
@@ -312,6 +315,49 @@ Reference example for training to play a Sliding Puzzle Game:
 uv run python examples/run_grpo_sliding_puzzle.py
 ```

+## On-policy Distillation
+
+We provide an example on-policy distillation experiment using the [DeepScaler dataset](https://huggingface.co/agentica-org/DeepScaleR-1.5B-Preview).
+
+> [!NOTE]
+> Distillation currently supports the DTensor training backend and the vLLM generation backend. Megatron generation/training paths are not supported yet.
+
+### On-policy Distillation Single Node
+
+To run on-policy distillation on a single node using `Qwen/Qwen3-1.7B-Base` as the student and `Qwen/Qwen3-4B` as the teacher:
+
+```sh
+uv run python examples/run_distillation_math.py
+```
+
+Customize parameters with command-line overrides. For example:
+
+```sh
+uv run python examples/run_distillation_math.py \
+  policy.model_name="Qwen/Qwen3-1.7B-Base" \
+  teacher.model_name="Qwen/Qwen3-4B" \
+  cluster.gpus_per_node=8
+```
+
+### On-policy Distillation Multi-node
+
+```sh
+# Run from the root of NeMo RL repo
+NUM_ACTOR_NODES=2
+
+COMMAND="uv run ./examples/run_distillation_math.py --config examples/configs/distillation_math.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/distill_2nodes' logger.wandb_enabled=True logger.wandb.name='distill-2nodes'" \
+CONTAINER=YOUR_CONTAINER \
+MOUNTS="$PWD:$PWD" \
+sbatch \
+    --nodes=${NUM_ACTOR_NODES} \
+    --account=YOUR_ACCOUNT \
+    --job-name=YOUR_JOBNAME \
+    --partition=YOUR_PARTITION \
+    --time=4:0:0 \
+    --gres=gpu:8 \
+    ray.sub
+```
+
 ## Supervised Fine-Tuning (SFT)

 We provide example SFT experiments using various datasets including [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/), OpenAI format datasets (with tool calling support), and custom JSONL datasets. For detailed documentation on supported datasets and configurations, see the [SFT documentation](docs/guides/sft.md).
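---

The news entry and the new README section above describe the algorithm at a high level: the student generates its own rollouts, and its per-token logits are pulled toward a larger teacher through a KL objective computed over the teacher's top-k logits (see `loss_fn` and `topk_logits_k` in the config that follows). As a rough illustration only, here is a minimal PyTorch sketch of such a top-k forward/reverse/mixed KL loss. The function name, tensor shapes, and the lack of prompt/padding masking are assumptions for the sketch; it is not the implementation added by this commit.

```python
import torch
import torch.nn.functional as F


def topk_kl_distillation_loss(
    student_logits: torch.Tensor,   # [batch, seq_len, vocab]
    teacher_logits: torch.Tensor,   # [batch, seq_len, vocab]
    kl_type: str = "mixed",         # "forward", "reverse", or "mixed"
    mixed_kl_weight: float = 0.5,   # weight of the forward KL when kl_type == "mixed"
    topk_logits_k: int = 64,
) -> torch.Tensor:
    # Keep only the teacher's top-k vocabulary entries per position and gather
    # the matching student logits, so both distributions share the same support.
    topk_teacher, topk_idx = teacher_logits.topk(topk_logits_k, dim=-1)
    topk_student = student_logits.gather(-1, topk_idx)

    # Renormalizing inside the top-k is just one way to treat the tail of the
    # distribution (cf. the `zero_outside_topk` option in the config below).
    teacher_logp = F.log_softmax(topk_teacher, dim=-1)
    student_logp = F.log_softmax(topk_student, dim=-1)

    # Forward KL, KL(teacher || student): mode-covering.
    forward_kl = (teacher_logp.exp() * (teacher_logp - student_logp)).sum(dim=-1)
    # Reverse KL, KL(student || teacher): mode-seeking.
    reverse_kl = (student_logp.exp() * (student_logp - teacher_logp)).sum(dim=-1)

    if kl_type == "forward":
        kl = forward_kl
    elif kl_type == "reverse":
        kl = reverse_kl
    else:  # "mixed"
        kl = mixed_kl_weight * forward_kl + (1.0 - mixed_kl_weight) * reverse_kl

    # A real loss would mask prompt/padding positions and normalize by the
    # number of generated tokens; a plain mean keeps the sketch short.
    return kl.mean()
```

Restricting both distributions to the teacher's top-k entries bounds the amount of teacher logit data handled per token, which appears to be what `topk_logits_k: 64` and `zero_outside_topk` in the configuration control.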
examples/configs/distillation_math.yaml

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# Distillation Algorithm Configuration
distillation:
  num_prompts_per_step: 128
  num_generations_per_prompt: 1
  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
  max_num_steps: 1000
  val_batch_size: 64
  val_period: 20
  val_at_start: false
  max_val_samples: 512
  topk_logits_k: 64
  seed: 42

loss_fn:
  kl_type: "mixed" # forward, reverse, mixed
  mixed_kl_weight: 0.5 # when kl_type is "mixed", this is the weight of the forward KL
  zero_outside_topk: false # zero out the teacher logits outside the top-k when calculating the forward KL loss

checkpointing:
  enabled: true
  checkpoint_dir: "checkpoints/distillation-${policy.model_name}"
  metric_name: "val_reward"
  higher_is_better: true
  keep_top_k: 3
  save_period: 10
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false

policy: &POLICY_BASE
  model_name: "Qwen/Qwen3-1.7B-Base"
  tokenizer:
    name: ${..model_name} ## specify if you'd like to use a tokenizer different from the model's default
  train_global_batch_size: 64
  train_micro_batch_size: 1
  generation_batch_size: 64
  logprob_batch_size: 1
  max_total_sequence_length: 8192
  precision: "bfloat16"
  logprob_chunk_size: null

  dtensor_cfg: &DTENSOR_BASE
    enabled: true
    _v2: true
    cpu_offload: False
    sequence_parallel: false
    activation_checkpointing: true
    tensor_parallel_size: 2
    context_parallel_size: 2
    custom_parallel_plan: null

  dynamic_batching:
    enabled: true
    train_mb_tokens: ${mul:${..max_total_sequence_length}, ${..train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${..max_total_sequence_length}, ${..logprob_batch_size}}
    sequence_length_round: 64

  sequence_packing:
    enabled: false
    train_mb_tokens: ${mul:${..max_total_sequence_length}, ${..train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${..max_total_sequence_length}, ${..logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  max_grad_norm: 1.0
  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  # must be divisible by 2*cp
  make_sequence_length_divisible_by: ${mul:${mul:${.dtensor_cfg.tensor_parallel_size}, ${.dtensor_cfg.context_parallel_size}}, 2}
  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 2.0e-5
      weight_decay: 0.01
      betas: [0.9, 0.999]
      eps: 1e-8
      # when using Dtensor, we need to set foreach
      # and fused to False
      foreach: False
      fused: False

  megatron_cfg: # [TODO]
    enabled: false

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 10
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [10]

  generation:
    backend: "vllm"
    max_new_tokens: ${..max_total_sequence_length} # refer to local policy/teacher config
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${...precision}
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
      gpu_memory_utilization: 0.6
      max_model_len: ${...max_total_sequence_length} # refer to local policy/teacher config
      enforce_eager: False
      use_deep_gemm: False
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      distributed_executor_backend: null

    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster, i.e. cluster.num_nodes == 1
        num_nodes: null # Decides number of nodes to be dedicated to generation


teacher:
  <<: *POLICY_BASE
  model_name: "Qwen/Qwen3-4B"
  dtensor_cfg:
    <<: *DTENSOR_BASE
    context_parallel_size: 2
    tensor_parallel_size: 4

data:
  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
  prompt_file: "examples/prompts/cot.txt"
  system_prompt_file: null
  dataset_name: "DeepScaler"
  shuffle: true

env:
  math:
    num_workers: 8

logger:
  log_dir: "logs/distillation"
  num_val_samples_to_print: 5
  wandb_enabled: true
  tensorboard_enabled: true
  mlflow_enabled: false
  swanlab_enabled: false
  monitor_gpus: true
  wandb:
    project: "nemo-distillation"
    name: "distillation-${data.dataset_name}-${teacher.model_name}-${policy.model_name}-${loss_fn.kl_type}-${distillation.topk_logits_k}"
  tensorboard:
    log_dir: "tb_logs-distillation-${data.dataset_name}"
  mlflow:
    experiment_name: "distillation-dev"
    run_name: "distillation-math-cl-logger"
  gpu_monitoring:
    collection_interval: 10
    flush_interval: 10

cluster:
  gpus_per_node: 8
  num_nodes: 1
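A note on the `${mul:...}` entries in the config above: they derive values from other fields, e.g. `train_mb_tokens = max_total_sequence_length × train_micro_batch_size = 8192 × 1 = 8192` and `make_sequence_length_divisible_by = TP × CP × 2 = 2 × 2 × 2 = 8`. The following is a self-contained sketch of how such expressions could resolve with an OmegaConf-style resolver; the resolver registration shown here is an assumption, not necessarily how NeMo RL wires it up.

```python
# Hypothetical sketch of resolving the ${mul:...} interpolations with OmegaConf.
from omegaconf import OmegaConf

# Assumption: a "mul" resolver is registered somewhere in the training code.
OmegaConf.register_new_resolver("mul", lambda a, b: a * b, replace=True)

cfg = OmegaConf.create(
    """
policy:
  max_total_sequence_length: 8192
  train_micro_batch_size: 1
  dtensor_cfg:
    tensor_parallel_size: 2
    context_parallel_size: 2
  dynamic_batching:
    train_mb_tokens: ${mul:${..max_total_sequence_length}, ${..train_micro_batch_size}}
  make_sequence_length_divisible_by: ${mul:${mul:${.dtensor_cfg.tensor_parallel_size}, ${.dtensor_cfg.context_parallel_size}}, 2}
"""
)

# 8192 tokens/sequence * 1 sequence = 8192 tokens per dynamic-batching microbatch
print(cfg.policy.dynamic_batching.train_mb_tokens)   # 8192
# TP * CP * 2 = 2 * 2 * 2 = 8, matching the "must be divisible by 2*cp" comment
print(cfg.policy.make_sequence_length_divisible_by)  # 8
```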
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
defaults: ../../distillation_math.yaml
distillation:
  num_prompts_per_step: 64
  max_num_steps: 20
  val_batch_size: 32
  val_period: 10
  max_val_samples: 256
loss_fn:
  kl_type: reverse
checkpointing:
  checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base
policy:
  train_global_batch_size: 32
  generation_batch_size: 32
  dtensor_cfg:
    tensor_parallel_size: 1
    context_parallel_size: 1
  dynamic_batching:
    enabled: false
  make_sequence_length_divisible_by: 1
  scheduler:
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 20
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones:
        - 20
teacher:
  model_name: Qwen/Qwen3-32B
  train_global_batch_size: 32
  generation_batch_size: 32
  dtensor_cfg:
    context_parallel_size: 1
  dynamic_batching:
    enabled: false
  make_sequence_length_divisible_by: 1
  scheduler:
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 20
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones:
        - 20
logger:
  log_dir: logs/distillation-qwen3-32b-to-1.7b-base
  wandb:
    project: nemo-rl
    name: distillation-qwen3-32b-to-1.7b-base
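The `scheduler` lists in the base config and in the recipe above follow the same pattern: a `LinearLR` warmup, a `ConstantLR` tail, and a final `milestones` entry marking where to switch. One plausible reading is a `torch.optim.lr_scheduler.SequentialLR`-style composition; the sketch below shows that interpretation for the recipe's values (20 warmup steps, then constant). It is an assumption about how the list is consumed, not NeMo RL's actual scheduler-building code.

```python
# Hypothetical sketch: the recipe's scheduler list read as a SequentialLR chain.
import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2.0e-5, weight_decay=0.01,
    betas=(0.9, 0.999), eps=1e-8, foreach=False, fused=False,
)

# First list entry: linear warmup from 0.1x to 1.0x of the base LR over 20 steps.
warmup = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.1, end_factor=1.0, total_iters=20)
# Second entry: hold the LR constant afterwards.
constant = torch.optim.lr_scheduler.ConstantLR(
    optimizer, factor=1.0, total_iters=10_000_000_000)

# `milestones: [20]` -> switch from the warmup scheduler to the constant one
# after 20 optimizer steps.
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup, constant], milestones=[20])

for step in range(40):
    # ... forward/backward would go here in real training ...
    optimizer.step()
    scheduler.step()
```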
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
defaults: ../../distillation_math.yaml
distillation:
  num_prompts_per_step: 64
  max_num_steps: 20
  val_batch_size: 32
  val_period: 10
  max_val_samples: 256
loss_fn:
  kl_type: reverse
checkpointing:
  checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-dynamicbatch
policy:
  model_name: Qwen/Qwen3-4B-Base
  train_global_batch_size: 32
  generation_batch_size: 32
  dtensor_cfg:
    context_parallel_size: 1
  make_sequence_length_divisible_by: 2
  scheduler:
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 20
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones:
        - 20
teacher:
  model_name: Qwen/Qwen3-32B
  train_global_batch_size: 32
  generation_batch_size: 32
  dtensor_cfg:
    tensor_parallel_size: 8
    context_parallel_size: 1
  make_sequence_length_divisible_by: 2
  scheduler:
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 20
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones:
        - 20
logger:
  log_dir: logs/distillation-qwen3-32b-to-4b-base-dynamicbatch
  wandb:
    project: nemo-rl
    name: distillation-qwen3-32b-to-4b-base-dynamicbatch
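Both recipes begin with `defaults: ../../distillation_math.yaml`, i.e. they are thin override layers on the base config above (reverse KL instead of mixed, a Qwen3-32B teacher, short 20-step runs). Below is a minimal sketch of how such a recipe could be layered onto its base with OmegaConf; the loader shown is an assumption about the mechanism, not NeMo RL's actual config plumbing. As a rough sizing note, a 32B-parameter teacher in bf16 holds about 32e9 × 2 bytes ≈ 64 GB of weights, so this recipe's `teacher.dtensor_cfg.tensor_parallel_size: 8` leaves roughly 8 GB of teacher weights per GPU on an 8-GPU node.

```python
# Hypothetical sketch of resolving a recipe's `defaults:` pointer against the
# base config; the real override mechanism in NeMo RL may differ.
from pathlib import Path
from omegaconf import OmegaConf


def load_recipe(recipe_path: str):
    recipe = OmegaConf.load(recipe_path)
    base_rel = recipe.pop("defaults", None)  # e.g. "../../distillation_math.yaml"
    if base_rel is None:
        return recipe
    base = OmegaConf.load(Path(recipe_path).parent / str(base_rel))
    # Recipe keys win over the base: loss_fn.kl_type -> "reverse",
    # teacher.model_name -> "Qwen/Qwen3-32B", teacher TP -> 8, and list-valued
    # keys such as `scheduler` are replaced wholesale rather than merged.
    return OmegaConf.merge(base, recipe)
```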
