Skip to content

Commit 1086772

Browse files
ko3n1gjQizhangcuichenx
authored
Add DeepSeek-R1 Distillation NeMo 2.0 tutorial (#12187) (#12355)
* add distillation tutorial
* add reason data generation tutorial
* minor fixes
* add readme
* fix qwen importer
* fix minor bugs
* remove hack
* pylint
* Apply isort and black reformatting
* pylint

---------

Signed-off-by: Chen Cui <chcui@nvidia.com>
Signed-off-by: Chen Cui <cxcui@alumni.cmu.edu>
Signed-off-by: cuichenx <cuichenx@users.noreply.github.com>
Co-authored-by: bigbigQI <871052938@qq.com>
Co-authored-by: lark zhang <larkz@nvidia.com>
Co-authored-by: Chen Cui <chcui@nvidia.com>
Co-authored-by: Chen Cui <cxcui@alumni.cmu.edu>
Co-authored-by: cuichenx <cuichenx@users.noreply.github.com>
1 parent e265bc0 commit 1086772

File tree

4 files changed

+1173
-3
lines changed

4 files changed

+1173
-3
lines changed

nemo/collections/llm/gpt/model/qwen2.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636

3737
@dataclass
3838
class Qwen2Config(GPTConfig):
39+
"""
40+
Base config for Qwen 2 Models
41+
"""
42+
3943
normalization: str = "RMSNorm"
4044
activation_func: Callable = F.silu
4145
gated_linear_unit: bool = True
@@ -54,6 +58,10 @@ class Qwen2Config(GPTConfig):
5458

5559
@dataclass
5660
class Qwen2Config500M(Qwen2Config):
61+
"""
62+
Config for Qwen 2 0.5B: https://huggingface.co/Qwen/Qwen2-0.5B
63+
"""
64+
5765
num_layers: int = 24
5866
hidden_size: int = 896
5967
num_attention_heads: int = 14
@@ -63,6 +71,10 @@ class Qwen2Config500M(Qwen2Config):
6371

6472
@dataclass
6573
class Qwen2Config1P5B(Qwen2Config):
74+
"""
75+
Config for Qwen 2 1.5B: https://huggingface.co/Qwen/Qwen2-1.5B
76+
"""
77+
6678
num_layers: int = 28
6779
hidden_size: int = 1536
6880
num_attention_heads: int = 12
@@ -72,6 +84,10 @@ class Qwen2Config1P5B(Qwen2Config):
7284

7385
@dataclass
7486
class Qwen2Config7B(Qwen2Config):
87+
"""
88+
Config for Qwen 2 7B: https://huggingface.co/Qwen/Qwen2-7B
89+
"""
90+
7591
num_layers: int = 28
7692
hidden_size: int = 3584
7793
num_attention_heads: int = 28
@@ -82,17 +98,24 @@ class Qwen2Config7B(Qwen2Config):
8298

8399
@dataclass
84100
class Qwen2Config72B(Qwen2Config):
101+
"""
102+
Config for Qwen 2 72B: https://huggingface.co/Qwen/Qwen2-72B
103+
"""
104+
85105
num_layers: int = 80
86106
hidden_size: int = 8192
87107
num_attention_heads: int = 64
88108
num_query_groups: int = 8
89109
ffn_hidden_size: int = 29568
90110
vocab_size: int = 152064
91111
layernorm_epsilon: float = 1e-5
92-
vocab_size: int = 152064
93112

94113

95114
class Qwen2Model(GPTModel):
115+
"""
116+
Base model for Qwen 2
117+
"""
118+
96119
def __init__(
97120
self,
98121
config: Annotated[Optional[Qwen2Config], Config[Qwen2Config]] = None,
@@ -105,6 +128,7 @@ def __init__(
105128

106129
@io.model_importer(Qwen2Model, "hf")
107130
class HFQwen2Importer(io.ModelConnector["AutoModelForCausalLM", Qwen2Model]):
131+
# pylint: disable=C0115,C0116
108132
def init(self) -> Qwen2Model:
109133
return Qwen2Model(self.config, tokenizer=self.tokenizer)
110134

@@ -163,6 +187,8 @@ def config(self) -> Qwen2Config:
163187
make_vocab_size_divisible_by=128,
164188
rotary_base=source.rope_theta,
165189
share_embeddings_and_output_weights=False,
190+
vocab_size=source.vocab_size,
191+
seq_length=source.max_position_embeddings,
166192
fp16=(dtype_from_hf(source) == torch.float16),
167193
bf16=(dtype_from_hf(source) == torch.bfloat16),
168194
params_dtype=dtype_from_hf(source),
@@ -173,6 +199,7 @@ def config(self) -> Qwen2Config:
173199

174200
@io.model_exporter(Qwen2Model, "hf")
175201
class HFQwen2Exporter(io.ModelConnector[Qwen2Model, "AutoModelForCausalLM"]):
202+
# pylint: disable=C0115,C0116
176203
def init(self, dtype=torch.bfloat16) -> "AutoModelForCausalLM":
177204
from transformers import AutoModelForCausalLM
178205
from transformers.modeling_utils import no_init_weights
@@ -288,7 +315,6 @@ def _import_qkv_bias(ctx: io.TransformCTX, q, k, v):
288315
head_num = megatron_config.num_attention_heads
289316
num_query_groups = megatron_config.num_query_groups
290317
heads_per_group = head_num // num_query_groups
291-
hidden_size = megatron_config.hidden_size
292318
head_size = megatron_config.kv_channels
293319

294320
new_q_tensor_shape = (head_num, head_size)
@@ -360,7 +386,6 @@ def _export_qkv_bias(ctx: io.TransformCTX, qkv_bias):
360386
head_num = megatron_config.num_attention_heads
361387
num_query_groups = megatron_config.num_query_groups
362388
heads_per_group = head_num // num_query_groups
363-
hidden_size = megatron_config.hidden_size
364389
head_size = megatron_config.kv_channels
365390
qkv_total_dim = head_num + 2 * num_query_groups
366391

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Distilling the Reasoning Ability of DeepSeek R1 into Qwen with the NeMo 2.0 Framework
2+
=====================================================================================
3+
4+
DeepSeek R1 is an open-source large language model dedicated to solving logical reasoning tasks. It employs a Mixture of Experts (MoE) architecture and boasts 671B parameters. Through reinforcement learning, it has been trained to perform deep thinking (generating long-chain-of-thought), excelling in reasoning tasks and various specialized fields such as mathematics, programming, and scientific analysis.
5+
6+
Moreover, as per the `DeepSeek-R1 <https://arxiv.org/abs/2501.12948>`_ paper, the reasoning patterns of larger models can be distilled into smaller ones. Specifically, we can distill long-chain-of-thought (long-CoT) data that encapsulates reasoning processes from DeepSeek-R1 and directly fine-tune open-source models like Qwen and Llama. This simple distillation approach greatly enhances the reasoning capabilities of smaller models.
7+
8+
9+
To illustrate the complete distillation process, we have prepared two notebooks demonstrating how to extract reasoning data from DeepSeek-R1 using the NIM API, and how to train models with the distilled data.
10+
11+
* `generate_reasoning_data.ipynb <./generate_reasoning_data.ipynb>`_ demonstrates the process of distilling reasoning data from DeepSeek-R1 using the NIM API.
12+
* `qwen2_distill_nemo.ipynb <./qwen2_distill_nemo.ipynb>`_ shows how to train open-source models with the distilled data.

0 commit comments

Comments
 (0)