test

wwwjn · wwwjn · commit c21868612de1 · 2025-08-27T14:21:50.000-07:00
diff --git a/torchtitan/models/deepseek_v3/hf_implementation.py b/torchtitan/models/deepseek_v3/hf_implementation.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Hugging Face implementation for DeepSeek-V3 model inference.
+"""
+
+import argparse
+import gc
+import os
+import time
+
+import torch
+
+
+def print_gpu_memory_usage(message=""):
+    """Print current GPU memory usage."""
+    if torch.cuda.is_available():
+        allocated = torch.cuda.memory_allocated() / (1024**3)
+        reserved = torch.cuda.memory_reserved() / (1024**3)
+        print(
+            f"GPU Memory ({message}): Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB"
+        )
+
+
+def run_huggingface_implementation(args, _):
+    """Run the DeepSeek-V3 model using Hugging Face Transformers."""
+    # Disable Hugging Face cache
+    from transformers import AutoConfig, AutoModelForCausalLM
+
+    # We're not using the tokenizer anymore, using fake inputs instead
+    # Use local path for model weights if specified, otherwise use model_name
+    model_path = args.model_path
+    print(f"Loading model from local path: {model_path}")
+    start_time = time.time()
+
+    quantization_config = {
+        "activation_scheme": "dynamic",
+        "fmt": "e4m3",
+        "quant_method": "fp8",  # Updated from fp8 to fbgemm_fp8
+        "weight_block_size": [128, 128],
+    }
+    print(f"Using quantization config: {quantization_config}")
+
+    # ============= Change config to only use a few layers  =============
+    config = None
+    if args.num_layers > 0:
+        # Try to load config from local path first, fall back to model_name if needed
+        try:
+            config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+        except Exception as e:
+            print(f"Could not load config from local path: {e}")
+            print(f"Falling back to loading config from {args.model_name}")
+            config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True)
+
+        config.n_group = 1  # make n_groups = a huge group
+        config.topk_group = 1  # make topk_group = a huge group
+        # tailer the first several layers
+        config.num_hidden_layers = args.num_layers
+        # Explicitly set rope_interleaved to True to use the interleaved rope implementation
+        config.rope_interleaved = True
+        print(f"Modified config to use only {args.num_layers} layers")
+        print(f"Config of Deepseek: {config}")
+
+    # Load the model from local path
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        device_map="cuda",  # Try with specific device first
+        config=config,
+        trust_remote_code=True,
+        # Disable features that can cause issues with device mapping
+        attn_implementation="eager",  # Use standard attention instead of flash attention
+        quantization_config=quantization_config,
+        local_files_only=True,  # Only use local files, don't fetch from cache
+        use_auth_token=False,  # Don't try to authenticate with HF
+    )
+
+    print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+    print_gpu_memory_usage("After loading model")
+
+    # Get the device where the model is loaded
+    device = next(model.parameters()).device
+    print(f"Model is on device: {device}")
+
+    # Create fake input directly on the correct device
+    print("\nCreating fake input with the same shape as tokenized input")
+
+    # Define sequence length for fake input
+    seq_length = 2048  # You can adjust this based on your needs
+    vocab_size = 50000
+
+    with torch.no_grad():
+        # Create fake input_ids directly on the device - using random integers between 0 and 50000 (typical vocab size)
+        torch.manual_seed(42)
+        tokens = torch.randint(
+            0, vocab_size, (1, seq_length), dtype=torch.long, device="cuda"
+        )
+
+        # Create fake attention_mask directly on the device - all 1s for full attention
+        attention_mask = torch.ones((1, seq_length), dtype=torch.long, device=device)
+
+        # Create inputs dictionary similar to what tokenizer would produce
+        inputs = {"input_ids": tokens, "attention_mask": attention_mask}
+
+        # Print input information
+        print(f"Fake input token IDs: {inputs['input_ids'][0][:10].cpu().numpy()}...")
+        print(f"Fake input shape: {inputs['input_ids'].shape}")
+        print(f"Input tensors device: {inputs['input_ids'].device}")
+
+    # Run a single forward pass
+    print("\nRunning single forward pass...")
+    start_time = time.time()
+
+    with torch.no_grad():
+        # Forward pass through the model with output_hidden_states=True and output_attentions=True
+        outputs = model(
+            **inputs, output_hidden_states=True, output_attentions=True, use_cache=False
+        )
+
+    forward_time = time.time() - start_time
+
+    # Get the logits from the output
+    logits = outputs.logits if hasattr(outputs, "logits") else outputs
+
+    # Get the predictions for the next token (highest probability)
+    next_token_logits = logits[:, -1, :]
+    print(f"\nNext token logits : {next_token_logits}")
+    next_token_probs = torch.softmax(next_token_logits, dim=-1)
+    print(f"\nNext token probabilities: {next_token_probs}")
+    top_k_values, top_k_indices = torch.topk(next_token_probs, 5, dim=-1)
+
+    print("\nForward Pass Results:")
+    print(f"- Output logits shape: {logits.shape}")
+    print(f"- Sequence length: {logits.shape[1]}")
+    print(f"- Vocabulary size: {logits.shape[2]}")
+
+    print(
+        "\nTop 5 predicted next tokens (showing IDs only since we're not using tokenizer):"
+    )
+    for i, (value, index) in enumerate(zip(top_k_values[0], top_k_indices[0])):
+        print(f"  {i+1}. Token ID: {index} - Probability: {value.item():.4f}")
+
+    print(f"\nForward pass stats:")
+    print(f"- Time: {forward_time:.4f} seconds")
+    print(f"- Input tokens: {inputs['input_ids'].shape[1]}")
+    print(f"- Tokens per second: {inputs['input_ids'].shape[1] / forward_time:.2f}")
+    print_gpu_memory_usage("After forward pass")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Load and test DeepSeek-V3 model")
+    parser.add_argument(
+        "--num_layers",
+        type=int,
+        default=5,  # tailered to 5 layers for 671B model
+        help="Number of layers to use (0 for all layers)",
+    )
+
+    # Hugging Face specific arguments
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="/data/users/jianiw/model/DeepSeek-V3.1-Base",
+        help="Hugging Face model name or path",
+    )
+
+    args = parser.parse_args()
+    run_huggingface_implementation(args, None)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py
@@ -11,7 +11,7 @@
 from torch import nn
 
 from torchtitan.models.attention import build_attention
-from torchtitan.models.moe import FeedForward, MoE
+from torchtitan.models.moe import FeedForward, MoE, print_tensor_stats
 from torchtitan.protocols.train_spec import ModelProtocol
 
 from .args import DeepSeekV3ModelArgs
@@ -295,9 +295,12 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
         Returns:
             torch.Tensor: Output tensor with the same shape as the input.
         """
+        print_tensor_stats(f"input of TransformerBlock {self.layer_id}: ", x)
         x = x + self.attention(self.attention_norm(x), freqs_cis)
         if self.moe_enabled:
-            x = x + self.moe(self.ffn_norm(x))
+            x = self.ffn_norm(x)
+            print_tensor_stats(f"After ffn_norm : ", x)
+            x = x + self.moe(x)
         else:
             x = x + self.feed_forward(self.ffn_norm(x))
         return x
@@ -385,8 +388,11 @@ def forward(
 
         h = self.tok_embeddings(tokens) if self.tok_embeddings is not None else tokens
 
+
+        token_inputs = h
         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis)
+            # reset before each layer
+            h = layer(token_inputs, self.freqs_cis)
         h = self.norm(h) if self.norm is not None else h
         output = self.output(h) if self.output is not None else h
         return output
diff --git a/torchtitan/models/deepseek_v3/model/state_dict_adapter.py b/torchtitan/models/deepseek_v3/model/state_dict_adapter.py
@@ -453,14 +453,18 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
                     expert_weights_by_layer, titan_abstract_key, value.device_mesh
                 )
 
-                if stacked_value is not None:
-                    local_tensor = stacked_value._local_tensor
 
-                    tensor_list = local_tensor.tolist()
-                    # Save to JSON file
-                    import json
-                    with open(f'my_implementation_tensor_{new_key}.json', 'w') as f:
-                        json.dump(tensor_list, f)
+                if stacked_value is not None:
+                    if torch.distributed.get_rank() == 0:
+                        print("saving tensor to json file")
+                        local_tensor = stacked_value._local_tensor
+                        print("stacked_value: ", stacked_value.shape, stacked_value.device_mesh, stacked_value.placements, "local_tensor: ", local_tensor.shape)
+                        
+                        tensor_list = local_tensor.tolist()
+                        # Save to JSON file
+                        import json
+                        with open(f'my_imp_tensor_222_{new_key}.json', 'w') as f:
+                            json.dump(tensor_list, f)
                     state_dict[new_key] = stacked_value
 
             elif "layers" in key:
diff --git a/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml b/torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml
@@ -35,8 +35,8 @@ decay_type = "cosine"
 min_lr_factor = 0.1
 
 [training]
-local_batch_size = 4
-seq_len = 4096
+local_batch_size = 2
+seq_len = 2048
 max_norm = 1.0  # grad norm clipping
 steps = 10
 compile = false
diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py
@@ -14,6 +14,14 @@
 from torchtitan.distributed.expert_parallel import expert_parallel
 
 
+def print_tensor_stats(name, tensor):
+    mean = tensor.mean().item()
+    std = tensor.std().item()
+    min_val = tensor.min().item()
+    max_val = tensor.max().item()
+    print(
+        f"{name} - Shape: {tensor.shape} Mean: {mean:.6f}, Min: {min_val:.6f}, Max: {max_val:.6f}, Std: {std:.6f}, First 10 values: {tensor.flatten()[:10].tolist()}"
+    )
 @dataclass
 class MoEArgs:
     num_experts: int = 8
@@ -367,9 +375,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         Returns:
             out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
         """
+
+        print_tensor_stats("input of MoE module: ", x)
+
         bs, slen, dim = x.shape
         x = x.view(-1, dim)
-
+        
         # top_scores and selected_experts_indices shape (bs*slen*top_k,)
         # num_tokens_per_expert shape (num_experts,)
         (
@@ -378,6 +389,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             num_tokens_per_expert,
         ) = self.router(x, self.expert_bias)
 
+        print_tensor_stats("top_scores of router: ", top_scores)
+
         # tokens_per_expert will be used to update the expert bias for load balancing.
         # and also to count the expert usage
         # TODO: Activation Checkpointing has the side effect of double counting tokens_per_expert --
@@ -400,6 +413,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             num_tokens_per_expert,
         ) = self.reorderer(top_scores, selected_experts_indices)
 
+        # print_tensor_stats("selected_experts_indices of reorderer: ", selected_experts_indices)
+        # Print first 10 elements of selected_experts_indices
+        print(f"First 10 elements of selected_experts_indices: {selected_experts_indices.flatten()[:10].tolist()}")
+        
+
         # shape (bs*slen*top_k, dim)
         token_indices_experts_sorted = token_indices_experts_sorted.reshape(
             -1, 1
@@ -414,9 +432,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 * top_scores_experts_sorted.reshape(-1, 1)
             ).to(x.dtype)
 
+        print_tensor_stats("routed_input of GroupedExperts module: ", routed_input)
+
         # shape (bs*slen*top_k, dim)
         routed_output = self.experts(routed_input, num_tokens_per_expert)
 
+        print_tensor_stats("routed_output of GroupedExperts module: ", routed_output)
+
         if not self.score_before_experts:
             routed_output = (
                 routed_output.to(torch.float32)
@@ -426,13 +448,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # shared expert
         if self.shared_experts is not None:
             out = self.shared_experts(x)
+            print_tensor_stats("out of Shard Experts module: ", out)
         else:
             out = torch.zeros_like(x)
 
         out = out.scatter_add(
             dim=0, index=token_indices_experts_sorted, src=routed_output
         )
+
+        
         out = out.reshape(bs, slen, dim)
+        print_tensor_stats("out of MoE module: ", out)
         return out
 
     def init_weights(
diff --git a/torchtitan/train.py b/torchtitan/train.py