Commit c218686 — "test"
1 parent c56bac1

File tree: 6 files changed, +282 −16 lines
Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Hugging Face implementation for DeepSeek-V3 model inference.
"""

import argparse
import gc
import os
import time

import torch


def print_gpu_memory_usage(message=""):
    """Print current GPU memory usage."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / (1024**3)
        reserved = torch.cuda.memory_reserved() / (1024**3)
        print(
            f"GPU Memory ({message}): Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB"
        )


def run_huggingface_implementation(args, _):
    """Run the DeepSeek-V3 model using Hugging Face Transformers."""
    from transformers import AutoConfig, AutoModelForCausalLM

    # The tokenizer is not used; fake inputs are generated instead.
    # Load weights from the local path given on the command line.
    model_path = args.model_path
    print(f"Loading model from local path: {model_path}")
    start_time = time.time()

    quantization_config = {
        "activation_scheme": "dynamic",
        "fmt": "e4m3",
        "quant_method": "fp8",
        "weight_block_size": [128, 128],
    }
    print(f"Using quantization config: {quantization_config}")

    # ============= Change config to only use a few layers =============
    config = None
    if args.num_layers > 0:
        # Try to load the config from the local path first; fall back to model_name if needed.
        try:
            config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        except Exception as e:
            print(f"Could not load config from local path: {e}")
            print(f"Falling back to loading config from {args.model_name}")
            config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True)

        config.n_group = 1  # collapse expert routing groups into one large group
        config.topk_group = 1
        # Keep only the first several layers.
        config.num_hidden_layers = args.num_layers
        # Explicitly set rope_interleaved to True to use the interleaved rope implementation.
        config.rope_interleaved = True
        print(f"Modified config to use only {args.num_layers} layers")
        print(f"Config of DeepSeek: {config}")

    # Load the model from the local path.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda",  # Try with a specific device first
        config=config,
        trust_remote_code=True,
        # Disable features that can cause issues with device mapping.
        attn_implementation="eager",  # Use standard attention instead of flash attention
        quantization_config=quantization_config,
        local_files_only=True,  # Only use local files, don't fetch from the Hub
        use_auth_token=False,  # Don't try to authenticate with HF
    )

    print(f"Model loaded in {time.time() - start_time:.2f} seconds")
    print_gpu_memory_usage("After loading model")

    # Get the device where the model is loaded.
    device = next(model.parameters()).device
    print(f"Model is on device: {device}")

    # Create fake input directly on the correct device.
    print("\nCreating fake input with the same shape as tokenized input")

    # Define sequence length and vocabulary size for the fake input.
    seq_length = 2048  # adjust as needed
    vocab_size = 50000

    with torch.no_grad():
        # Create fake input_ids directly on the device: random integers in [0, vocab_size).
        torch.manual_seed(42)
        tokens = torch.randint(
            0, vocab_size, (1, seq_length), dtype=torch.long, device="cuda"
        )

        # Create a fake attention_mask directly on the device: all 1s for full attention.
        attention_mask = torch.ones((1, seq_length), dtype=torch.long, device=device)

        # Build an inputs dictionary similar to what the tokenizer would produce.
        inputs = {"input_ids": tokens, "attention_mask": attention_mask}

    # Print input information.
    print(f"Fake input token IDs: {inputs['input_ids'][0][:10].cpu().numpy()}...")
    print(f"Fake input shape: {inputs['input_ids'].shape}")
    print(f"Input tensors device: {inputs['input_ids'].device}")

    # Run a single forward pass.
    print("\nRunning single forward pass...")
    start_time = time.time()

    with torch.no_grad():
        # Forward pass through the model, returning hidden states and attentions.
        outputs = model(
            **inputs, output_hidden_states=True, output_attentions=True, use_cache=False
        )

    forward_time = time.time() - start_time

    # Get the logits from the output.
    logits = outputs.logits if hasattr(outputs, "logits") else outputs

    # Get the predictions for the next token (highest probability).
    next_token_logits = logits[:, -1, :]
    print(f"\nNext token logits: {next_token_logits}")
    next_token_probs = torch.softmax(next_token_logits, dim=-1)
    print(f"\nNext token probabilities: {next_token_probs}")
    top_k_values, top_k_indices = torch.topk(next_token_probs, 5, dim=-1)

    print("\nForward Pass Results:")
    print(f"- Output logits shape: {logits.shape}")
    print(f"- Sequence length: {logits.shape[1]}")
    print(f"- Vocabulary size: {logits.shape[2]}")

    print(
        "\nTop 5 predicted next tokens (showing IDs only since no tokenizer is used):"
    )
    for i, (value, index) in enumerate(zip(top_k_values[0], top_k_indices[0])):
        print(f"  {i+1}. Token ID: {index} - Probability: {value.item():.4f}")

    print("\nForward pass stats:")
    print(f"- Time: {forward_time:.4f} seconds")
    print(f"- Input tokens: {inputs['input_ids'].shape[1]}")
    print(f"- Tokens per second: {inputs['input_ids'].shape[1] / forward_time:.2f}")
    print_gpu_memory_usage("After forward pass")


def main():
    parser = argparse.ArgumentParser(description="Load and test DeepSeek-V3 model")
    parser.add_argument(
        "--num_layers",
        type=int,
        default=5,  # tailored to 5 layers for the 671B model
        help="Number of layers to use (0 for all layers)",
    )

    # Hugging Face specific arguments
    parser.add_argument(
        "--model_path",
        type=str,
        default="/data/users/jianiw/model/DeepSeek-V3.1-Base",
        help="Hugging Face model name or path",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="deepseek-ai/DeepSeek-V3",  # assumed fallback repo id; only consulted if the local config fails to load
        help="Hugging Face model name used as a fallback for config loading",
    )

    args = parser.parse_args()
    run_huggingface_implementation(args, None)


if __name__ == "__main__":
    main()
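For reference, a minimal way to exercise this script (its filename is not shown in this diff; "hf_deepseek_v3_infer.py" below is a placeholder) is a sketch like the following, where the flags mirror the argparse options defined above:

    # Hypothetical invocation of the new debug script; paths and module name are placeholders.
    #
    #   python hf_deepseek_v3_infer.py --num_layers 5 --model_path /path/to/DeepSeek-V3.1-Base
    #
    # Equivalently, the entry point can be driven from Python:
    import argparse

    import hf_deepseek_v3_infer as infer  # placeholder module name for the new script

    args = argparse.Namespace(
        num_layers=5,                              # truncate the model to 5 layers
        model_path="/path/to/DeepSeek-V3.1-Base",  # local weights directory (placeholder path)
        model_name="deepseek-ai/DeepSeek-V3",      # fallback repo id for config loading
    )
    infer.run_huggingface_implementation(args, None)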

torchtitan/models/deepseek_v3/model/model.py

Lines changed: 9 additions & 3 deletions
@@ -11,7 +11,7 @@
 from torch import nn
 
 from torchtitan.models.attention import build_attention
-from torchtitan.models.moe import FeedForward, MoE
+from torchtitan.models.moe import FeedForward, MoE, print_tensor_stats
 from torchtitan.protocols.train_spec import ModelProtocol
 
 from .args import DeepSeekV3ModelArgs
@@ -295,9 +295,12 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
         Returns:
             torch.Tensor: Output tensor with the same shape as the input.
         """
+        print_tensor_stats(f"input of TransformerBlock {self.layer_id}: ", x)
         x = x + self.attention(self.attention_norm(x), freqs_cis)
         if self.moe_enabled:
-            x = x + self.moe(self.ffn_norm(x))
+            x = self.ffn_norm(x)
+            print_tensor_stats(f"After ffn_norm : ", x)
+            x = x + self.moe(x)
         else:
             x = x + self.feed_forward(self.ffn_norm(x))
         return x
@@ -385,8 +388,11 @@ def forward(
 
         h = self.tok_embeddings(tokens) if self.tok_embeddings is not None else tokens
 
+
+        token_inputs = h
         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis)
+            # reset before each layer
+            h = layer(token_inputs, self.freqs_cis)
         h = self.norm(h) if self.norm is not None else h
         output = self.output(h) if self.output is not None else h
         return output
torchtitan/models/deepseek_v3/model/state_dict_adapter.py

Lines changed: 11 additions & 7 deletions
@@ -453,14 +453,18 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
                     expert_weights_by_layer, titan_abstract_key, value.device_mesh
                 )
 
-                if stacked_value is not None:
-                    local_tensor = stacked_value._local_tensor
 
-                    tensor_list = local_tensor.tolist()
-                    # Save to JSON file
-                    import json
-                    with open(f'my_implementation_tensor_{new_key}.json', 'w') as f:
-                        json.dump(tensor_list, f)
+                if stacked_value is not None:
+                    if torch.distributed.get_rank() == 0:
+                        print("saving tensor to json file")
+                        local_tensor = stacked_value._local_tensor
+                        print("stacked_value: ", stacked_value.shape, stacked_value.device_mesh, stacked_value.placements, "local_tensor: ", local_tensor.shape)
+
+                        tensor_list = local_tensor.tolist()
+                        # Save to JSON file
+                        import json
+                        with open(f'my_imp_tensor_222_{new_key}.json', 'w') as f:
+                            json.dump(tensor_list, f)
                     state_dict[new_key] = stacked_value
 
                 elif "layers" in key:

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_671b.toml

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ decay_type = "cosine"
 min_lr_factor = 0.1
 
 [training]
-local_batch_size = 4
-seq_len = 4096
+local_batch_size = 2
+seq_len = 2048
 max_norm = 1.0  # grad norm clipping
 steps = 10
 compile = false

torchtitan/models/moe.py

Lines changed: 27 additions & 1 deletion
@@ -14,6 +14,14 @@
 from torchtitan.distributed.expert_parallel import expert_parallel
 
 
+def print_tensor_stats(name, tensor):
+    mean = tensor.mean().item()
+    std = tensor.std().item()
+    min_val = tensor.min().item()
+    max_val = tensor.max().item()
+    print(
+        f"{name} - Shape: {tensor.shape} Mean: {mean:.6f}, Min: {min_val:.6f}, Max: {max_val:.6f}, Std: {std:.6f}, First 10 values: {tensor.flatten()[:10].tolist()}"
+    )
 @dataclass
 class MoEArgs:
     num_experts: int = 8
@@ -367,9 +375,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         Returns:
             out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
         """
+
+        print_tensor_stats("input of MoE module: ", x)
+
         bs, slen, dim = x.shape
         x = x.view(-1, dim)
-
+
         # top_scores and selected_experts_indices shape (bs*slen*top_k,)
         # num_tokens_per_expert shape (num_experts,)
         (
@@ -378,6 +389,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             num_tokens_per_expert,
         ) = self.router(x, self.expert_bias)
 
+        print_tensor_stats("top_scores of router: ", top_scores)
+
         # tokens_per_expert will be used to update the expert bias for load balancing.
         # and also to count the expert usage
         # TODO: Activation Checkpointing has the side effect of double counting tokens_per_expert --
@@ -400,6 +413,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             num_tokens_per_expert,
         ) = self.reorderer(top_scores, selected_experts_indices)
 
+        # print_tensor_stats("selected_experts_indices of reorderer: ", selected_experts_indices)
+        # Print the first 10 elements of selected_experts_indices
+        print(f"First 10 elements of selected_experts_indices: {selected_experts_indices.flatten()[:10].tolist()}")
+
+
         # shape (bs*slen*top_k, dim)
         token_indices_experts_sorted = token_indices_experts_sorted.reshape(
             -1, 1
@@ -414,9 +432,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 * top_scores_experts_sorted.reshape(-1, 1)
             ).to(x.dtype)
 
+        print_tensor_stats("routed_input of GroupedExperts module: ", routed_input)
+
         # shape (bs*slen*top_k, dim)
         routed_output = self.experts(routed_input, num_tokens_per_expert)
 
+        print_tensor_stats("routed_output of GroupedExperts module: ", routed_output)
+
         if not self.score_before_experts:
             routed_output = (
                 routed_output.to(torch.float32)
@@ -426,13 +448,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # shared expert
         if self.shared_experts is not None:
             out = self.shared_experts(x)
+            print_tensor_stats("out of Shared Experts module: ", out)
         else:
             out = torch.zeros_like(x)
 
         out = out.scatter_add(
             dim=0, index=token_indices_experts_sorted, src=routed_output
         )
+
+
         out = out.reshape(bs, slen, dim)
+        print_tensor_stats("out of MoE module: ", out)
         return out
 
     def init_weights(
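Since print_tensor_stats is defined at module level in torchtitan/models/moe.py, it can also be exercised on its own; a small sketch (the tensor values are arbitrary):

    import torch

    from torchtitan.models.moe import print_tensor_stats  # helper added in this commit

    x = torch.randn(2, 8, 16)  # dummy (bs, slen, dim)-shaped activation
    print_tensor_stats("dummy activation: ", x)
    # Prints shape, mean, min, max, std, and the first 10 flattened values,
    # in the same format as the traces emitted from MoE.forward.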
