
Commit 4f975b7

Author: Zhenhuan Chen

fix llama's tp acc problem without breaking meta device loading (#4699)

1 parent 81a0f96 · commit 4f975b7

File tree

1 file changed: +7 −1 lines changed


examples/gpu/llm/inference/run_generation_with_deepspeed.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -210,7 +210,7 @@ def print_mem_usage(msg):
 if args.benchmark:
     print_mem_usage("pre-from-pretrained")
 
-is_meta_support = model_type not in ["auto"] and not args.disable_optimize_transformers
+is_meta_support = model_type not in ["auto"]
 
 # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load
 with deepspeed.OnDevice(dtype=load_dtype, device="meta", enabled=is_meta_support):
```
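The deleted condition had turned off meta-device construction whenever `--disable_optimize_transformers` was set; the new code keeps meta loading on for all non-"auto" model types and instead repairs the affected buffer afterwards (second hunk below). As a minimal sketch of why that repair is needed, assuming only standard PyTorch meta-device semantics (this snippet is illustrative and not part of the commit):

```python
import torch

# Tensors built on the meta device carry shape/dtype metadata but no
# backing storage, so a registered buffer such as a causal mask holds
# no real values after meta-device construction.
mask = torch.empty(4, 4, device="meta")
print(mask.is_meta)   # True
print(mask.shape)     # torch.Size([4, 4])

# Materializing on a real device allocates *uninitialized* memory;
# the actual contents must be recomputed, which is what the fix does
# for llama's causal_mask after the DeepSpeed checkpoint load.
real = torch.empty_like(mask, device="cpu")
```

DeepSpeed's checkpoint load fills in the model's parameters, but a buffer like `causal_mask` is not part of the checkpoint and can come back empty, hence the explicit reinitialization below.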
```diff
@@ -287,6 +287,12 @@ def write_checkpoints_json():
 if isinstance(model, deepspeed.InferenceEngine):
     model = model.module
 
+# reinitialize some buffers that are left empty by meta device loading
+if args.disable_optimize_transformers:
+    if model_type == "llama" and isinstance(model, LlamaForCausalLM):
+        if hasattr(model.model, "causal_mask"):
+            model.model.causal_mask = torch.triu(torch.ones_like(model.model.causal_mask), diagonal=1)
+
 if args.num_beams is None:
     args.num_beams = 1 if args.greedy else 4
```
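What the added rebuild computes, shown as a standalone sketch (the 4×4 size is an arbitrary stand-in, not the buffer's real shape): `torch.triu(torch.ones_like(m), diagonal=1)` produces a strictly upper-triangular mask, with ones marking the future positions that causal attention must block.

```python
import torch

# Stand-in for a causal_mask buffer left empty by meta-device loading.
m = torch.zeros(4, 4)

# Ones strictly above the main diagonal (diagonal=1), zeros elsewhere.
rebuilt = torch.triu(torch.ones_like(m), diagonal=1)
print(rebuilt)
# tensor([[0., 1., 1., 1.],
#         [0., 0., 1., 1.],
#         [0., 0., 0., 1.],
#         [0., 0., 0., 0.]])
```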
