from lightllm.utils.envs_utils import get_env_start_args
from lightllm.models.deepseek2.model import Deepseek2TpPartModel
from lightllm.common.basemodel.microbatch_overlap_objs import DecodeMicroBatch
+from torch.profiler import profile, record_function, ProfilerActivity


def test_model_inference(args, model_class):
@@ -116,6 +117,16 @@ def decode(
    return logits


+# Helper: run fn under torch.profiler and write a trace viewable in TensorBoard.
+def torch_profile(fn, log_dir=None):
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        record_shapes=False,
+        on_trace_ready=torch.profiler.tensorboard_trace_handler(log_dir),
+    ) as prof:
+        fn()
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+
def tppart_model_infer(args, model_class, model_kvargs, batch_size, input_len, output_len, ans_queue):
    args = get_env_start_args()
    import triton.profiler as proton
@@ -244,6 +255,28 @@ def tppart_model_infer(args, model_class, model_kvargs, batch_size, input_len, o
    if args.profile:
        proton.start(name="forward_prefill", context="python")

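+    # Optional one-shot torch.profiler capture of the prefill forward pass.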
+    if args.torch_profile:
+        print("Profile Prefill")
+        try:
+            torch_profile(
+                lambda: model_part.forward(
+                    batch_size,
+                    total_token_num,
+                    input_len,
+                    test_data,
+                    mem_indexes,
+                    b_req_idx,
+                    b_start_loc,
+                    b_seq_len,
+                    b_ready_cache_len=b_ready_cache_len,
+                    is_prefill=True,
+                ),
+                log_dir=f"./logs_decode_overlap/forward_prefill_{model_kvargs['rank_id']}",
+            )
+        except Exception as e:
+            print(str(e))
+            raise
+
    logics = model_part.forward(
        batch_size,
        total_token_num,
@@ -291,6 +324,21 @@ def tppart_model_infer(args, model_class, model_kvargs, batch_size, input_len, o
                b_seq_len,
                total_token_num,
            )
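+            # Profile the first overlapped decode step once.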
+            if i == 0 and args.torch_profile:
+                torch_profile(
+                    lambda: overlap_decode(
+                        model_part,
+                        batch_size,
+                        max_len_in_batch,
+                        torch.from_numpy(predict_ids).cuda().reshape(-1),
+                        mem_indexes,
+                        b_req_idx,
+                        b_start_loc,
+                        b_seq_len,
+                        total_token_num,
+                    ),
+                    log_dir=f"./logs_decode_overlap/forward_decode_{model_kvargs['rank_id']}",
+                )
        else:
            logits = decode(
                model_part,
@@ -303,6 +351,21 @@ def tppart_model_infer(args, model_class, model_kvargs, batch_size, input_len, o
                b_seq_len,
                total_token_num,
            )
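+            # Profile the first standard decode step once.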
+            if i == 0 and args.torch_profile:
+                torch_profile(
+                    lambda: decode(
+                        model_part,
+                        batch_size,
+                        max_len_in_batch,
+                        torch.from_numpy(predict_ids).cuda().reshape(-1),
+                        mem_indexes,
+                        b_req_idx,
+                        b_start_loc,
+                        b_seq_len,
+                        total_token_num,
+                    ),
+                    log_dir=f"./logs_decode_overlap/forward_decode_{model_kvargs['rank_id']}",
+                )

        prob_out = torch.softmax(logits, dim=-1)
        predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
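
For reference, a minimal standalone sketch of using the torch_profile helper added above, assuming a CUDA device; _matmul_work is a hypothetical stand-in for model_part.forward:

    import torch

    def _matmul_work():
        # Hypothetical workload standing in for a real forward pass.
        a = torch.randn(2048, 2048, device="cuda")
        b = torch.randn(2048, 2048, device="cuda")
        for _ in range(10):
            a = a @ b
        torch.cuda.synchronize()

    torch_profile(_matmul_work, log_dir="./logs_decode_overlap/smoke_test")

The written traces can then be opened in TensorBoard (with the torch-tb-profiler plugin installed): tensorboard --logdir ./logs_decode_overlap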