Skip to content

Commit e123d38

Browse files
committed
Merge branch 'deepep' of https://github.com/ModelTC/lightllm into deepep
2 parents a0bf0ff + 55d01f1 commit e123d38

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_first_token_constraint_mode.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def init_custom(self):
2828
logger.info(f"first_allowed_tokens : {self.first_allowed_tokens}")
2929
# check token_id < vocab_size
3030
assert all(e < self.model.vocab_size for e in self.first_allowed_tokens)
31+
self.fill_value = torch.tensor(-1000000.0)
3132
return
3233

3334
def decode(self):
@@ -92,5 +93,8 @@ def _mask_first_gen_token_logits(self, run_reqs: List[InferReq], logits: torch.T
9293
mask[i, :] = True
9394
mask[i, self.first_allowed_tokens] = False
9495
torch.cuda.current_stream().wait_stream(g_infer_context.get_overlap_stream())
95-
logits[mask] = -1000000.0
96+
# Cannot use `logits[mask] = -1000000.0` here
97+
# It triggers a strange multi-stream asynchronicity issue, possibly a torch bug
98+
new_logits = torch.where(mask, self.fill_value, logits)
99+
logits.copy_(new_logits)
96100
return

0 commit comments

Comments
 (0)