
Commit c5b34f1

Skip unnecessary sampling and fix the random offset (#4068)
* optimize multinomial sampling kernel
* remove
* add comments
* optimize
* remove sync
* recovery
* remove print
* fix
* optimize output pipeline
* skip unnecessary sampling
* add rand offsets
* add more comment
1 parent a9a24fb

File tree

4 files changed: +29 −3 lines changed

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 13 additions & 2 deletions

@@ -126,6 +126,17 @@ def _apply_custom_logits_processors(batched_logits_processors, all_ids, logits):
     return logits
 
 
+def _torch_topk(x: torch.Tensor, k: int, dim: int = -1, largest: bool = True, sorted: bool = True):
+    if k == 1:
+        # torch.topk would not fallback to torch.max/torch.min automatically
+        if largest:
+            return torch.max(x, dim=dim, keepdim=True)
+        else:
+            return torch.min(x, dim=dim, keepdim=True)
+    else:
+        return torch.topk(x, k, dim=dim, largest=largest, sorted=sorted)
+
+
 class FusedLogitsProcessor:
     """Custom logits processor."""
 
@@ -266,7 +277,7 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
         if max_topk <= 0:
             scores, indices = logits.sort(1, descending=True)
         else:
-            scores, indices = logits.topk(max_topk, dim=1)
+            scores, indices = _torch_topk(logits, max_topk, dim=1)
         result = __random_sampling(scores, indices)
 
         if self.guided_decoding_manager and self.guided_processors:
@@ -285,7 +296,7 @@ def compute_logprobs(self, raw_logprobs: torch.Tensor, token_ids: torch.LongTens
         logprobs = raw_logprobs.gather(-1, indices)
         num_logprobs = self.sampling_inputs.max_num_logprobs
         if num_logprobs > 0:
-            topk_logprobs, topk_indices = raw_logprobs.topk(num_logprobs, dim=-1)
+            topk_logprobs, topk_indices = _torch_topk(raw_logprobs, num_logprobs, dim=-1)
            logprobs = torch.cat([logprobs, topk_logprobs], dim=-1)
            indices = torch.cat([indices, topk_indices], dim=-1)

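The `_torch_topk` helper exists because, as its inline comment notes, `torch.topk` does not automatically fall back to the cheaper single-pass `torch.max`/`torch.min` reductions when k == 1. Since `torch.max(..., keepdim=True)` also returns a `(values, indices)` pair with the same shape as a k=1 top-k, the fast path is a drop-in replacement. A minimal sanity check (not part of the commit) illustrating the equivalence:

import torch

logits = torch.randn(4, 32000)  # ties are practically impossible with random floats

topk_vals, topk_idx = torch.topk(logits, k=1, dim=-1)        # general top-k kernel
max_vals, max_idx = torch.max(logits, dim=-1, keepdim=True)  # single-pass reduction

assert torch.equal(topk_vals, max_vals)
assert torch.equal(topk_idx, max_idx)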
lmdeploy/pytorch/strategies/ar/model_agent.py

Lines changed: 4 additions & 0 deletions

@@ -70,6 +70,10 @@ def slice_extra_inputs(self, extra_inputs: ARExtraInputs, seq_length: torch.Long
     def _step_sampling_inputs(self, sampling_inputs: SamplingInputs, next_token_ids: torch.Tensor):
         """step."""
         sampling_inputs.num_ignore_eos = sampling_inputs.num_ignore_eos - 1
+        if sampling_inputs.random_offsets is not None:
+            # random offset is used to generate random numbers for multinomial sampling
+            # so we need to increase it by 1 at each step
+            sampling_inputs.random_offsets += 1
 
         all_ids = sampling_inputs.all_ids
         if all_ids is not None:

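`random_offsets` gives each sequence a position in its random stream; advancing it by one per decoding step keeps multinomial sampling reproducible under a fixed seed while still drawing fresh numbers each step. If the offset never advanced, every step could replay the same draw, which is plausibly the bug the "fix the random offset" part of this commit addresses. A rough sketch of the seed-plus-offset idea, with a hypothetical helper rather than the engine's actual sampling kernel:

import torch

def sample_step(probs: torch.Tensor, seed: int, offset: int) -> torch.Tensor:
    # Hypothetical sketch: deriving the generator state from (seed, offset)
    # makes each step's draw deterministic but distinct, which is why the
    # engine bumps random_offsets by 1 after every step.
    gen = torch.Generator(device=probs.device)
    gen.manual_seed(seed + offset)
    return torch.multinomial(probs, num_samples=1, generator=gen)

probs = torch.softmax(torch.randn(2, 8), dim=-1)
step0 = sample_step(probs, seed=42, offset=0)
step1 = sample_step(probs, seed=42, offset=1)  # same seed, new offset -> new draw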
lmdeploy/pytorch/strategies/ar/sampling.py

Lines changed: 8 additions & 1 deletion

@@ -65,7 +65,7 @@ def __gather_params():
             param = seq.sampling_param
             temperature[idx] = param.temperature
             repetition_penalty[idx] = param.repetition_penalty
-            top_k[idx] = param.top_k
+            top_k[idx] = max(0, param.top_k)
             top_p[idx] = param.top_p
             min_p[idx] = param.min_p
             random_offsets[idx] = seq.num_valid_ids
@@ -129,6 +129,9 @@ def __get_bad_words(bad_words):
         repetition_penalty = torch.tensor(repetition_penalty)
 
         temperature = torch.tensor(temperature)
+        if (temperature == 1.0).all():
+            # skip temperature processing if all temperature are 1.0
+            temperature = None
 
         bad_words, bad_mask = __get_bad_words(bad_words)
         stop_words, stop_mask = __get_bad_words(stop_words)
@@ -144,6 +147,10 @@ def __get_bad_words(bad_words):
             random_offsets = None
         else:
             top_k = torch.tensor(top_k)
+            if (top_k == max_top_k).all():
+                # we would perform max_top_k before top_k
+                # if all top_k are same, we do not need to filter topk again
+                top_k = None
             top_p, min_top_p = __get_topp(top_p)
             min_p = __get_minp(min_p)
             random_seeds = torch.tensor(random_seeds)

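Both new branches replace a tensor with `None` as a sentinel meaning "this stage is a no-op": dividing logits by a temperature of 1.0 changes nothing, and re-filtering by per-sequence `top_k` is redundant once every sequence shares the `max_top_k` cutoff that is applied first. A downstream consumer can then skip the kernel entirely. An illustrative sketch of the skip pattern, with hypothetical names rather than the actual consumer code in logits_process.py:

from typing import Optional

import torch

def apply_temperature(logits: torch.Tensor, temperature: Optional[torch.Tensor]) -> torch.Tensor:
    # None signals "all temperatures were 1.0": the division would be an
    # identity, so we skip the elementwise kernel entirely.
    if temperature is None:
        return logits
    return logits / temperature[:, None]

logits = torch.randn(4, 128)
assert torch.equal(apply_temperature(logits, None), logits)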
lmdeploy/pytorch/strategies/dllm/model_agent.py

Lines changed: 4 additions & 0 deletions

@@ -169,6 +169,10 @@ def _step_sampling_inputs(self, sampling_inputs: SamplingInputs, next_token_ids:
         num_ignore_eos = sampling_inputs.num_ignore_eos.view(-1, dllm_block_size)
         num_ignore_eos = torch.where(is_unmasked, num_ignore_eos - dllm_block_size, num_ignore_eos)
         sampling_inputs.num_ignore_eos = num_ignore_eos.flatten()
+        if sampling_inputs.random_offsets is not None:
+            # random offset is used to generate random numbers for multinomial sampling
+            # so we need to increase it by 1 at each step
+            sampling_inputs.random_offsets += 1
         return sampling_inputs
 
     def make_stopping_criteria(self, seqs: SeqList) -> DLLMStoppingCriteria:

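In the dllm path a step operates on whole blocks of `dllm_block_size` tokens, so `num_ignore_eos` is reshaped per block and decremented only for blocks unmasked this step; the random-offset bump is the same fix as in the autoregressive agent. A toy illustration of the `torch.where` masked-decrement pattern above (made-up values, assuming a per-block flag that broadcasts across each row):

import torch

dllm_block_size = 4
num_ignore_eos = torch.tensor([8, 8, 8, 8, 3, 3, 3, 3]).view(-1, dllm_block_size)
is_unmasked = torch.tensor([[True], [False]])  # assumed shape: one flag per block

# Unmasked blocks consumed dllm_block_size tokens this step.
stepped = torch.where(is_unmasked, num_ignore_eos - dllm_block_size, num_ignore_eos)
print(stepped.flatten())  # tensor([4, 4, 4, 4, 3, 3, 3, 3])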