
Commit 18531f1

[TRTLLM-9688][feat] Refactor processed logprobs to skip logprob calculation when not needed

- Added LogprobMode class to define modes for log probabilities: RAW and PROCESSED.
- Updated SamplingParams and LlmRequest to utilize LogprobMode for the logprobs_mode parameter.
- Enhanced validation to check logprobs_mode against LogprobMode values.
- Modified TorchSampler and related classes to support the new logprobs_mode functionality.
- Modified TorchSampler to only calculate logprobs when a request needs it.
- Updated tests to cover the new logprobs_mode behavior and ensure correct processing of log probabilities.

Signed-off-by: Stefan Niebler <[email protected]>
1 parent 160f461 commit 18531f1
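The RAW and PROCESSED modes are only named, not defined, on this page. By analogy with logprobs modes in other inference engines, RAW presumably means log probabilities taken over the unmodified model logits, while PROCESSED means log probabilities taken after sampling transformations (e.g. temperature scaling) are applied. Below is a minimal sketch under that assumption; the enum and compute_logprobs are illustrative stand-ins, not the classes added by this commit:

```python
from enum import Enum

import torch
import torch.nn.functional as F


class LogprobMode(Enum):  # stand-in for the enum this commit adds to sampling_params
    RAW = "raw"
    PROCESSED = "processed"


def compute_logprobs(logits: torch.Tensor, temperature: float,
                     mode: LogprobMode) -> torch.Tensor:
    # Assumed semantics: RAW ignores sampling transforms, PROCESSED applies them.
    if mode is LogprobMode.RAW:
        return F.log_softmax(logits, dim=-1)
    return F.log_softmax(logits / temperature, dim=-1)
```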

File tree

7 files changed: +265, -81 lines

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 13 additions & 3 deletions
@@ -8,6 +8,7 @@
 from tensorrt_llm._torch.shared_tensor import SharedTensorContainer
 from tensorrt_llm.bindings import executor as tllm_executor
 from tensorrt_llm.executor.result import TokenLogprobs
+from tensorrt_llm.sampling_params import LogprobMode
 
 SamplingConfig = tensorrt_llm.bindings.SamplingConfig
 '''
@@ -460,7 +461,7 @@ def __init__(
             is_first_draft: bool = False,
             use_chunked_generation_logits: bool = True,
             logits_chunk_size: int = 8,
-            logprobs_mode: str = "raw",
+            logprobs_mode: LogprobMode | None = None,
             **kwargs):
 
         self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
@@ -539,7 +540,7 @@ def __init__(
         # currently, keep py_stop_words_list as python list, rather than tensor.
         self.py_stop_words_list = stop_words_list
 
-        self.py_logprobs_mode = logprobs_mode
+        self.py_logprobs_mode = LogprobMode.RAW if logprobs_mode is None else logprobs_mode
 
         self.py_result = PyResult(
             prompt_len=self.py_prompt_len,
@@ -568,6 +569,15 @@ def set_exclude_last_generation_logits(
         self.py_result.set_exclude_last_generation_logits(
             exclude_last_generation_logits)
 
+    def validate_logprobs_mode(self):
+        if self.py_logprobs_mode not in [
+                LogprobMode.RAW, LogprobMode.PROCESSED
+        ]:
+            raise ValueError(
+                f"Invalid logprobs_mode: {self.py_logprobs_mode} "
+                f"Expected one of {LogprobMode.RAW.value}, {LogprobMode.PROCESSED.value}"
+            )
+
     @property
     def cached_tokens(self) -> int:
         return self._cached_tokens
@@ -801,7 +811,7 @@ def executor_request_to_llm_request(
         py_multimodal_data=getattr(executor_request, "py_multimodal_data",
                                    None),
         kv_cache_retention_config=executor_request.kv_cache_retention_config,
-        logprobs_mode=getattr(executor_request, "py_logprobs_mode", "raw"),
+        logprobs_mode=getattr(executor_request, "py_logprobs_mode", None),
     )
     if child_req_ids:
         for child_id in child_req_ids:
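Taken together, the hunks above replace the old "raw" string default with None, resolve None to LogprobMode.RAW at construction time, and reject any value that is not a LogprobMode member. A self-contained toy sketch of that behavior follows; ToyRequest is a hypothetical stand-in for LlmRequest, and the enum mirrors the one imported from tensorrt_llm.sampling_params:

```python
from enum import Enum


class LogprobMode(Enum):  # mirrors the imported LogprobMode
    RAW = "raw"
    PROCESSED = "processed"


class ToyRequest:
    def __init__(self, logprobs_mode: LogprobMode | None = None):
        # None now means "use the RAW default" rather than the old "raw" string.
        self.py_logprobs_mode = (LogprobMode.RAW
                                 if logprobs_mode is None else logprobs_mode)

    def validate_logprobs_mode(self):
        if self.py_logprobs_mode not in [LogprobMode.RAW, LogprobMode.PROCESSED]:
            raise ValueError(f"Invalid logprobs_mode: {self.py_logprobs_mode}")


ToyRequest().validate_logprobs_mode()                       # default resolves to RAW
ToyRequest(LogprobMode.PROCESSED).validate_logprobs_mode()  # valid member: passes

bad = ToyRequest()
bad.py_logprobs_mode = "processed"  # plain strings no longer validate
try:
    bad.validate_logprobs_mode()
except ValueError as err:
    print(err)
```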

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 0 deletions
@@ -1718,6 +1718,8 @@ def _validate_request(self, request: LlmRequest):
                 f"Request beam width {sampling_config.beam_width} "
                 f"is not equal to max_beam_width {self.max_beam_width}. This is not supported!"
             )
+        # Validate logprobs mode
+        request.validate_logprobs_mode()
 
         # Check token ID ranges
         if isinstance(self.model_engine.model, DecoderModelForCausalLM):
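The TorchSampler changes mentioned in the commit message are not part of this page's diff. The sketch below, using assumed names, illustrates the gating the message describes: the log-softmax over the full vocabulary is skipped entirely when no request in the batch asked for log probabilities.

```python
import torch
import torch.nn.functional as F


def sample_with_optional_logprobs(logits: torch.Tensor,
                                  requests_need_logprobs: list[bool]):
    # Sampling always happens.
    probs = torch.softmax(logits, dim=-1)
    next_tokens = torch.multinomial(probs, num_samples=1)

    # Skip the potentially expensive log-softmax over the whole vocabulary
    # when no request in the batch requested log probabilities.
    logprobs = None
    if any(requests_need_logprobs):
        logprobs = F.log_softmax(logits, dim=-1)

    return next_tokens, logprobs
```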
