Commit b8c5e38

Revert "fix: build_config in TorchLlmArgs and avoid invalid args" (NVIDIA#4949)
Signed-off-by: QI JUN <[email protected]>
1 parent d5a8079 commit b8c5e38

15 files changed: +221, -420 lines
tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 1 addition & 1 deletion
@@ -391,7 +391,7 @@ def create_py_executor_instance(
             "Guided decoding is not supported with overlap scheduler.")

     logger.info(
-        f"max_seq_len={executor_config.max_seq_len}, max_num_requests={executor_config.max_batch_size}, max_num_tokens={executor_config.max_num_tokens}, max_batch_size={executor_config.max_batch_size}"
+        f"max_seq_len={executor_config.max_seq_len}, max_num_requests={executor_config.max_batch_size}, max_num_tokens={executor_config.max_num_tokens}"
     )

     for key, value in pytorch_backend_config.extra_resource_managers.items():
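
Note: after this revert, the startup log reports the batch size once (as max_num_requests) rather than twice. A minimal runnable sketch of the restored call, using a hypothetical stand-in for executor_config:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("pyexecutor")

    class _Cfg:  # hypothetical stand-in for the real executor config
        max_seq_len, max_batch_size, max_num_tokens = 4096, 64, 8192

    executor_config = _Cfg()
    logger.info(
        f"max_seq_len={executor_config.max_seq_len}, "
        f"max_num_requests={executor_config.max_batch_size}, "
        f"max_num_tokens={executor_config.max_num_tokens}")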

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 0 additions & 4 deletions
@@ -360,10 +360,6 @@ def throughput_command(
         kwargs["enable_iter_perf_stats"] = True

     if runtime_config.backend == 'pytorch':
-        if kwargs.pop("extended_runtime_perf_knob_config", None):
-            logger.warning(
-                "Ignore extended_runtime_perf_knob_config for pytorch backend."
-            )
         llm = PyTorchLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
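
With the guard removed, the benchmark no longer strips extended_runtime_perf_knob_config before constructing the PyTorch LLM. A simplified sketch of the two behaviors (plain dicts stand in for the real constructors, which is an assumption for illustration):

    kwargs = {"model": "llama", "extended_runtime_perf_knob_config": object()}

    # Old (now reverted) behavior: pop the unsupported key and warn.
    stripped = dict(kwargs)
    if stripped.pop("extended_runtime_perf_knob_config", None):
        print("warning: ignored extended_runtime_perf_knob_config for pytorch backend")

    # Restored behavior: kwargs are forwarded as-is, so the callee
    # must tolerate (or reject) the extra key itself.
    forwarded = dict(kwargs)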

tensorrt_llm/builder.py

Lines changed: 0 additions & 3 deletions
@@ -49,9 +49,6 @@ def default(self, obj):
         if isinstance(obj, KVCacheType):
             # For KVCacheType, convert it to string by split of 'KVCacheType.PAGED'.
             return obj.__str__().split('.')[-1]
-        elif hasattr(obj, 'model_dump'):
-            # Handle Pydantic models (including DecodingBaseConfig and subclasses)
-            return obj.model_dump(mode='json')
         else:
             return super().default(obj)

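
Removing the model_dump branch means the encoder falls through to json.JSONEncoder.default, which raises TypeError for any Pydantic model. A minimal runnable sketch of the pattern (ConfigEncoder is a made-up name, not the class in builder.py):

    import json

    class ConfigEncoder(json.JSONEncoder):
        def default(self, obj):
            if hasattr(obj, 'model_dump'):  # the branch this commit removes
                return obj.model_dump(mode='json')
            return super().default(obj)     # post-revert: TypeError for models

    # With the branch, json.dumps(cfg, cls=ConfigEncoder) serializes a
    # pydantic.BaseModel `cfg`; without it, the same call raises TypeError.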

tensorrt_llm/commands/serve.py

Lines changed: 3 additions & 5 deletions
@@ -121,9 +121,9 @@ def get_llm_args(model: str,
         "max_seq_len": max_seq_len,
         "kv_cache_config": kv_cache_config,
         "backend": backend if backend == "pytorch" else None,
-        "num_postprocess_workers": num_postprocess_workers,
-        "postprocess_tokenizer_dir": tokenizer or model,
-        "reasoning_parser": reasoning_parser,
+        "_num_postprocess_workers": num_postprocess_workers,
+        "_postprocess_tokenizer_dir": tokenizer or model,
+        "_reasoning_parser": reasoning_parser,
     }

     return llm_args, llm_args_extra_dict
@@ -418,8 +418,6 @@ def disaggregated_mpi_worker(config_file: Optional[str], log_level: str):
     llm_args = update_llm_args_with_extra_dict(llm_args,
                                                llm_args_extra_dict)

-    # Ignore the non-LLM args
-    llm_args.pop("router", None)
     _launch_disaggregated_server(config_file, llm_args)
     return
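
Two effects worth noting: the postprocess-related keys revert to their underscore-prefixed (private) spellings, and the MPI worker stops dropping the non-LLM "router" key before launching the disaggregated server. A simplified sketch of the second change (the dict values here are invented for illustration):

    llm_args = {"max_seq_len": 4096, "router": {"type": "round_robin"}}

    # Old (now reverted) behavior: strip keys the LLM args don't understand.
    llm_args.pop("router", None)

    # Restored behavior: "router" stays in llm_args, so downstream
    # consumers are expected to accept or ignore it themselves.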

tensorrt_llm/executor/serialization.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 BASE_ZMQ_CLASSES = {
     "builtins": [
         "Exception", "ValueError", "NotImplementedError", "AttributeError",
-        "AssertionError", "RuntimeError"
+        "AssertionError"
     ],  # each Exception Error class needs to be added explicitly
     "collections": ["OrderedDict"],
     "datetime": ["timedelta"],

tensorrt_llm/llmapi/llm.py

Lines changed: 18 additions & 28 deletions
@@ -124,16 +124,6 @@ def __init__(self,
         else:
             llm_args_cls = TrtLlmArgs

-        # check the kwargs and raise ValueError directly
-        valid_keys = set(
-            list(llm_args_cls.model_fields.keys()) +
-            ['_mpi_session', 'backend'])
-        for key in kwargs:
-            if key not in valid_keys:
-                raise ValueError(
-                    f"{self.__class__.__name__} got invalid argument: {key}"
-                )
-
         self.args = llm_args_cls.from_kwargs(
             model=model,
             tokenizer=tokenizer,
@@ -596,7 +586,7 @@ def _build_model(self):
         max_num_tokens = max_num_tokens or build_config.max_num_tokens
         max_seq_len = max_seq_len or build_config.max_seq_len

-        self._executor_config = tllm.ExecutorConfig(
+        executor_config = tllm.ExecutorConfig(
             max_beam_width=self.args.max_beam_width,
             scheduler_config=PybindMirror.maybe_to_pybind(
                 self.args.scheduler_config),
@@ -608,20 +598,20 @@ def _build_model(self):
         if self.args.backend is None:
             # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens
             if max_seq_len is not None:
-                self._executor_config.max_seq_len = max_seq_len
+                executor_config.max_seq_len = max_seq_len
             else:
                 engine_config = EngineConfig.from_json_file(self._engine_dir /
                                                             "config.json")
-                self._executor_config.max_seq_len = engine_config.build_config.max_seq_len
+                executor_config.max_seq_len = engine_config.build_config.max_seq_len
         if self.args.kv_cache_config is not None:
-            self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
+            executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.kv_cache_config)
         if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
             # Disable KV cache reuse for deterministic mode
-            self._executor_config.kv_cache_config.enable_block_reuse = False
-            self._executor_config.kv_cache_config.enable_partial_reuse = False
+            executor_config.kv_cache_config.enable_block_reuse = False
+            executor_config.kv_cache_config.enable_partial_reuse = False
         if self.args.peft_cache_config is not None:
-            self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
+            executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.peft_cache_config)
         elif self._on_trt_backend and self.args.build_config.plugin_config.lora_plugin:
             engine_config = EngineConfig.from_json_file(self._engine_dir /
@@ -630,16 +620,16 @@ def _build_model(self):
             max_lora_rank = lora_config.max_lora_rank
             num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \
                 len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)
-            self._executor_config.peft_cache_config = tllm.PeftCacheConfig(
+            executor_config.peft_cache_config = tllm.PeftCacheConfig(
                 num_device_module_layer=max_lora_rank * num_lora_modules *
                 self.args.max_loras,
                 num_host_module_layer=max_lora_rank * num_lora_modules *
                 self.args.max_cpu_loras,
             )
         if self.args.decoding_config is not None:
-            self._executor_config.decoding_config = self.args.decoding_config
+            executor_config.decoding_config = self.args.decoding_config
         if self.args.guided_decoding_backend == 'xgrammar':
-            self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
+            executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
                 backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
                 XGRAMMAR,
                 **_xgrammar_tokenizer_info(self.tokenizer))
@@ -648,18 +638,18 @@ def _build_model(self):
                 f"Unrecognized guided decoding backend {self.args.guided_decoding_backend}"
             )

-        self._executor_config.normalize_log_probs = self.args.normalize_log_probs
-        self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill
-        self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width
+        executor_config.normalize_log_probs = self.args.normalize_log_probs
+        executor_config.enable_chunked_context = self.args.enable_chunked_prefill
+        executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width
         if self._on_trt_backend and self.args.extended_runtime_perf_knob_config is not None:
-            self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind(
+            executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind(
                 self.args.extended_runtime_perf_knob_config)
         if self.args.cache_transceiver_config is not None:
-            self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
+            executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
                 self.args.cache_transceiver_config)
         from tensorrt_llm._torch.pyexecutor.config import update_executor_config
         update_executor_config(
-            self._executor_config,
+            executor_config,
             backend=self.args.backend,
             pytorch_backend_config=self.args.get_pytorch_backend_config()
             if self.args.backend in ["pytorch", "_autodeploy"] else None,
@@ -671,14 +661,14 @@ def _build_model(self):
             trt_engine_dir=self._engine_dir,
             max_input_len=self.args.max_input_len,
             max_seq_len=max_seq_len)
-        self._executor_config.llm_parallel_config = self.args.parallel_config
+        executor_config.llm_parallel_config = self.args.parallel_config
         return_logits = self.args.gather_generation_logits or (
             self._on_trt_backend and self.args.build_config
             and self.args.build_config.gather_context_logits)

         self._executor = self._executor_cls.create(
             self._engine_dir,
-            executor_config=self._executor_config,
+            executor_config=executor_config,
             batched_logits_processor=self.args.batched_logits_processor,
             model_world_size=self.args.parallel_config.world_size,
             mpi_session=self.mpi_session,
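
The llm.py revert has two parts: it drops the eager keyword validation in LLM.__init__ (unknown kwargs now flow through to llm_args_cls.from_kwargs unchecked), and it turns self._executor_config back into a local executor_config, so the config is no longer retained on the LLM instance after _build_model. A runnable sketch of the removed validation (TrtLlmArgsStub and check_kwargs are invented stand-ins; the real args classes expose Pydantic's model_fields the same way):

    from pydantic import BaseModel

    class TrtLlmArgsStub(BaseModel):  # hypothetical stand-in for TrtLlmArgs
        model: str
        max_beam_width: int = 1

    def check_kwargs(llm_args_cls, kwargs):
        valid_keys = set(llm_args_cls.model_fields) | {"_mpi_session", "backend"}
        for key in kwargs:
            if key not in valid_keys:
                raise ValueError(f"LLM got invalid argument: {key}")

    # Raises ValueError for the typo'd key; after the revert no such
    # check runs, so the mistake surfaces later (or not at all).
    check_kwargs(TrtLlmArgsStub, {"model": "x", "max_beem_width": 2})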
