@@ -25,7 +25,6 @@
 from ..llmapi import BuildConfig, KvCacheConfig
 from ..llmapi.llm_utils import update_llm_args_with_extra_options
 from ..logger import logger, severity_map
-from ..mapping import CpType
 
 
 @click.group()
@@ -75,10 +74,6 @@
               type=int,
               default=1,
               help='Pipeline parallelism size.')
-@click.option("--cp_size",
-              type=int,
-              default=1,
-              help='Context parallelism size.')
 @click.option("--ep_size",
               type=int,
               default=None,
@@ -110,18 +105,14 @@
               is_flag=True,
               default=False,
               help="Flag for disabling KV cache reuse.")
-@click.option("--cp_config",
-              type=dict,
-              default=None,
-              help="Context parallelism configuration as JSON.")
 @click.pass_context
 def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
          backend: str, max_beam_width: int, max_batch_size: int,
          max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
          ep_size: Optional[int], gpus_per_node: Optional[int],
          kv_cache_free_gpu_memory_fraction: float, trust_remote_code: bool,
          revision: Optional[str], extra_llm_api_options: Optional[str],
-         disable_kv_cache_reuse: bool, cp_size: int, cp_config: Optional[dict]):
+         disable_kv_cache_reuse: bool):
     logger.set_level(log_level)
     build_config = BuildConfig(max_batch_size=max_batch_size,
                                max_num_tokens=max_num_tokens,
@@ -132,20 +123,11 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
         free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
         enable_block_reuse=not disable_kv_cache_reuse)
 
-    if cp_config is not None and "cp_type" in cp_config:
-        cp_config = cp_config.copy()
-        try:
-            cp_config["cp_type"] = CpType[cp_config["cp_type"].upper()]
-        except KeyError:
-            raise ValueError(f"Invalid cp_type: {cp_config['cp_type']}. " \
-                             f"Must be one of: {', '.join([t.name for t in CpType])}")
     llm_args = {
         "model": model,
         "tokenizer": tokenizer,
         "tensor_parallel_size": tp_size,
         "pipeline_parallel_size": pp_size,
-        "context_parallel_size": cp_size,
-        "cp_config": cp_config if cp_config is not None else {},
         "moe_expert_parallel_size": ep_size,
         "gpus_per_node": gpus_per_node,
         "trust_remote_code": trust_remote_code,