From 28650cc45a7b06c99823f9bcfd4a70d488859270 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Tue, 8 Apr 2025 14:16:54 -0400
Subject: [PATCH] Refactor CLI options to use enable/disable flags and change
 the pipeline parallelism config default to False

---
 vec_inf/cli/_cli.py    | 26 +++++++++++++-------------
 vec_inf/cli/_config.py |  2 +-
 vec_inf/cli/_helper.py |  4 ++--
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
index 9bfead26..987a5580 100644
--- a/vec_inf/cli/_cli.py
+++ b/vec_inf/cli/_cli.py
@@ -40,14 +40,14 @@ def cli() -> None:
     help="GPU memory utilization, default to 0.9",
 )
 @click.option(
-    "--enable-prefix-caching",
-    is_flag=True,
-    help="Enables automatic prefix caching",
+    "--enable-prefix-caching/--disable-prefix-caching",
+    default=None,
+    help="Enable or disable automatic prefix caching",
 )
 @click.option(
-    "--enable-chunked-prefill",
-    is_flag=True,
-    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
+    "--enable-chunked-prefill/--disable-chunked-prefill",
+    default=None,
+    help="Enable or disable chunked prefill, enabled by default if max number of sequences > 32k",
 )
 @click.option(
     "--max-num-batched-tokens",
@@ -101,9 +101,9 @@ def cli() -> None:
     help="Path to parent directory containing model weights",
 )
 @click.option(
-    "--pipeline-parallelism",
-    is_flag=True,
-    help="Enable pipeline parallelism, enabled by default for supported models",
+    "--pipeline-parallelism/--no-pipeline-parallelism",
+    default=None,
+    help="Enable or disable pipeline parallelism, enabled by default for supported models",
 )
 @click.option(
     "--compilation-config",
@@ -111,14 +111,14 @@ def cli() -> None:
     help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
 )
 @click.option(
-    "--enforce-eager",
-    is_flag=True,
-    help="Always use eager-mode PyTorch",
+    "--enforce-eager/--no-enforce-eager",
+    default=None,
+    help="If enabled, always use eager-mode PyTorch",
 )
 @click.option(
     "--json-mode",
     is_flag=True,
-    help="Output in JSON string",
+    help="If enabled, output will be in JSON string format instead of a table",
 )
 def launch(
     model_name: str,
diff --git a/vec_inf/cli/_config.py b/vec_inf/cli/_config.py
index 515a6629..18cd9c80 100644
--- a/vec_inf/cli/_config.py
+++ b/vec_inf/cli/_config.py
@@ -57,7 +57,7 @@ class ModelConfig(BaseModel):
         default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
     )
     pipeline_parallelism: bool = Field(
-        default=True, description="Enable pipeline parallelism"
+        default=False, description="Enable pipeline parallelism"
     )
     enforce_eager: bool = Field(default=False, description="Force eager mode execution")
     qos: Union[QOS, str] = Field(default="m2", description="Quality of Service tier")
diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py
index bd520ac1..da912f25 100644
--- a/vec_inf/cli/_helper.py
+++ b/vec_inf/cli/_helper.py
@@ -98,8 +98,8 @@ def _get_launch_params(self) -> dict[str, Any]:
 
         # Process boolean fields
         for bool_field in BOOLEAN_FIELDS:
-            if self.cli_kwargs[bool_field]:
-                params[bool_field] = True
+            if self.cli_kwargs.get(bool_field) is not None:
+                params[bool_field] = self.cli_kwargs[bool_field]
 
         # Merge other overrides
         for key, value in self.cli_kwargs.items():