VectorInstitute · fcogidi · Apr 8, 2025
diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
@@ -40,14 +40,14 @@ def cli() -> None:
     help="GPU memory utilization, default to 0.9",
 )
 @click.option(
-    "--enable-prefix-caching",
-    is_flag=True,
-    help="Enables automatic prefix caching",
+    "--enable-prefix-caching/--disable-prefix-caching",
+    default=None,
+    help="Enable or disable automatic prefix caching",
 )
 @click.option(
-    "--enable-chunked-prefill",
-    is_flag=True,
-    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
+    "--enable-chunked-prefill/--disable-chunked-prefill",
+    default=None,
+    help="Enable or disable chunked prefill, enabled by default if max number of sequences > 32k",
 )
 @click.option(
     "--max-num-batched-tokens",
@@ -101,24 +101,24 @@ def cli() -> None:
     help="Path to parent directory containing model weights",
 )
 @click.option(
-    "--pipeline-parallelism",
-    is_flag=True,
-    help="Enable pipeline parallelism, enabled by default for supported models",
+    "--pipeline-parallelism/--no-pipeline-parallelism",
+    default=None,
+    help="Enable or disable pipeline parallelism, enabled by default for supported models",
 )
 @click.option(
     "--compilation-config",
     type=click.Choice(["0", "3"]),
     help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
 )
 @click.option(
-    "--enforce-eager",
-    is_flag=True,
-    help="Always use eager-mode PyTorch",
+    "--enforce-eager/--no-enforce-eager",
+    default=None,
+    help="If enabled, always use eager-mode PyTorch",
 )
 @click.option(
     "--json-mode",
     is_flag=True,
-    help="Output in JSON string",
+    help="If enabled, output will be in JSON string format instead of a table",
 )
 def launch(
     model_name: str,

diff --git a/vec_inf/cli/_config.py b/vec_inf/cli/_config.py
@@ -57,7 +57,7 @@ class ModelConfig(BaseModel):
         default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
     )
     pipeline_parallelism: bool = Field(
-        default=True, description="Enable pipeline parallelism"
+        default=False, description="Enable pipeline parallelism"
     )
     enforce_eager: bool = Field(default=False, description="Force eager mode execution")
     qos: Union[QOS, str] = Field(default="m2", description="Quality of Service tier")

diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py
@@ -98,8 +98,8 @@ def _get_launch_params(self) -> dict[str, Any]:
 
         # Process boolean fields
         for bool_field in BOOLEAN_FIELDS:
-            if self.cli_kwargs[bool_field]:
-                params[bool_field] = True
+            if self.cli_kwargs.get(bool_field) is not None:
+                params[bool_field] = self.cli_kwargs[bool_field]
 
         # Merge other overrides
         for key, value in self.cli_kwargs.items():