From 28650cc45a7b06c99823f9bcfd4a70d488859270 Mon Sep 17 00:00:00 2001 From: fcogidi <41602287+fcogidi@users.noreply.github.com> Date: Tue, 8 Apr 2025 14:16:54 -0400 Subject: [PATCH] Refactor CLI options to use enable/disable flags and update default values for pipeline parallelism --- vec_inf/cli/_cli.py | 26 +++++++++++++------------- vec_inf/cli/_config.py | 2 +- vec_inf/cli/_helper.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 9bfead26..987a5580 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -40,14 +40,14 @@ def cli() -> None: help="GPU memory utilization, default to 0.9", ) @click.option( - "--enable-prefix-caching", - is_flag=True, - help="Enables automatic prefix caching", + "--enable-prefix-caching/--disable-prefix-caching", + default=None, + help="Enable or disable automatic prefix caching", ) @click.option( - "--enable-chunked-prefill", - is_flag=True, - help="Enable chunked prefill, enabled by default if max number of sequences > 32k", + "--enable-chunked-prefill/--disable-chunked-prefill", + default=None, + help="Enable or disable chunked prefill, enabled by default if max number of sequences > 32k", ) @click.option( "--max-num-batched-tokens", @@ -101,9 +101,9 @@ def cli() -> None: help="Path to parent directory containing model weights", ) @click.option( - "--pipeline-parallelism", - is_flag=True, - help="Enable pipeline parallelism, enabled by default for supported models", + "--pipeline-parallelism/--no-pipeline-parallelism", + default=None, + help="Enable or disable pipeline parallelism, enabled by default for supported models", ) @click.option( "--compilation-config", @@ -111,14 +111,14 @@ def cli() -> None: help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied", ) @click.option( - "--enforce-eager", - is_flag=True, - help="Always use eager-mode PyTorch", + "--enforce-eager/--no-enforce-eager", + default=None, + help="If enabled, always use eager-mode PyTorch", ) @click.option( "--json-mode", is_flag=True, - help="Output in JSON string", + help="If enabled, output will be in JSON string format instead of a table", ) def launch( model_name: str, diff --git a/vec_inf/cli/_config.py b/vec_inf/cli/_config.py index 515a6629..18cd9c80 100644 --- a/vec_inf/cli/_config.py +++ b/vec_inf/cli/_config.py @@ -57,7 +57,7 @@ class ModelConfig(BaseModel): default=0.9, gt=0.0, le=1.0, description="GPU memory utilization" ) pipeline_parallelism: bool = Field( - default=True, description="Enable pipeline parallelism" + default=False, description="Enable pipeline parallelism" ) enforce_eager: bool = Field(default=False, description="Force eager mode execution") qos: Union[QOS, str] = Field(default="m2", description="Quality of Service tier") diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index bd520ac1..da912f25 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -98,8 +98,8 @@ def _get_launch_params(self) -> dict[str, Any]: # Process boolean fields for bool_field in BOOLEAN_FIELDS: - if self.cli_kwargs[bool_field]: - params[bool_field] = True + if self.cli_kwargs.get(bool_field) is not None: + params[bool_field] = self.cli_kwargs[bool_field] # Merge other overrides for key, value in self.cli_kwargs.items():