Added CLI for Response Quality

UtkarshTheDev · UtkarshTheDev · commit 06ca79dd881a · 2025-05-01T17:59:43.000+05:30
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,39 @@
 
 All notable changes to LocalLab will be documented in this file.
 
+## [0.5.8] - 2024-05-02
+
+### Added
+
+- Added optional Response Quality Settings section to the CLI configuration
+- Added detailed parameter descriptions for all response quality settings
+- Increased default max_length from 4096 to 8192 tokens for more complete responses
+- Increased default top_k from 50 to 80 for better quality responses
+- Added max_time parameter (default: 120 seconds) to control generation time
+- Improved token-level streaming with larger token batches (4 tokens at a time)
+- Enhanced stop sequence detection to only check for definitive end markers
+- Improved repetition detection to only stop for extreme repetition
+- Added better error recovery for out-of-memory situations
+
+### Changed
+
+- Made Response Quality Settings section optional in CLI (default: skip)
+- Updated client timeouts from 180 to 300 seconds (5 minutes) for more complete responses
+- Increased client default max_length from 1024 to 8192 tokens to match server's default
+- Increased repetition_penalty from 1.1 to 1.15 for better quality
+- Updated all API routes to include top_k and repetition_penalty parameters
+- Enhanced memory management to prevent OOM errors
+- Improved error handling in streaming responses
+
+### Client Package Changes (v1.0.7)
+
+- Increased default timeouts for all operations
+- Added repetition_penalty parameter to all generation methods
+- Improved error handling and recovery in streaming
+- Added better buffering for token-level streaming
+- Increased retry counts for better reliability
+- Added top_k parameter to all generation methods
+
 ## [0.5.7] - 2024-05-01
 
 ### Improved
diff --git a/client/python_client/locallab_client/__init__.py b/client/python_client/locallab_client/__init__.py
@@ -20,7 +20,7 @@
 )
 from .sync_client import SyncLocalLabClient
 
-__version__ = "1.0.6"
+__version__ = "1.0.7"
 __author__ = "Utkarsh"
 __email__ = "utkarshweb2023@gmail.com"
 
diff --git a/client/python_client/locallab_client/client.py b/client/python_client/locallab_client/client.py
@@ -301,7 +301,7 @@ async def stream_generate(
 
         # Use a higher max_length by default to ensure complete responses
         if max_length is None:
-            max_length = 4096  # Default to 4096 tokens for more complete responses
+            max_length = 8192  # Default to 8192 tokens to match server's default
 
         payload = {
             "prompt": prompt,
@@ -311,7 +311,7 @@ async def stream_generate(
             "temperature": temperature,
             "top_p": top_p,
             # Add repetition_penalty for better quality
-            "repetition_penalty": 1.1
+            "repetition_penalty": 1.15
         }
 
         # Create a timeout for this specific request
diff --git a/client/python_client/locallab_client/sync_client.py b/client/python_client/locallab_client/sync_client.py
@@ -427,7 +427,7 @@ def batch_generate(
         Args:
             prompts: List of prompts to generate text from
             model_id: Optional model ID to use
-            max_length: Maximum length of the generated text (defaults to 1024 if None)
+            max_length: Maximum length of the generated text (defaults to 8192 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
             repetition_penalty: Penalty for repetition (higher values = less repetition)
@@ -437,7 +437,7 @@ def batch_generate(
         """
         # Use a higher max_length by default to ensure complete responses
         if max_length is None:
-            max_length = 4096  # Default to 4096 tokens for more complete responses
+            max_length = 8192  # Default to 8192 tokens to match server's default
 
         return self._run_coroutine(
             self._async_client.batch_generate(
diff --git a/client/python_client/setup.py b/client/python_client/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="locallab-client",
-    version="1.0.6",
+    version="1.0.7",
     author="Utkarsh",
     author_email="utkarshweb2023@gmail.com",
     description="Python client for LocalLab - A local LLM server",
diff --git a/locallab/__init__.py b/locallab/__init__.py
@@ -2,7 +2,7 @@
 LocalLab - A lightweight AI inference server for running LLMs locally
 """
 
-__version__ = "0.5.5"  # Updated to match setup.py
+__version__ = "0.5.8"  # Updated to match setup.py
 
 # Only import what's necessary initially, lazy-load the rest
 from .logger import get_logger
diff --git a/locallab/cli/interactive.py b/locallab/cli/interactive.py
@@ -107,37 +107,72 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
     click.echo("\n⚡ Model Optimization Settings")
     click.echo("─────────────────────────────")
 
-    config["enable_quantization"] = click.confirm(
-        "Enable model quantization?",
-        default=config.get("enable_quantization", ENABLE_QUANTIZATION)
+    # Show current values for reference
+    click.echo("\nCurrent optimization settings:")
+    click.echo(f"  Quantization: {'Enabled' if config.get('enable_quantization', ENABLE_QUANTIZATION) else 'Disabled'}")
+    if config.get('enable_quantization', ENABLE_QUANTIZATION):
+        click.echo(f"  Quantization Type: {config.get('quantization_type', QUANTIZATION_TYPE)}")
+    click.echo(f"  CPU Offloading: {'Enabled' if config.get('enable_cpu_offloading', ENABLE_CPU_OFFLOADING) else 'Disabled'}")
+    click.echo(f"  Attention Slicing: {'Enabled' if config.get('enable_attention_slicing', ENABLE_ATTENTION_SLICING) else 'Disabled'}")
+    click.echo(f"  Flash Attention: {'Enabled' if config.get('enable_flash_attention', ENABLE_FLASH_ATTENTION) else 'Disabled'}")
+    click.echo(f"  Better Transformer: {'Enabled' if config.get('enable_bettertransformer', ENABLE_BETTERTRANSFORMER) else 'Disabled'}")
+
+    # Ask if user wants to configure optimization settings
+    configure_optimization = click.confirm(
+        "\nWould you like to configure model optimization settings?",
+        default=True  # Default to Yes for optimization settings
     )
 
-    if config["enable_quantization"]:
-        config["quantization_type"] = click.prompt(
-            "Quantization type (fp16/int8/int4)",
-            default=config.get("quantization_type", QUANTIZATION_TYPE),
-            type=click.Choice(["fp16", "int8", "int4"])
+    if configure_optimization:
+        config["enable_quantization"] = click.confirm(
+            "Enable model quantization?",
+            default=config.get("enable_quantization", ENABLE_QUANTIZATION)
         )
 
-    config["enable_cpu_offloading"] = click.confirm(
-        "Enable CPU offloading?",
-        default=config.get("enable_cpu_offloading", ENABLE_CPU_OFFLOADING)
-    )
+        if config["enable_quantization"]:
+            config["quantization_type"] = click.prompt(
+                "Quantization type (fp16/int8/int4)",
+                default=config.get("quantization_type", QUANTIZATION_TYPE),
+                type=click.Choice(["fp16", "int8", "int4"])
+            )
 
-    config["enable_attention_slicing"] = click.confirm(
-        "Enable attention slicing?",
-        default=config.get("enable_attention_slicing", ENABLE_ATTENTION_SLICING)
-    )
+        config["enable_cpu_offloading"] = click.confirm(
+            "Enable CPU offloading?",
+            default=config.get("enable_cpu_offloading", ENABLE_CPU_OFFLOADING)
+        )
 
-    config["enable_flash_attention"] = click.confirm(
-        "Enable flash attention?",
-        default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
-    )
+        config["enable_attention_slicing"] = click.confirm(
+            "Enable attention slicing?",
+            default=config.get("enable_attention_slicing", ENABLE_ATTENTION_SLICING)
+        )
 
-    config["enable_better_transformer"] = click.confirm(
-        "Enable better transformer?",
-        default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
-    )
+        config["enable_flash_attention"] = click.confirm(
+            "Enable flash attention?",
+            default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
+        )
+
+        config["enable_better_transformer"] = click.confirm(
+            "Enable better transformer?",
+            default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
+        )
+
+        click.echo("\n✅ Optimization settings updated!")
+    else:
+        # If user doesn't want to configure, use the current values or defaults
+        if 'enable_quantization' not in config:
+            config["enable_quantization"] = ENABLE_QUANTIZATION
+        if config["enable_quantization"] and 'quantization_type' not in config:
+            config["quantization_type"] = QUANTIZATION_TYPE
+        if 'enable_cpu_offloading' not in config:
+            config["enable_cpu_offloading"] = ENABLE_CPU_OFFLOADING
+        if 'enable_attention_slicing' not in config:
+            config["enable_attention_slicing"] = ENABLE_ATTENTION_SLICING
+        if 'enable_flash_attention' not in config:
+            config["enable_flash_attention"] = ENABLE_FLASH_ATTENTION
+        if 'enable_bettertransformer' not in config:
+            config["enable_bettertransformer"] = ENABLE_BETTERTRANSFORMER
+
+        click.echo("\nUsing current optimization settings.")
 
     # Advanced Settings
     # ----------------
@@ -150,40 +185,89 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
         type=int
     )
 
-    # Generation Parameters
-    # -------------------
-    click.echo("\n🔄 Generation Parameters")
-    click.echo("─────────────────────")
-
-    config["max_length"] = click.prompt(
-        "Maximum generation length (tokens)",
-        default=config.get("max_length", 8192),
-        type=int
+    # Response Quality Settings
+    # -----------------------
+    click.echo("\n🎯 Response Quality Settings")
+    click.echo("───────────────────────────")
+
+    # Show current values for reference with descriptions
+    click.echo("\nCurrent response quality settings:")
+    click.echo(f"  Max Length: {config.get('max_length', 8192)} tokens - Maximum number of tokens in the generated response")
+    click.echo(f"  Temperature: {config.get('temperature', 0.7)} - Controls randomness (higher = more creative, lower = more focused)")
+    click.echo(f"  Top-p: {config.get('top_p', 0.9)} - Nucleus sampling parameter (higher = more diverse responses)")
+    click.echo(f"  Top-k: {config.get('top_k', 80)} - Limits vocabulary to top K tokens (higher = more diverse vocabulary)")
+    click.echo(f"  Repetition Penalty: {config.get('repetition_penalty', 1.15)} - Penalizes repetition (higher = less repetition)")
+    click.echo(f"  Max Time: {config.get('max_time', 120.0)} seconds - Maximum time allowed for generation")
+
+    # Ask if user wants to configure response quality settings
+    configure_response_quality = click.confirm(
+        "\nWould you like to configure response quality settings?",
+        default=False  # Default to No
     )
 
-    config["temperature"] = click.prompt(
-        "Temperature (0.1-1.0)",
-        default=config.get("temperature", 0.7),
-        type=float
-    )
+    if configure_response_quality:
+        # If user wants to configure, show the prompts with descriptions
+        config["max_length"] = click.prompt(
+            "Maximum generation length in tokens (higher = longer responses, but slower)",
+            default=config.get("max_length", 8192),
+            type=int
+        )
 
-    config["top_p"] = click.prompt(
-        "Top-p (0.1-1.0)",
-        default=config.get("top_p", 0.9),
-        type=float
-    )
+        config["temperature"] = click.prompt(
+            "Temperature (0.1-1.0, higher = more creative, lower = more focused)",
+            default=config.get("temperature", 0.7),
+            type=float
+        )
 
-    config["top_k"] = click.prompt(
-        "Top-k (1-100)",
-        default=config.get("top_k", 80),
-        type=int
-    )
+        config["top_p"] = click.prompt(
+            "Top-p (0.1-1.0, higher = more diverse responses)",
+            default=config.get("top_p", 0.9),
+            type=float
+        )
 
-    config["repetition_penalty"] = click.prompt(
-        "Repetition penalty (1.0-2.0)",
-        default=config.get("repetition_penalty", 1.15),
-        type=float
-    )
+        config["top_k"] = click.prompt(
+            "Top-k (1-100, higher = more diverse vocabulary)",
+            default=config.get("top_k", 80),
+            type=int
+        )
+
+        config["repetition_penalty"] = click.prompt(
+            "Repetition penalty (1.0-2.0, higher = less repetition)",
+            default=config.get("repetition_penalty", 1.15),
+            type=float
+        )
+
+        config["max_time"] = click.prompt(
+            "Maximum generation time in seconds (higher = more complete responses, but slower)",
+            default=config.get("max_time", 120.0),
+            type=float
+        )
+
+        click.echo("\n✅ Response quality settings updated!")
+    else:
+        # If user doesn't want to configure, use the current values or defaults
+        if 'max_length' not in config:
+            config["max_length"] = 8192
+        if 'temperature' not in config:
+            config["temperature"] = 0.7
+        if 'top_p' not in config:
+            config["top_p"] = 0.9
+        if 'top_k' not in config:
+            config["top_k"] = 80
+        if 'repetition_penalty' not in config:
+            config["repetition_penalty"] = 1.15
+        if 'max_time' not in config:
+            config["max_time"] = 120.0
+
+        click.echo("\nUsing default response quality settings.")
+
+    # Set environment variables for these settings
+    os.environ["DEFAULT_MAX_LENGTH"] = str(config["max_length"])
+    os.environ["DEFAULT_TEMPERATURE"] = str(config["temperature"])
+    os.environ["DEFAULT_TOP_P"] = str(config["top_p"])
+    os.environ["DEFAULT_TOP_K"] = str(config["top_k"])
+    os.environ["DEFAULT_REPETITION_PENALTY"] = str(config["repetition_penalty"])
+    os.environ["DEFAULT_MAX_TIME"] = str(config["max_time"])
 
     # Cache Settings
     # -------------
diff --git a/setup.py b/setup.py
@@ -47,7 +47,7 @@
 
 setup(
     name="locallab",
-    version="0.5.5",
+    version="0.5.8",
     packages=find_packages(include=["locallab", "locallab.*"]),
     install_requires=install_requires,
     extras_require={

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@`
`20`	`20`	`)`
`21`	`21`	`from .sync_client import SyncLocalLabClient`
`22`	`22`
`23`		`-__version__ = "1.0.6"`
	`23`	`+__version__ = "1.0.7"`
`24`	`24`	`__author__ = "Utkarsh"`
`25`	`25`	`__email__ = "utkarshweb2023@gmail.com"`
`26`	`26`