 from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, Field
 from typing import Dict, List, Any, Optional, Generator, Tuple
+import json

 from ..logger import get_logger
 from ..logger.logger import get_request_count
@@ -75,6 +76,36 @@ class BatchGenerationResponse(BaseModel):
     responses: List[str]


+def format_chat_messages(messages: List[ChatMessage]) -> str:
+    """
+    Format a list of chat messages into a prompt string that the model can understand
+
+    Args:
+        messages: List of ChatMessage objects with role and content
+
+    Returns:
+        Formatted prompt string
+    """
+    formatted_messages = []
+
+    for msg in messages:
+        role = msg.role.strip().lower()
+
+        if role == "system":
+            # System messages get special formatting
+            formatted_messages.append(f"# System Instruction\n{msg.content}\n")
+        elif role == "user":
+            formatted_messages.append(f"User: {msg.content}")
+        elif role == "assistant":
+            formatted_messages.append(f"Assistant: {msg.content}")
+        else:
+            # Default formatting for other roles
+            formatted_messages.append(f"{role.capitalize()}: {msg.content}")
+
+    # Join all messages with newlines
+    return "\n\n".join(formatted_messages)
+
+
 @router.post("/generate", response_model=GenerationResponse)
 async def generate_text(request: GenerationRequest) -> GenerationResponse:
     """
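For reference, a quick sketch of what the new format_chat_messages helper produces for a short conversation (the ChatMessage values below are made-up example inputs, not part of this change):

# Hypothetical usage of format_chat_messages; ChatMessage is the request model
# with `role` and `content` fields.
messages = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="What does this endpoint do?"),
]
prompt = format_chat_messages(messages)
print(prompt)
# # System Instruction
# You are a helpful assistant.
#
#
# User: What does this endpoint do?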
@@ -105,8 +136,8 @@ async def generate_text(request: GenerationRequest) -> GenerationResponse:
         # Merge model-specific params with request params
         generation_params.update(model_params)

-        # Generate text
-        generated_text = model_manager.generate_text(
+        # Generate text - properly await the async call
+        generated_text = await model_manager.generate_text(
             prompt=request.prompt,
             system_prompt=request.system_prompt,
             **generation_params
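The generation endpoints now await model_manager.generate_text, which assumes the manager exposes it as a coroutine. A minimal sketch of one way that can be done when the underlying generation call is blocking (this is an assumption about the manager's internals, not the project's actual implementation):

import asyncio
from typing import Optional

class ModelManager:
    def _generate_sync(self, prompt: str, system_prompt: Optional[str] = None, **params) -> str:
        # Hypothetical placeholder for the real blocking model call.
        return f"[generated text for {prompt!r}]"

    async def generate_text(self, prompt: str, system_prompt: Optional[str] = None, **params) -> str:
        # Run the blocking call in a worker thread so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: self._generate_sync(prompt, system_prompt, **params)
        )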
@@ -123,46 +154,51 @@ async def generate_text(request: GenerationRequest) -> GenerationResponse:

 @router.post("/chat", response_model=ChatResponse)
 async def chat_completion(request: ChatRequest) -> ChatResponse:
-    """Chat completion endpoint similar to OpenAI's API"""
+    """
+    Chat completion API that formats messages into a prompt and returns the response
+    """
     if not model_manager.current_model:
         raise HTTPException(status_code=400, detail="No model is currently loaded")

+    # Format messages into a prompt
+    formatted_prompt = format_chat_messages(request.messages)
+
+    # If streaming is requested, return a streaming response
+    if request.stream:
+        return StreamingResponse(
+            stream_chat(formatted_prompt, request.max_tokens, request.temperature, request.top_p),
+            media_type="text/event-stream"
+        )
+
     try:
-        # Format messages into a prompt
-        formatted_prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in request.messages])
-
-        if request.stream:
-            # Return a streaming response
-            return StreamingResponse(
-                stream_chat(formatted_prompt, request.max_tokens, request.temperature, request.top_p),
-                media_type="text/event-stream"
-            )
-
         # Get model-specific generation parameters
         model_params = get_model_generation_params(model_manager.current_model)

-        # Update with request parameters
+        # Prepare generation parameters
         generation_params = {
             "max_new_tokens": request.max_tokens,
             "temperature": request.temperature,
-            "top_p": request.top_p,
+            "top_p": request.top_p
         }

         # Merge model-specific params with request params
         generation_params.update(model_params)

-        # Generate text
-        response = model_manager.generate_text(
+        # Generate completion
+        generated_text = await model_manager.generate_text(
             prompt=formatted_prompt,
             **generation_params
         )

+        # Format response
         return ChatResponse(
             choices=[{
                 "message": {
                     "role": "assistant",
-                    "content": response
-                }
+                    "content": generated_text
+                },
+                "index": 0,
+                "finish_reason": "stop"
             }]
         )
     except Exception as e:
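For a quick manual check of the non-streaming path, the endpoint can be called with a small client like this (a sketch; the base URL, port, and any router prefix are assumptions about the deployment):

# Hypothetical client call for the /chat endpoint; adjust the URL to your setup.
import httpx

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    "max_tokens": 64,
    "temperature": 0.7,
    "top_p": 0.9,
    "stream": False,
}
resp = httpx.post("http://localhost:8000/chat", json=payload, timeout=120.0)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])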
@@ -177,7 +213,9 @@ async def generate_stream(
     top_p: float,
     system_prompt: Optional[str]
 ) -> Generator[str, None, None]:
-    """Generate text in a streaming fashion"""
+    """
+    Generate text in a streaming fashion
+    """
     try:
         # Get model-specific generation parameters
         model_params = get_model_generation_params(model_manager.current_model)
@@ -187,26 +225,26 @@ async def generate_stream(
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
-            "stream": True
         }

         # Merge model-specific params with request params
         generation_params.update(model_params)

-        for token in model_manager.generate_stream(
+        # Stream tokens
+        async for token in model_manager.generate_stream(
             prompt=prompt,
             system_prompt=system_prompt,
             **generation_params
         ):
-            # Format as a server-sent event
-            yield f"data: {token}\n\n"
-
-        # End of stream marker
+            # Format as server-sent event
+            data = token.replace("\n", "\\n")
+            yield f"data: {data}\n\n"
+
+        # End of stream
         yield "data: [DONE]\n\n"
     except Exception as e:
         logger.error(f"Streaming generation failed: {str(e)}")
-        yield f"data: {{\"error\": \"{str(e)}\"}}\n\n"
-        yield "data: [DONE]\n\n"
+        yield f"data: [ERROR] {str(e)}\n\n"


 async def stream_chat(
@@ -215,7 +253,9 @@ async def stream_chat(
     temperature: float,
     top_p: float
 ) -> Generator[str, None, None]:
-    """Stream chat completion tokens"""
+    """
+    Stream chat completion
+    """
     try:
         # Get model-specific generation parameters
         model_params = get_model_generation_params(model_manager.current_model)
@@ -224,25 +264,27 @@ async def stream_chat(
         generation_params = {
             "max_new_tokens": max_tokens,
             "temperature": temperature,
-            "top_p": top_p,
-            "stream": True
+            "top_p": top_p
         }

         # Merge model-specific params with request params
         generation_params.update(model_params)

-        for token in model_manager.generate_stream(
+        # Generate streaming tokens
+        async for token in model_manager.generate_stream(
             prompt=formatted_prompt,
             **generation_params
         ):
-            # Format as a server-sent event with proper JSON structure
-            yield f'data: {{"choices": [{{"delta": {{"content": "{token}"}}}}]}}\n\n'
-
+            # Format as a server-sent event with the structure expected by chat clients
+            data = json.dumps({"role": "assistant", "content": token})
+            yield f"data: {data}\n\n"
+
         # End of stream marker
         yield "data: [DONE]\n\n"
     except Exception as e:
         logger.error(f"Chat streaming failed: {str(e)}")
-        yield f"data: {{\"error\": \"{str(e)}\"}}\n\n"
+        error_data = json.dumps({"error": str(e)})
+        yield f"data: {error_data}\n\n"
         yield "data: [DONE]\n\n"

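Because each streamed chunk is now a standalone JSON object per data: line (rather than the OpenAI-style choices/delta envelope), clients need to parse it accordingly. A minimal consumer sketch, again with an assumed local URL:

# Hypothetical streaming client: parses the {"role": ..., "content": ...} chunks
# emitted by stream_chat and stops at the [DONE] marker.
import json
import httpx

payload = {
    "messages": [{"role": "user", "content": "Tell me a short fact."}],
    "max_tokens": 64,
    "temperature": 0.7,
    "top_p": 0.9,
    "stream": True,
}
with httpx.stream("POST", "http://localhost:8000/chat", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        if "error" in chunk:
            raise RuntimeError(chunk["error"])
        print(chunk.get("content", ""), end="", flush=True)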
@@ -270,7 +312,7 @@ async def batch_generate(request: BatchGenerationRequest) -> BatchGenerationResponse:

     responses = []
     for prompt in request.prompts:
-        generated_text = model_manager.generate_text(
+        generated_text = await model_manager.generate_text(
             prompt=prompt,
             system_prompt=request.system_prompt,
             **generation_params