@@ -202,11 +202,16 @@ async def load_model(self, model_id: str) -> bool:
         log_model_unloaded(prev_model)
 
         hf_token = os.getenv("HF_TOKEN")
-        config = self._get_quantization_config()
-
+
         # Check quantization settings from environment variables
         enable_quantization = os.environ.get('LOCALLAB_ENABLE_QUANTIZATION', '').lower() not in ('false', '0', 'none', '')
         quantization_type = os.environ.get('LOCALLAB_QUANTIZATION_TYPE', '') if enable_quantization else "None"
+
+        # Get configuration based on quantization settings
+        config = self._get_quantization_config() if enable_quantization else {
+            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+            "device_map": "auto"  # Always use device_map="auto" for automatic placement
+        }
 
         if config and config.get("quantization_config"):
             logger.info(f"Using quantization config: {quantization_type}")
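The helper `_get_quantization_config()` is referenced but not shown in this diff. As a minimal sketch of the shape it plausibly returns, assuming a bitsandbytes-backed setup via transformers' `BitsAndBytesConfig` (the accepted `LOCALLAB_QUANTIZATION_TYPE` values here are assumptions, not taken from the diff):

```python
import os

import torch
from transformers import BitsAndBytesConfig


def _get_quantization_config(self) -> dict:
    """Hypothetical sketch: build from_pretrained() kwargs for the
    requested quantization type (assumes bitsandbytes is installed)."""
    quant_type = os.environ.get("LOCALLAB_QUANTIZATION_TYPE", "int8").lower()
    if quant_type == "int8":
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        # 4-bit variants ("fp4" or "nf4"); values assumed for illustration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=quant_type,
            bnb_4bit_compute_dtype=torch.float16,
        )
    # Shape matches how load_model() consumes it: the "quantization_config"
    # key is checked, then the whole dict is splatted into from_pretrained().
    return {"quantization_config": bnb_config, "device_map": "auto"}
```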
@@ -221,28 +226,23 @@ async def load_model(self, model_id: str) -> bool:
             token=hf_token
         )
 
+        # Load the model with device_map="auto" to let the library handle device placement
         self.model = AutoModelForCausalLM.from_pretrained(
             model_id,
             trust_remote_code=True,
             token=hf_token,
             **config
         )
-
-        # Check if the model has offloaded modules
-        if hasattr(self.model, 'is_offloaded') and self.model.is_offloaded:
-            logger.warning("Model has offloaded modules; skipping device move.")
-        else:
-            # Move model to the appropriate device only if quantization is disabled
-            if not enable_quantization:
-                device = "cuda" if torch.cuda.is_available() else "cpu"
-                self.model = self.model.to(device)
+
+        logger.info(f"Model loaded with device_map='auto' for automatic placement")
 
         # Capture model parameters after loading
         model_architecture = self.model.config.architectures[0] if hasattr(self.model.config, 'architectures') else 'Unknown'
         memory_used = torch.cuda.memory_allocated() if torch.cuda.is_available() else 'N/A'
         logger.info(f"Model architecture: {model_architecture}")
         logger.info(f"Memory used: {memory_used}")
 
+        # Apply optimizations if needed
         self.model = self._apply_optimizations(self.model)
 
         self.current_model = model_id
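The deleted branch tried to detect offloaded modules before calling `.to()`. With `device_map="auto"`, accelerate decides placement during loading and records it on the model, so a manual move is unnecessary and can fail when layers were offloaded to CPU or disk. A sketch of how that guard could be expressed against the attribute accelerate actually sets (`hf_device_map`); the surrounding names (`self.model`, `logger`) are assumed from the method's context:

```python
# After from_pretrained(..., device_map="auto"), accelerate records where
# each submodule landed; calling .to(device) on such a model is unnecessary
# and can raise if anything was offloaded to CPU or disk.
placement = getattr(self.model, "hf_device_map", None)
if placement is not None:
    logger.info(f"Device map chosen by accelerate: {placement}")
else:
    # No device map recorded: the model sits on a single device, .to() is safe.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model = self.model.to(device)
```

With the new code path, both branches collapse into trusting `device_map="auto"`, which is why the diff simply logs the placement instead of moving the model.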