Add the quantization methods.

codewithdark-git · codewithdark-git · commit 03c1fe9ee401 · 2025-05-21T13:27:27.000+05:00
diff --git a/quantllm/model/model.py b/quantllm/model/model.py
@@ -142,4 +142,27 @@ def copy_model(self):
                 if name in new_model.state_dict():
                     new_model.state_dict()[name].copy_(param.cpu())
         
-        return new_model
+        return new_model
+
+    def get_base_model(self):
+        """Return a fresh float32 base model with device mapping disabled.
+
+        Loads ``self.config.model_name`` anew and copies (via CPU) each
+        ``self.model`` parameter whose name exists in it; errors are re-raised.
+        """
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                self.config.model_name,
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float32,
+                device_map=None  # Important: disable device map
+            )
+            target_state = model.state_dict()  # built once; each call makes a new mapping
+            with torch.no_grad():
+                for name, param in self.model.named_parameters():
+                    if name in target_state:
+                        target_state[name].copy_(param.data.cpu())
+            return model
+        except Exception as e:
+            logging.error(f"Error creating base model: {str(e)}")
+            raise
diff --git a/quantllm/utils/benchmark.py b/quantllm/utils/benchmark.py
@@ -54,6 +54,7 @@ def benchmark_quantizer(
         results = {}
         try:
             self._clear_memory()
+            print(f"GPU memory before {name}: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
             
             # Configure quantizer for memory efficiency
             mem_efficient_args = dict(quantizer_args)
@@ -66,23 +67,26 @@ def benchmark_quantizer(
                     "percdamp": 0.01,
                     "block_size": 128,
                 })
+            print(f"Creating model copy for {name}...")
+            # Create a fresh model instance with same config
+            config_dict = self.model.config.to_dict()  # NOTE(review): unused in the lines visible here
+            config_dict.pop('_name_or_path', None)  # Remove path to ensure clean config
             
-            print(f"Creating copy of model for {name}...")
-            # Create a fresh model instance from pretrained
-            model_clone = AutoModelForCausalLM.from_pretrained(
-                self.model.config._name_or_path,
-                low_cpu_mem_usage=True,
-                torch_dtype=torch.float32,
-                device_map=None  # Important: disable device map for copying
-            )
+            model_clone = type(self.model)(self.model.config)
             
             print(f"Copying parameters for {name}...")
-            # Manually copy parameters to ensure proper copying
+            # Copy the full state dict, moving non-CPU tensors to CPU first
             with torch.no_grad():
-                for name, param in self.model.named_parameters():
-                    if name in model_clone.state_dict():
-                        # Ensure parameter is on CPU for copying
-                        model_clone.state_dict()[name].copy_(param.cpu())
+                state_dict = {}
+                for param_name, param in self.model.state_dict().items():
+                    # Handle device placement during copy
+                    param_data = param.detach()
+                    if param_data.device.type != 'cpu':
+                        param_data = param_data.cpu()
+                    state_dict[param_name] = param_data
+                
+                # Load state dict all at once
+                model_clone.load_state_dict(state_dict)
             
             # Initialize quantizer with model copy
             quantizer = quantizer_class(model=model_clone, **mem_efficient_args)