@@ -49,6 +49,8 @@ def benchmark_quantizer(
         quantizer_args: Dict
     ) -> Dict[str, float]:
         """Benchmark a specific quantizer with memory management."""
+        from transformers import AutoModelForCausalLM
+
         results = {}
         try:
             self._clear_memory()
@@ -63,18 +65,29 @@ def benchmark_quantizer(
             mem_efficient_args.update({
                 "percdamp": 0.01,
                 "block_size": 128,
-            })  # Create a deep copy of the model using from_pretrained
-            config = self.model.config
-            model_clone = type(self.model)(config)
-            # Copy weights manually to ensure proper copying
-            for param_name, param in self.model.state_dict().items():
-                if param_name in model_clone.state_dict():
-                    model_clone.state_dict()[param_name].copy_(param)
+            })
+
+            print(f"Creating copy of model for {name}...")
+            # Create a fresh model instance from the pretrained checkpoint
+            model_clone = AutoModelForCausalLM.from_pretrained(
+                self.model.config._name_or_path,
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float32,
+                device_map=None  # Important: disable device map for copying
+            )
+
+            print(f"Copying parameters for {name}...")
+            # Manually copy parameters to ensure proper copying
+            with torch.no_grad():
+                for param_name, param in self.model.named_parameters():
+                    if param_name in model_clone.state_dict():
+                        # Ensure the parameter is on CPU before copying
+                        model_clone.state_dict()[param_name].copy_(param.cpu())

-            # Initialize quantizer with model copy on CPU
+            # Initialize quantizer with model copy
             quantizer = quantizer_class(model=model_clone, **mem_efficient_args)

-            # Move to device for quantization
+            # Move to appropriate device
             if self.device == "cuda":
                 quantizer.model = quantizer.model.cuda()
                 cal_data = self.calibration_data.cuda()
@@ -84,6 +97,7 @@ def benchmark_quantizer(
             # Measure quantization time
             start_time = time.time()

+            print(f"Starting quantization for {name}...")
             if name == "AWQ":
                 # AWQ uses batched processing
                 cal_steps = min(20, len(cal_data))
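The substance of this change is the model-cloning step: instead of constructing a bare model from `config` and copying the state dict into it, the benchmark reloads the checkpoint with `from_pretrained` and then syncs the live parameters on CPU, so each quantizer gets an independent copy to mutate. A minimal standalone sketch of that pattern is below; the helper name `clone_model_for_quantization` and the example model ID are illustrative assumptions, not part of this PR.

```python
import torch
from transformers import AutoModelForCausalLM


def clone_model_for_quantization(model):
    """Rebuild the model from its checkpoint, then sync weights from the
    in-memory original so a quantizer can modify the clone freely."""
    clone = AutoModelForCausalLM.from_pretrained(
        model.config._name_or_path,  # path/ID the original was loaded from
        low_cpu_mem_usage=True,      # avoid a second full allocation while loading
        torch_dtype=torch.float32,
        device_map=None,             # keep the clone on CPU for the copy
    )
    clone_state = clone.state_dict()
    with torch.no_grad():
        for param_name, param in model.named_parameters():
            if param_name in clone_state:
                clone_state[param_name].copy_(param.cpu())
    return clone


# Hypothetical usage:
# base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
# quantizable_copy = clone_model_for_quantization(base)
```

Loading with `low_cpu_mem_usage=True` and `device_map=None` keeps the clone on CPU until the benchmark explicitly moves it to the target device, which should avoid holding two full copies on the GPU at once.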