
Commit bcf1ee3

Support GPTQ models with gptq_v2 checkpoint_format
1 parent: 07f9775

File tree

3 files changed (+16, -3 lines)

exllamav2/config.py
exllamav2/ext.py
exllamav2/linear.py

exllamav2/config.py

Lines changed: 6 additions & 0 deletions
@@ -107,6 +107,7 @@ class ExLlamaV2Config:
     norm_head: int | None

     checkpoint_fused_mlp: bool
+    checkpoint_offset_qzeros: bool


     def __init__(self,
@@ -287,6 +288,11 @@ def prepare(self, no_tensors: bool = False):
         # if scaling_type == "yarn":
         #     self.scale_alpha_value = factor

+        # Checkpoint format (for GPTQ models)
+
+        checkpoint_format = read(read_config, str, ["quantization_config->checkpoint_format"], None)
+        self.checkpoint_offset_qzeros = (checkpoint_format == "gptq_v2")
+
         # Create map of model tensors

         if no_tensors: return

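For reference, the checkpoint_format key sits inside the quantization_config block of the model's config.json. Below is a minimal sketch of the detection this hunk adds; the quantization_config contents are hypothetical, not taken from any particular model:

# Hypothetical quantization_config block from a gptq_v2 checkpoint's
# config.json (values are illustrative only)
quantization_config = {
    "quant_method": "gptq",
    "bits": 4,
    "group_size": 128,
    "checkpoint_format": "gptq_v2",
}

# Mirrors the logic added to prepare(): a missing key (plain v1 GPTQ)
# or any other value leaves the offset flag disabled
checkpoint_format = quantization_config.get("checkpoint_format")
checkpoint_offset_qzeros = (checkpoint_format == "gptq_v2")
print(checkpoint_offset_qzeros)  # True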
exllamav2/ext.py

Lines changed: 5 additions & 1 deletion
@@ -320,7 +320,8 @@ def make_q_matrix(w: dict,
                   temp_dq: torch.Tensor,
                   key: str = None,
                   prescale: float = 1,
-                  max_dq_rows = 0):
+                  max_dq_rows = 0,
+                  offset_qzeros: bool = False):

     # EXL2

@@ -354,6 +355,9 @@ def make_q_matrix(w: dict,
         if prescale != 1: w["scales"] *= prescale
         if w["scales"].dtype == torch.float: w["scales"] = w["scales"].half()

+        if offset_qzeros:
+            w["qzeros"] -= 0b00010001000100010001000100010001
+
         # GPTQ with g_idx (act_order)

         if "g_idx" in w and not (w["g_idx"] == 0).all().item():

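The magic constant here is 0x11111111: for 4-bit quantization each int32 in qzeros packs eight 4-bit zero points, so a single subtraction decrements every nibble by one, shifting gptq_v2 zero points down to the offset-by-one encoding the existing GPTQ path expects. A rough sketch of the effect on one packed word (illustrative values; assumes no zero point is 0, so no borrow crosses a nibble boundary):

import torch

# One packed int32 of qzeros: eight 4-bit zero points per word
# (value chosen for illustration only)
packed = torch.tensor([0x78888888], dtype=torch.int32)

def unpack_nibbles(x: torch.Tensor) -> list:
    # Split a single int32 into its eight 4-bit fields, LSB first
    word = int(x.item()) & 0xFFFFFFFF
    return [(word >> (4 * i)) & 0xF for i in range(8)]

print(unpack_nibbles(packed))   # [8, 8, 8, 8, 8, 8, 8, 7]

# The conversion added in make_q_matrix: one subtraction per word
# decrements all eight packed zero points at once
packed -= 0b00010001000100010001000100010001
print(unpack_nibbles(packed))   # [7, 7, 7, 7, 7, 7, 7, 6]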
exllamav2/linear.py

Lines changed: 5 additions & 2 deletions
@@ -98,13 +98,15 @@ def load(self,
              w: dict | nn.Parameter | tuple | None = None,
              device_tensors: bool = True):

+        cfg = self.model.config
+
         if self.f_key: w = self.load_weight_fused(self.f_key, self.f_beg, self.f_end, self.in_features, self.out_features, self.altpack_qkv)
         if w is None: w = self.load_weight()

         # Load quantized linear layer from dictionary

         if isinstance(w, dict):
-            assert not self.model.config.load_in_q4, "Can't load quantized layer in Q4 mode"
+            assert not cfg.load_in_q4, "Can't load quantized layer in Q4 mode"
             if self.has_bias:
                 assert "bias" in w, self.key + " has no bias but bias expected"
             else:
@@ -119,7 +121,8 @@ def load(self,
             self.q_handle = ext.make_q_matrix(w,
                                               self.temp_dq,
                                               prescale = self.prescale,
-                                              max_dq_rows = self.model.config.max_dq_size // self.out_features)
+                                              max_dq_rows = cfg.max_dq_size // self.out_features,
+                                              offset_qzeros = cfg.checkpoint_offset_qzeros)
             self.prev_prescale = self.prescale
             self.prescale = 1

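With all three pieces in place, a gptq_v2 checkpoint should load through the normal ExLlamaV2 path with no extra arguments: detection happens in ExLlamaV2Config.prepare() and the flag is threaded down to make_q_matrix as each quantized layer is built. A minimal sketch, assuming a hypothetical local model directory:

from exllamav2 import ExLlamaV2, ExLlamaV2Config

# Hypothetical path to a GPTQ model saved with checkpoint_format = "gptq_v2"
config = ExLlamaV2Config()
config.model_dir = "/models/my-model-gptq-v2"
config.prepare()

# Set during prepare() from quantization_config->checkpoint_format
print(config.checkpoint_offset_qzeros)

model = ExLlamaV2(config)
model.load()  # qzeros are shifted by the offset described above while layers load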