Skip to content

Commit ef703b8

Browse files
authored
AWQ loader for transformers (eth-sri#254)
1 parent 22cca61 commit ef703b8

File tree

2 files changed

+80
-15
lines changed

2 files changed

+80
-15
lines changed

docs/docs/models/hf.md

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ By default, this exposes an [LMQL/LMTP inference API](https://github.com/eth-sri
2828

2929
**Endpoint and Port** By default, models will be served via port `8080`. To change this, you can specify the port via the `--port` option of the `lmql serve-model` command. On the client side, to connect to a model server running on a different port, you can specify the port when constructing an [`lmql.model`](../lib/generations.md#lmql-llm-objects) object:
3030

31-
```
31+
```python
3232
lmql.model("gpt2", endpoint="localhost:9999")
3333
```
3434

@@ -58,4 +58,30 @@ If you want more control over model loading and configuration, you can pass addi
5858

5959
```python
6060
lmql.model("local:gpt2", cuda=True)
61-
```
61+
```
62+
63+
## Quantization
64+
65+
Quantization reduces the precision of model parameters to shrink model size and boost inference speed with minimal accuracy loss. LMQL supports two quantization formats: AWQ (using AutoAWQ) and GPTQ (using AutoGPTQ).
66+
67+
### AutoAWQ
68+
69+
AWQ minimizes quantization error by protecting crucial weights, promoting model efficiency without sacrificing accuracy. It's ideal for scenarios requiring both compression and acceleration of LLMs.
70+
71+
To use `AWQ`-quantized models, first install [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) following the instructions in the repo, then run:
72+
73+
74+
75+
```bash
76+
lmql serve-model TheBloke/Mistral-7B-OpenOrca-AWQ --loader awq
77+
```
78+
79+
### AutoGPTQ
80+
81+
AutoGPTQ reduces model size while retaining performance by lowering the precision of model weights to 4 or 3 bits. It's suitable for efficient deployment and operation of LLMs on consumer-grade hardware.
82+
83+
Install [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) following the repo instructions. To use GPTQ-quantized models, run:
84+
85+
```bash
86+
lmql serve-model TheBloke/Arithmo-Mistral-7B-GPTQ --loader gptq
87+
```

src/lmql/models/lmtp/backends/transformers_model.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,37 +25,69 @@ def merge(kwargs1, kwargs2, prioritize="left"):
2525
class TransformersLLM(LMTPModel):
2626
def __init__(self, model_identifier, **kwargs):
2727
self.model_identifier = model_identifier
28+
29+
self.loader = kwargs.pop("loader", None)
30+
if self.loader is None:
31+
if '-gptq' in self.model_identifier.lower():
32+
self.loader = "gptq"
33+
elif '-awq' in self.model_identifier.lower():
34+
self.loader = "awq"
35+
else:
36+
self.loader = "transformers"
37+
2838
self.model_args = kwargs
29-
self.loader = kwargs.pop("loader", "transformers")
30-
31-
self.max_batch_size = kwargs.pop("batch_size", 32)
39+
self.max_batch_size = kwargs.get("batch_size", 32)
3240

3341
self.silent = kwargs.pop("silent", False)
3442

3543
if not self.silent:
3644
print("[Loading", self.model_identifier, "with", self.model_constructor() + "]", flush=True)
3745

38-
if self.loader == "auto-gptq":
46+
if self.loader == "gptq" or self.loader == "auto-gptq":
3947
from auto_gptq import AutoGPTQForCausalLM
4048
self.model = AutoGPTQForCausalLM.from_quantized(self.model_identifier, **self.model_args)
49+
elif self.loader == 'awq':
50+
from awq import AutoAWQForCausalLM
51+
awq_args = {
52+
'quant_filename': kwargs.pop("quant_filename", ''),
53+
"max_new_tokens": kwargs.pop("max_new_tokens", None),
54+
"trust_remote_code": kwargs.pop("trust_remote_code", True),
55+
"safetensors": kwargs.pop("safetensors", True),
56+
"fuse_layers": False, # TODO: Figure out why this is broken
57+
"max_memory": kwargs.pop("max_memory", None),
58+
"offload_folder": kwargs.pop("offload_folder", None),
59+
"batch_size": kwargs.get("batch_size", 16)
60+
}
61+
self.model = AutoAWQForCausalLM.from_quantized(self.model_identifier, **awq_args)
4162
else:
4263
from transformers import AutoModelForCausalLM
4364
self.model = AutoModelForCausalLM.from_pretrained(self.model_identifier, **self.model_args)
4465

66+
if self.loader == 'awq':
67+
self.device = self.model.model.device
68+
else:
69+
self.device = self.model.device
70+
4571
if not self.silent:
46-
print("[", self.model_identifier, " ready on device ", self.model.device,
72+
print("[", self.model_identifier, " ready on device ", self.device,
4773
flush=True, sep="", end="]\n")
4874

4975
@property
5076
def eos_token_id(self):
51-
return self.model.config.eos_token_id
77+
if self.loader == 'awq':
78+
return self.model.model.config.eos_token_id
79+
else:
80+
return self.model.config.eos_token_id
5281

5382
def score(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **model_kwargs) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
54-
input_ids = torch.tensor(input_ids).to(self.model.device)
55-
attention_mask = torch.tensor(attention_mask).to(self.model.device)
83+
input_ids = torch.tensor(input_ids).to(self.device)
84+
attention_mask = torch.tensor(attention_mask).to(self.device)
5685

5786
# prepare model inputs
58-
model_inputs = self.model.prepare_inputs_for_generation(input_ids, **model_kwargs, attention_mask=attention_mask, eos_token_id=self.eos_token_id)
87+
if self.loader == 'awq':
88+
model_inputs = self.model.model.prepare_inputs_for_generation(input_ids, **model_kwargs, attention_mask=attention_mask, eos_token_id=self.eos_token_id)
89+
else:
90+
model_inputs = self.model.prepare_inputs_for_generation(input_ids, **model_kwargs, attention_mask=attention_mask, eos_token_id=self.eos_token_id)
5991
model_inputs["attention_mask"] = attention_mask
6092

6193
token_scores = []
@@ -76,8 +108,8 @@ def score(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, *
76108
def generate(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor,
77109
temperature: float, max_new_tokens: int,
78110
bias_tensor: torch.FloatTensor, streamer: TokenStreamer, **kwargs) -> LMTPModelResult:
79-
input_ids = torch.tensor(input_ids).to(self.model.device)
80-
attention_mask = torch.tensor(attention_mask).to(self.model.device)
111+
input_ids = torch.tensor(input_ids).to(self.device)
112+
attention_mask = torch.tensor(attention_mask).to(self.device)
81113

82114
generate_args = {
83115
"input_ids": input_ids,
@@ -117,20 +149,27 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
117149
return [BatchLogitsProcessor()]
118150

119151
def model_constructor(self):
120-
if self.loader == "auto-gptq":
152+
if self.loader == "gptq" or self.loader == "auto-gptq":
121153
return "AutoGPTQForCausalLM.from_quantized({})".format(format_call(self.model_identifier, **self.model_args))
154+
elif self.loader == 'awq':
155+
return "AutoAWQForCausalLM.from_quantized({})".format(format_call(self.model_identifier, **self.model_args))
122156
else:
123157
return "AutoModelForCausalLM.from_pretrained({})]".format(format_call(self.model_identifier, **self.model_args))
124158

125159
def version_info(self):
126160
global version_info
127161

128162
if len(version_info) == 0:
129-
if self.loader == "auto-gptq":
163+
if self.loader == "gptq" or self.loader == "auto-gptq":
130164
import auto_gptq
131165
version_info = {
132166
"auto_gptq": auto_gptq.__version__
133167
}
168+
elif self.loader == "awq":
169+
import awq
170+
version_info = {
171+
"awq": awq.__version__
172+
}
134173
else:
135174
import transformers
136175
version_info = {

0 commit comments

Comments
 (0)