Commit 8d17774

Add AWQ support for all models (#1714)
1 parent e946260 commit 8d17774

File tree

13 files changed: +90 −17 lines

vllm/model_executor/layers/activation.py

Lines changed: 47 additions & 5 deletions
@@ -1,8 +1,11 @@
 """Custom activation functions."""
+from typing import Optional
+
 import torch
 import torch.nn as nn
 
 from vllm import activation_ops
+from vllm.model_executor.layers.quantization import QuantizationConfig
 
 
 class SiluAndMul(nn.Module):
@@ -39,6 +42,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return out
 
 
+class ScaledActivation(nn.Module):
+    """An activation function with post-scale parameters.
+
+    This is used for some quantization methods like AWQ.
+    """
+
+    def __init__(
+        self,
+        act_module: nn.Module,
+        hidden_size: int,
+        params_dtype: torch.dtype,
+    ):
+        super().__init__()
+        self.act = act_module
+        self.scales = nn.Parameter(
+            torch.empty(hidden_size, dtype=params_dtype, device="cuda"))
+
+    def forward(self, x: torch.Tensor):
+        return self.act(x) / self.scales
+
+
 _ACTIVATION_REGISTRY = {
     "gelu": nn.GELU(),
     "gelu_fast": FastGELU(),
@@ -48,9 +72,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 }
 
 
-def get_act_fn(act_fn: str) -> nn.Module:
+def get_act_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+) -> nn.Module:
     """Get an activation function by name."""
-    act_fn = act_fn.lower()
-    if act_fn in _ACTIVATION_REGISTRY:
-        return _ACTIVATION_REGISTRY[act_fn]
-    raise ValueError(f"Activation function {act_fn!r} is not supported.")
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_REGISTRY:
+        raise ValueError(
+            f"Activation function {act_fn_name!r} is not supported.")
+
+    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+    if quant_config is not None:
+        if act_fn_name in quant_config.get_scaled_act_names():
+            if intermediate_size is None:
+                raise ValueError(
+                    "intermediate_size must be specified for scaled "
+                    "activation functions.")
+            return ScaledActivation(
+                act_fn,
+                intermediate_size,
+                params_dtype=torch.get_default_dtype(),
+            )
+    return act_fn
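With this change, get_act_fn dispatches on both the activation name and the quantization method. As a usage sketch (not part of the commit; the AWQConfig constructor arguments below are assumed for illustration, and ScaledActivation allocates its scales on "cuda", so a GPU is required):

import torch

from vllm.model_executor.layers.activation import (ScaledActivation,
                                                   get_act_fn)
from vllm.model_executor.layers.quantization.awq import AWQConfig

# Hypothetical AWQ settings, shown only to exercise the new code path.
quant_config = AWQConfig(weight_bits=4, group_size=128, zero_point=True)

# "gelu" is in AWQConfig.get_scaled_act_names(), so this returns a
# ScaledActivation wrapping nn.GELU(); forward computes act(x) / scales,
# where the scales are later loaded from the AWQ checkpoint.
act = get_act_fn("gelu", quant_config, intermediate_size=4096)
assert isinstance(act, ScaledActivation)

# With no quant config (or a name outside get_scaled_act_names()),
# the plain registry module is returned unchanged.
plain = get_act_fn("gelu")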

vllm/model_executor/layers/quantization/awq.py

Lines changed: 3 additions & 0 deletions
@@ -63,6 +63,9 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQConfig":
     def get_linear_method(self) -> "AWQLinearMethod":
         return AWQLinearMethod(self)
 
+    def get_scaled_act_names(self) -> List[str]:
+        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
+
 
 class AWQLinearMethod(LinearMethodBase):
     """Linear method for AWQ.

vllm/model_executor/layers/quantization/base_config.py

Lines changed: 8 additions & 0 deletions
@@ -54,3 +54,11 @@ def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
     def get_linear_method(self) -> LinearMethodBase:
         """Get the linear method to use for the quantized linear layer."""
         raise NotImplementedError
+
+    @abstractmethod
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
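Every quantization backend must now implement this hook. A minimal sketch of what a hypothetical new backend would add (the class is made up, and the other abstract methods it would also need are omitted):

from typing import List

from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)


class MyQuantConfig(QuantizationConfig):
    """Hypothetical backend, shown only to illustrate the new hook."""

    # get_name, get_linear_method, from_config, etc. omitted for brevity.

    def get_scaled_act_names(self) -> List[str]:
        # This made-up method needs no activation post-scaling.
        return []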

vllm/model_executor/layers/quantization/squeezellm.py

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,9 @@ def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig":
     def get_linear_method(self) -> "SqueezeLLMLinearMethod":
         return SqueezeLLMLinearMethod(self)
 
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
 
 class SqueezeLLMLinearMethod(LinearMethodBase):
     """Linear method for SqueezeLLM.

vllm/model_executor/models/bloom.py

Lines changed: 3 additions & 2 deletions
@@ -145,7 +145,8 @@ def __init__(
             4 * hidden_size,
             linear_method=linear_method,
         )
-        self.act = get_act_fn("gelu")
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size)
         self.dense_4h_to_h = RowParallelLinear(
             4 * hidden_size,
             hidden_size,
@@ -154,7 +155,7 @@ def __init__(
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x, _ = self.dense_h_to_4h(x)
-        x = self.act(x)
+        x = self.gelu_impl(x)
         x, _ = self.dense_4h_to_h(x)
         return x
 
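The getattr call above is the pattern repeated in every model below: linear_method is None for unquantized models, so the quant config is fetched defensively rather than via linear_method.quant_config. A sketch of the idea with a hypothetical helper:

from vllm.model_executor.layers.activation import get_act_fn


def _make_act(linear_method, act_name: str, intermediate_size: int):
    # linear_method is None when no quantization is configured, so a
    # direct .quant_config access would raise AttributeError.
    quant_config = getattr(linear_method, "quant_config", None)
    # With quant_config=None, get_act_fn returns the plain module; with
    # an AWQ config and a GELU variant, a ScaledActivation wrapper.
    return get_act_fn(act_name, quant_config, intermediate_size)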

vllm/model_executor/models/falcon.py

Lines changed: 4 additions & 1 deletion
@@ -27,6 +27,7 @@
 from transformers import FalconConfig as HF_FalconConfig
 
 from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.attention import (PagedAttention,
                                                   PagedAttentionWithALiBi,
                                                   PagedAttentionWithRoPE)
@@ -131,6 +132,7 @@ def __init__(
             self.hidden_size,
             bias=config.bias,
             skip_bias_add=True,
+            linear_method=linear_method,
             reduce_results=self.reduce_row_parallel_results)
 
         self.use_rotary = config.rotary
@@ -206,7 +208,8 @@ def __init__(
             bias=config.bias,
             skip_bias_add=True,
             linear_method=linear_method)
-        self.act = nn.GELU()
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn("gelu", quant_config, 4 * hidden_size)
         self.reduce_row_parallel_results = not (config.new_decoder_architecture
                                                 or config.parallel_attn)
         self.dense_4h_to_h = RowParallelLinear(

vllm/model_executor/models/gpt2.py

Lines changed: 3 additions & 1 deletion
@@ -118,7 +118,9 @@ def __init__(
             bias=True,
             linear_method=linear_method,
         )
-        self.act = get_act_fn(config.activation_function)
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.activation_function, quant_config,
+                              intermediate_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.c_fc(hidden_states)

vllm/model_executor/models/gpt_bigcode.py

Lines changed: 3 additions & 1 deletion
@@ -137,7 +137,9 @@ def __init__(
             bias=True,
             linear_method=linear_method,
         )
-        self.act = get_act_fn(config.activation_function)
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.activation_function, quant_config,
+                              intermediate_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.c_fc(hidden_states)

vllm/model_executor/models/gpt_j.py

Lines changed: 3 additions & 1 deletion
@@ -128,7 +128,9 @@ def __init__(
             hidden_size,
             linear_method=linear_method,
         )
-        self.act = get_act_fn(config.activation_function)
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.activation_function, quant_config,
+                              intermediate_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc_in(hidden_states)

vllm/model_executor/models/gpt_neox.py

Lines changed: 3 additions & 1 deletion
@@ -124,7 +124,9 @@ def __init__(
             config.hidden_size,
             linear_method=linear_method,
         )
-        self.act = get_act_fn(config.hidden_act)
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.act = get_act_fn(config.hidden_act, quant_config,
+                              config.intermediate_size)
 
     def forward(self, hidden_states):
         hidden_states, _ = self.dense_h_to_4h(hidden_states)
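With these changes, AWQ checkpoints of the GELU-based architectures above (BLOOM, Falcon, GPT-2, GPT-BigCode, GPT-J, GPT-NeoX) can load with their activation scales applied. An end-to-end sketch; the model name is a placeholder, not one tested in this commit:

from vllm import LLM, SamplingParams

# Any AWQ-quantized checkpoint of one of the newly supported models.
llm = LLM(model="TheBloke/some-model-AWQ", quantization="awq")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)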
