
Commit 8addc3e

Merge pull request #2 from GreenBitAI/haojin_dev
Haojin dev
2 parents 65df40d + e88b670, commit 8addc3e

File tree: 10 files changed (+141, -47 lines)


CHANGELOG.md

Lines changed: 17 additions & 0 deletions

@@ -5,6 +5,23 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 
+## [0.2.4] - 2024/05/23
+
+### Added
+
+- Tuned the hyperparameters of DiodeMix optimizer for sft.
+- Added sft-support for the classical gptq-style models.
+- Implemented qzeros update in finetuning process.
+
+### Updated
+
+- Extended pack_fp_weight function.
+- Enhanced the performance of MPQLinearCUDA layer.
+
+### Fixed
+
+- Fixed various errors in DiodeMix update function.
+
 ## [0.2.3] - 2024/05/01
 
 ### Updated

bitorch_engine/layers/qlinear/nbit/cuda/mbwq_layer.py

Lines changed: 1 addition & 2 deletions

@@ -109,9 +109,8 @@ def backward(ctx: torch.autograd.function.BackwardCFunction,
         grad_input = output_gradient.mm(weights.t()) # (m, n)*(n, k) = (m, k)
         #======================================================================================================#
 
-        # (n, m) * (m, k) = (n, k)
         if qweight.requires_grad: # This additional check is required by peft training.
-            qweight.privileged_grad = output_gradient.t().mm(input).t() # (k, n)
+            qweight.privileged_grad = input.t().mm(output_gradient) # (k, m) * (m, n) = (k, n)
 
         grad_input = unflatten_x(grad_input, shape)
 

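Note on the change above: the new expression is algebraically identical to the old one, since (G^T X)^T = X^T G; it simply avoids two extra transposes. A minimal sketch verifying the equivalence on random tensors (the shapes are illustrative):

import torch

# Illustrative shapes: batch m, input features k, output features n.
m, k, n = 8, 16, 32
input = torch.randn(m, k)
output_gradient = torch.randn(m, n)

old_grad = output_gradient.t().mm(input).t()  # (n, m) @ (m, k) -> (n, k), transposed -> (k, n)
new_grad = input.t().mm(output_gradient)      # (k, m) @ (m, n) -> (k, n)

assert torch.allclose(old_grad, new_grad, atol=1e-5)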
bitorch_engine/layers/qlinear/nbit/cuda/mbwq_linear_cuda_kernel.cu

Lines changed: 4 additions & 21 deletions

@@ -749,7 +749,6 @@ torch::Tensor mbwq_linear_q4_forward_cuda(
     int bits
 ){
     const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
 
     TORCH_CHECK(x.dtype() == torch::kHalf);
     TORCH_CHECK(x.size(1) == qweight.size(0) * (32 / bits));
@@ -770,16 +769,8 @@ torch::Tensor mbwq_linear_q4_forward_cuda(
             group_size,
             bits,
             q_perm);
-
-        const half alpha = __float2half(1.0f);
-        const half beta = __float2half(0.0f);
-        cublasHgemm(cublas_handle,
-                    CUBLAS_OP_N,
-                    CUBLAS_OP_N,
-                    size_n, size_m, size_k,
-                    &alpha, reinterpret_cast<half *>(fp_w.data_ptr()), size_n,
-                    reinterpret_cast<half *>(x.data_ptr()), size_k,
-                    &beta, reinterpret_cast<half *>(out.data_ptr()), size_n);
+        // indirectly use cublas through torch matmul api
+        out = torch::matmul(x, fp_w.to(option_output));
 
     }else{
 
@@ -943,7 +934,6 @@ torch::Tensor mbwq_linear_exl2_forward_cuda(
     bool use_cublas
 ){
     const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
     TORCH_CHECK(x.dtype() == torch::kHalf);
 
     int size_m = x.size(0); // m
@@ -963,15 +953,8 @@ torch::Tensor mbwq_linear_exl2_forward_cuda(
             qgroup_map,
             rows);
 
-        const half alpha = __float2half(1.0f);
-        const half beta = __float2half(0.0f);
-        cublasHgemm(cublas_handle,
-                    CUBLAS_OP_N,
-                    CUBLAS_OP_N,
-                    size_n, size_m, size_k,
-                    &alpha, reinterpret_cast<half *>(fp_w.data_ptr()), size_n,
-                    reinterpret_cast<half *>(x.data_ptr()), size_k,
-                    &beta, reinterpret_cast<half *>(out.data_ptr()), size_n);
+        // indirectly use cublas through torch matmul api
+        out = torch::matmul(x, fp_w.to(option_output));
 
     }else{
         int rows_8 = rows[0];

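As the added comments say, torch::matmul still reaches cuBLAS for half-precision CUDA tensors; the change only drops the hand-written HGEMM call. A rough Python-level sketch of the shape contract the replacement relies on (sizes are illustrative and mirror the kernel's size_m/size_k/size_n):

import torch

size_m, size_k, size_n = 4, 64, 128  # illustrative sizes
x = torch.randn(size_m, size_k, dtype=torch.half, device="cuda")
fp_w = torch.randn(size_k, size_n, dtype=torch.half, device="cuda")

# Equivalent of the removed explicit HGEMM: out(m, n) = x(m, k) @ fp_w(k, n).
out = torch.matmul(x, fp_w)
assert out.shape == (size_m, size_n)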
bitorch_engine/layers/qlinear/nbit/cuda/mpq_layer.py

Lines changed: 21 additions & 7 deletions

@@ -6,6 +6,7 @@
 from bitorch_engine.layers.qlinear.nbit import MPQLinearBase
 from bitorch_engine.utils.safe_import import import_extension
 from bitorch_engine.utils.model_helper import flatten_x, unflatten_x
+from bitorch_engine.layers.qlinear.nbit.cuda.utils import unpack_qweight
 
 
 q_linear_cuda = import_extension("q_linear_cuda")
@@ -47,19 +48,33 @@ def forward(ctx, x: torch.Tensor, qweight: torch.Tensor, a_bit: int, w_bit: int,
         Returns:
             torch.Tensor: The result of the quantized linear operation.
         """
-        x, shape = flatten_x(x)
-        output = q_linear_cuda.mpq_forward(x, qweight, scales, zeros, g_idx, a_bit, w_bit, asym)
-        if is_training:
+        def setup_qweight():
             qweight.scales = scales
             qweight.zeros = zeros
             qweight.g_idx = g_idx
             qweight.w_bit = w_bit
-            qweight.privileged_grad = privileged_grad
             qweight.asym = asym
             qweight.layer_type = 1
+
+        x, original_shape = flatten_x(x)
+
+        if x.size(0) > 32: # use pytorch api
+            setup_qweight()
+            # Reconstruct the floating-point weight
+            fp_weight = unpack_qweight(qweight)
+            output = torch.matmul(x, fp_weight)
+        else:
+            output = q_linear_cuda.mpq_forward(x, qweight, scales, zeros, g_idx, a_bit, w_bit, asym)
+
+        if is_training:
+            qweight.privileged_grad = privileged_grad
+            if qweight.scales is None:
+                setup_qweight()
             ctx.a_bit = a_bit
             ctx.save_for_backward(x, qweight)
-        output = unflatten_x(output, shape)
+
+        output = unflatten_x(output, original_shape)
+
         return output
 
     @staticmethod
@@ -100,9 +115,8 @@ def backward(ctx: torch.autograd.function.BackwardCFunction,
                                                           output_gradient, a_bit, w_bit, asym)
         #==================================================================#
 
-        # (n, m) * (m, k) = (n, k)
         if qweight.requires_grad: # This additional check is required by peft training.
-            qweight.privileged_grad = output_gradient.t().mm(input).t() # (k, n)
+            qweight.privileged_grad = input.t().mm(output_gradient) # (k, m) * (m, n) = (k, n)
 
         grad_input = unflatten_x(grad_input, shape)
 

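The forward pass above now dispatches on batch size: batches with more than 32 rows unpack the quantized weight once and go through torch.matmul, while smaller batches keep the fused mpq_forward kernel. A simplified, self-contained sketch of that dispatch pattern (unpack_to_fp16 and fused_kernel are stand-ins for unpack_qweight and q_linear_cuda.mpq_forward; the threshold mirrors the diff):

import torch

BATCH_THRESHOLD = 32  # same cutoff as in the diff

def mpq_linear_forward(x, qweight, unpack_to_fp16, fused_kernel):
    """Dispatch between a fused low-bit kernel and unpack + matmul.

    unpack_to_fp16: stand-in for unpack_qweight (returns an fp16 (k, n) weight).
    fused_kernel:   stand-in for q_linear_cuda.mpq_forward.
    """
    if x.size(0) > BATCH_THRESHOLD:
        # Larger batch: reconstruct fp16 weights once and let matmul use cuBLAS.
        fp_weight = unpack_to_fp16(qweight)
        return torch.matmul(x, fp_weight)
    # Small batch: the fused dequantize-and-multiply kernel is used instead.
    return fused_kernel(x, qweight)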
bitorch_engine/layers/qlinear/nbit/cuda/utils.py

Lines changed: 20 additions & 7 deletions

@@ -69,7 +69,7 @@ def unpack_qweight(qweight: MPQWeightParameter) -> torch.Tensor:
     return weights
 
 
-def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter) -> torch.Tensor:
+def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter, unpacked_zeros: torch.Tensor = None) -> torch.Tensor:
     """Packs the fp16 weight into a quantized weight format using the attributes defined in the QweightParameter.
 
     This function handles three main scenarios:
@@ -100,8 +100,22 @@ def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter) -> torch.T
 
     # Process based on layer_type and existence of q_perm for quantization
     if layer_type == 1 or (layer_type == 2 and qweight.q_group_map is None): # MPQLinear or MBWQLinear-q4
-        if asym:
-            intweight = torch.round(weight / scales[g_idx] + zeros[g_idx]).to(torch.int32).clamp(0, 2**w_bit-1)
+        if asym: # this if-branch is for classical GPTQ-style models
+            if unpacked_zeros is not None:
+                zeros = unpacked_zeros
+            elif zeros.dtype == torch.int32:
+                wf = torch.tensor(list(range(0, 32, w_bit)), dtype=torch.int32,
+                                  device=qweight.device).unsqueeze(0)
+                zeros_unpack = torch.bitwise_right_shift(
+                    torch.unsqueeze(zeros, 2).expand(-1, -1, 32 // w_bit),
+                    wf.unsqueeze(0)).to(torch.int16 if w_bit == 8 else torch.int8)
+                torch.bitwise_and(zeros_unpack, (2 ** w_bit) - 1, out=zeros_unpack)
+                zeros_unpack = zeros_unpack + 1
+                zeros = zeros_unpack.reshape(-1, qweight.size(-1))
+            else:
+                raise ValueError(f"Error: Got invalid dtype of qweight.zeros while packing fp weight.")
+
+            intweight = torch.round(weight / scales[g_idx.long()] + zeros[g_idx.long()]).to(torch.int32).clamp(0, 2**w_bit-1)
         else:
             if g_idx is None:
                 # Adjust scales and zeros for symmetric quantization without group index
@@ -114,8 +128,7 @@ def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter) -> torch.T
                 intweight = torch.round((weight + zeros) / scales).to(torch.int32).clamp(0, 2 ** w_bit - 1)
             else:
                 # Calculate integer weights for symmetric quantization with group index
-                # TODO: recalculate scales and zeros?
-                intweight = torch.round((weight + zeros[g_idx]) / scales[g_idx]).to(torch.int32).clamp(0, 2**w_bit-1)
+                intweight = torch.round((weight + zeros[g_idx.long()]) / scales[g_idx.long()]).to(torch.int32).clamp(0, 2**w_bit-1)
 
     # Perform parallel bitpacking
     wf = torch.tensor(list(range(0, 32, w_bit)), dtype=torch.int32, device=qweight.device).unsqueeze(0)
@@ -128,8 +141,8 @@ def pack_fp_weight(weight: torch.Tensor, qweight: MPQWeightParameter) -> torch.T
             dtype=torch.int32
         )
     else:
-        # TODO: Placeholder for mixed-bit-width quantization method
-        raise NotImplementedError("Error: pack_fp_weight for MBWQLinear using mixed-bit-width not supported yet.")
+        # TODO: Placeholder for channel-mix quantization method
+        raise NotImplementedError("Error: pack_fp_weight for MBWQLinear using channel-mix quantization not supported yet.")
 
     return intweight.to(torch.int32)
 

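In the asymmetric (GPTQ-style) branch added above, packing uses intweight = round(w / s + z) clamped to the w_bit range, i.e. the inverse of dequantizing with w_hat = (q - z) * s per group. A small self-contained sketch of that quantize/dequantize round trip (group layout, scale, and zero-point choices here are illustrative, not the library's):

import torch

w_bit, group_size, in_feat, out_feat = 4, 32, 64, 16
weight = torch.randn(in_feat, out_feat)

# One scale/zero-point row per group of input channels (illustrative choices).
g_idx = torch.arange(in_feat) // group_size                                    # (in_feat,)
scales = weight.abs().reshape(-1, group_size, out_feat).amax(dim=1) / (2 ** (w_bit - 1) - 1)
zeros = torch.full_like(scales, float(2 ** (w_bit - 1)))                       # mid-range zero point

# Quantize, as in pack_fp_weight's asym branch: q = round(w / s + z), clamped to [0, 2^w_bit - 1].
intweight = torch.round(weight / scales[g_idx] + zeros[g_idx]).to(torch.int32).clamp(0, 2 ** w_bit - 1)

# Dequantize by inverting that formula: w_hat = (q - z) * s.
w_hat = (intweight.float() - zeros[g_idx]) * scales[g_idx]
print((weight - w_hat).abs().max())  # rounding error stays within ~scale/2 per group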
bitorch_engine/layers/qlinear/nbit/layer.py

Lines changed: 2 additions & 1 deletion

@@ -403,7 +403,8 @@ def init_gptq(self) -> None:
         """
         self.register_buffer('qzeros', torch.zeros((math.ceil(self.in_channels / self.group_size),
                                                     self.out_channels // 32 * self.w_bit), dtype=torch.int32))
-        self.scales = torch.ones((math.ceil(self.in_channels / self.group_size), self.out_channels), dtype=self.dtype)
+        self.register_buffer('scales', torch.ones((math.ceil(self.in_channels / self.group_size),
+                                                   self.out_channels), dtype=self.dtype))
         self.asym = True
 
     def init_gba(self) -> None:

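Registering scales as a buffer rather than assigning a plain tensor attribute means it is included in state_dict and moved together with the module by .to()/.half(). A minimal illustration of that difference, independent of this layer:

import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.plain = torch.ones(2, 3)                   # plain attribute: not in state_dict, not moved by .to()
        self.register_buffer("buf", torch.ones(2, 3))   # buffer: saved and moved with the module

m = Toy()
print("buf" in m.state_dict())    # True
print("plain" in m.state_dict())  # False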
bitorch_engine/utils/model_helper.py

Lines changed: 48 additions & 5 deletions

@@ -3,7 +3,7 @@
 
 import torch
 import torch.nn.functional as F
-from bitorch_engine.utils.quant_operators import nv_tensor_quant, gptq_stype_unpacking
+from bitorch_engine.utils.quant_operators import nv_tensor_quant, gptq_style_unpacking, gptq_style_zeros_packing
 from bitorch_engine.functions.cuda import tensor_to_packed_uint8, unpack_uint8_tensor
 
 
@@ -327,6 +327,39 @@ def init_weight(weight: torch.Tensor, cls: Type[torch.nn.Parameter]=torch.nn.Par
     return weight, scale_w
 
 
+def update_zeros(qweight, w, norm_grad, step_size, z_unpacked=None):
+    """
+    Updates the zeros attribute of the qweight object based on its layer type.
+
+    Args:
+        qweight: An object containing quantization parameters, including the zeros attribute.
+        w: Weight tensor.
+        norm_grad: Normalized gradient tensor.
+        step_size: Step size for updating zeros.
+        z_unpacked: Optional unpacked zeros tensor for specific layer types.
+    """
+    if qweight.layer_type == 2: # MBWQ-layer
+        q_perm = qweight.q_perm.unsqueeze(1).repeat(1, w.size(1)).long()
+        zeros_grad = torch.gather(norm_grad, dim=0, index=q_perm)
+        qweight.zeros.add_(
+            step_size * zeros_grad.view(-1, w.size(0) // qweight.scales.size(0), qweight.scales.size(-1)).mean(1)
+        )
+        del zeros_grad
+    elif qweight.layer_type == 1 and qweight.g_idx is not None: # MPQ-layer & GPTQ
+        zeros_unpack = z_unpacked[qweight.g_idx.long()]
+        zeros_unpack.add_(step_size * norm_grad)
+
+        g_idx = qweight.g_idx.long()
+        perm = torch.argsort(g_idx, dim=0)
+        zeros = zeros_unpack[perm, :].view(-1, w.size(0) // qweight.scales.size(0), qweight.scales.size(-1)).mean(1)
+
+        # pack to qzeros
+        qweight.zeros = gptq_style_zeros_packing(zeros, qweight.w_bit, zeros.size(-1), qweight.group_size)
+    else:
+        raise NotImplementedError(
+            "qweight.layer_type: '{}' has not been supported yet.".format(str(qweight.layer_type)))
+
+
 def qweight_update_fn(qweight: torch.nn.Parameter, exp_avg_s: torch.Tensor=None, exp_avg_l: torch.Tensor=None,
                       step: torch.Tensor=None, lr:float=1e-4, weight_decay:float=0.0, beta1:float=0.99,
                       beta2:float=0.9999, eps: float = 1e-6, dtype=torch.half, correct_bias=None, projector=None,
@@ -452,7 +485,9 @@ def qweight_update_fn(qweight: torch.nn.Parameter, exp_avg_s: torch.Tensor=None,
     elif isinstance(qweight, MPQWeightParameter):
 
         # unpack qweight
-        w = gptq_stype_unpacking(qweight).to(dtype)
+        w, z_unpacked = gptq_style_unpacking(qweight)
+        w = w.to(dtype)
+        z_unpacked = z_unpacked.to(dtype)
 
         # Decay the first and second moment running average coefficient
         # In-place operations to update the averages at the same time
@@ -475,11 +510,19 @@ def qweight_update_fn(qweight: torch.nn.Parameter, exp_avg_s: torch.Tensor=None,
 
         w.add_(norm_grad, alpha=-step_size)
 
-        if weight_decay > 0.0:
-            w.add_(w, alpha=(-lr * weight_decay))
+        # ===== update zeros ===== #
+        # We are not performing the gradient update for 'zeros' in the conventional way.
+        # Instead, we are making a special handling here because, although 'zeros' is of fp data type,
+        # in our optimization scenario, it is tied to the updates of 'qweight'.
+        # Moreover, 'zeros' is not always updated but interacts with 'qweight' at a relatively sparse frequency.
+        # If we were to update 'zeros' as a regular fp-parameter, it might not allow us the flexibility
+        # to design these interactions conveniently.
+        # Considering this is a beta version, future updates and adjustments might be possible.
+        if step % 5 == 0:
+            update_zeros(qweight, w, norm_grad, step_size, z_unpacked)
 
         # pack fp weight back to Q-weight and update qweight data
-        qweight.data = pack_fp_weight(w, qweight)
+        qweight.data = pack_fp_weight(w, qweight, z_unpacked)
 
         # manually empty cuda cache.
         del w

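Taken together, the MPQ branch of qweight_update_fn now runs an unpack, update, occasionally-adjust-zeros, repack loop. A simplified sketch of that control flow, with unpack_fn, pack_fn, and update_zeros_fn standing in for gptq_style_unpacking, pack_fp_weight, and update_zeros from the diff:

import torch

def mpq_step(qweight, norm_grad, step, step_size,
             unpack_fn, pack_fn, update_zeros_fn, zeros_every=5):
    """One simplified optimizer step for a packed MPQ weight.

    unpack_fn / pack_fn / update_zeros_fn are stand-ins for
    gptq_style_unpacking, pack_fp_weight and update_zeros in the diff.
    """
    # 1. Reconstruct fp16 weights and unpacked zero points from the packed parameter.
    w, z_unpacked = unpack_fn(qweight)

    # 2. Apply the gradient step in floating point.
    w.add_(norm_grad, alpha=-step_size)

    # 3. Only touch the zero points at a sparse frequency, as in the diff (every 5 steps).
    if step % zeros_every == 0:
        update_zeros_fn(qweight, w, norm_grad, step_size, z_unpacked)

    # 4. Re-quantize and pack the updated weights back into the int32 storage.
    qweight.data = pack_fn(w, qweight, z_unpacked)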
bitorch_engine/utils/quant_operators.py

Lines changed: 26 additions & 2 deletions

@@ -1,4 +1,5 @@
 from typing import Tuple
+import math
 
 import torch
 
@@ -306,7 +307,7 @@ def q4_quantization(input: torch.Tensor, scale_a: torch.Tensor=None, eps: torch.
     return (input / scale_a).round().clamp(Qn, Qp)
 
 
-def gptq_stype_unpacking(qweight) -> torch.Tensor:
+def gptq_style_unpacking(qweight) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Reconstructs the fp16 weight tensor from the input quantized weight parameter in GPTQ style.
 
@@ -341,4 +342,27 @@ def gptq_stype_unpacking(qweight) -> torch.Tensor:
     else:
         weights = weight * qweight.scales[qweight.g_idx.long()] - qweight.zeros[qweight.g_idx.long()]
 
-    return weights
+    return weights, zeros
+
+
+def gptq_style_zeros_packing(zeros: torch.Tensor, w_bit: int, out_features: int, group_size: int) -> torch.Tensor:
+    """
+    Packs the zeros tensor in GPTQ style for efficient storage and computation.
+
+    Args:
+        zeros (torch.Tensor): Input tensor containing zeros.
+        w_bit (int): Number of bits for weight quantization.
+        out_features (int): Number of output features.
+        group_size (int): Size of the group for packing.
+
+    Returns:
+        torch.Tensor: Packed tensor with reduced storage.
+    """
+
+    zeros = zeros.reshape(zeros.shape[0], math.ceil(out_features // 32 * w_bit), 32//w_bit).to(torch.int32)
+    zeros_pack = zeros - 1
+    wf = torch.arange(0, 32, w_bit, device=zeros.device, dtype=torch.int32)
+    zeros_pack = torch.bitwise_and(zeros_pack, (2 ** w_bit) - 1)
+    zeros_pack = torch.bitwise_left_shift(zeros_pack.to(torch.int32), wf.unsqueeze(0).unsqueeze(1))
+    zeros_pack = zeros_pack.sum(dim=-1).to(torch.int32)
+    return zeros_pack

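gptq_style_zeros_packing stores (z - 1) values w_bit bits at a time inside int32 words, which is the inverse of the unpacking branch added to pack_fp_weight (shift right, mask, add 1). A quick round-trip check of that bit layout, written independently of the library (sizes and the pack/unpack helpers here are illustrative):

import torch

w_bit = 4
vals_per_int32 = 32 // w_bit          # 8 nibbles per int32 word
groups, out_features = 2, 16          # illustrative sizes; out_features is a multiple of 32 // w_bit

# Unpacked zero points in the usual GPTQ convention (stored values are z - 1).
zeros = torch.randint(1, 2 ** w_bit + 1, (groups, out_features), dtype=torch.int32)

# Pack: subtract 1, mask to w_bit, shift each value into its slot, sum into int32 words.
z = (zeros - 1) & (2 ** w_bit - 1)
shifts = torch.arange(0, 32, w_bit, dtype=torch.int32)
packed = (z.reshape(groups, -1, vals_per_int32) << shifts).sum(-1).to(torch.int32)

# Unpack: shift right, mask, add 1 (mirrors the branch added to pack_fp_weight).
unpacked = (packed.unsqueeze(2) >> shifts) & (2 ** w_bit - 1)
unpacked = (unpacked + 1).reshape(groups, out_features)

assert torch.equal(unpacked, zeros)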
docker/build_scripts/install_modified_pytorch.sh

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ fi
 if [ "${from_image}" == "pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel" ]; then
   gdrive_id="1LjFNImboq8QeFSompMS2gPjBRYtP2Dsz"
   file="torch-2.2.2-cp310-cp310-linux_x86_64.whl"
-  checksum="2a5953dab7be6c1640112e38ae7519ad88180d9fa79faab6c86dbee6b1cc210e"
+  checksum="bcc0ba7f121ee2f42ed0a59f01d4e3d70f82a8981be0be25c5e0fe0635a54b2d"
 fi
 #if [ "${from_image}" == "pytorch/pytorch:X.X.X-cudaXX.X-cudnn8-devel" ]; then
 #  gdrive_id="xxx"

version.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-0.2.0
+0.2.4
