Commit 877b5a7

Add tensor parallelism on QLoRA (#2424)
1 parent 3a95dfb commit 877b5a7

File tree

10 files changed: +161 -13 lines changed


.github/workflows/lint.yml

Lines changed: 0 additions & 4 deletions
@@ -36,7 +36,6 @@ jobs:
 -e no_proxy \
 -e python_version \
 -w /workspace ${docker_image}
-
 - name: Download Code
 env:
 work_dir: ${{ github.workspace }}
@@ -64,18 +63,15 @@ jobs:
 echo "Not in a pull_request event. Skipping PR-specific operations."
 fi
 git log --pretty=oneline -10
-
 if ! git show-ref --quiet refs/heads/develop; then \
 echo "local develop branch is missing, creating local develop branch that tracks remote develop branch"
 git fetch origin develop
 git branch develop --track origin/develop
 else
 echo "local develop branch exist, skipping"
 fi
-
 unset http_proxy && unset https_proxy
 '
-
 - name: Setup Environment
 run: |
 docker exec -t $container_name /bin/bash -c '

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -60,4 +60,4 @@ repos:
 entry: python scripts/codestyle/check_dead_links.py
 language: python
 files: \.(md|markdown|rst)$
-pass_filenames: true
+pass_filenames: true

(The removed and re-added pass_filenames line appears to differ only in whitespace, which is not visible in this rendering.)

paddleformers/peft/lora/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -16,3 +16,4 @@
 from .lora_config import LoRAAutoConfig, LoRAConfig
 from .lora_layers import ColumnParallelLoRALinear, LoRALinear, RowParallelLoRALinear
 from .lora_model import LoRAModel
+from .lora_quantization_layers import QuantizationLoRABaseLinear
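
Re-exporting QuantizationLoRABaseLinear here lets callers (for example the trainer change further below) import it from the subpackage rather than from the module file. A minimal sketch, assuming this revision of paddleformers is installed:

# Both import paths resolve to the same class after this change.
from paddleformers.peft.lora import QuantizationLoRABaseLinear
from paddleformers.peft.lora.lora_quantization_layers import (
    QuantizationLoRABaseLinear as _SameClass,
)

assert QuantizationLoRABaseLinear is _SameClass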

paddleformers/peft/lora/lora_quantization_layers.py

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ def __init__(self, layer, lora_config):
         else:
             self.weight_scale = layer.weight_scale
         self.bias = layer.bias
-
         # LoRA related parameters
         self.lora_config = lora_config
         if not isinstance(self.lora_config.r, int) or self.lora_config.r <= 0:

paddleformers/quantization/qlora.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def qlora_weight_quantize(
         return quant_weight, (qweight_scale, double_weight_scale, quant_sacle_offset)
     qweight_scale_name = f"{linear_name}.qweight_scale" if linear_name else "qweight_scale"
     double_weight_scale_name = f"{linear_name}.double_weight_scale" if linear_name else "double_weight_scale"
-    quant_sacle_offset_name = f"{linear_name}.quant_sacle_offset" if linear_name else "quant_sacle_offset"
+    quant_sacle_offset_name = f"{linear_name}.weight_scale_offset" if linear_name else "weight_scale_offset"
     qlora_state_dict = {
         qweight_scale_name: qweight_scale,
         double_weight_scale_name: double_weight_scale,
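
The functional change in this hunk is the serialized key name for the scale offset. As a quick illustration (the helper and the layer name below are hypothetical, not part of paddleformers), the per-layer scale keys produced under double quantization now look like this:

# Illustrative only: mirrors the naming scheme visible in the hunk above.
def qlora_state_dict_keys(linear_name=None):
    prefix = f"{linear_name}." if linear_name else ""
    return [
        f"{prefix}qweight_scale",
        f"{prefix}double_weight_scale",
        f"{prefix}weight_scale_offset",  # previously "quant_sacle_offset"
    ]

print(qlora_state_dict_keys("model.layers.0.mlp.gate_proj"))
# ['model.layers.0.mlp.gate_proj.qweight_scale',
#  'model.layers.0.mlp.gate_proj.double_weight_scale',
#  'model.layers.0.mlp.gate_proj.weight_scale_offset']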

paddleformers/quantization/quantization_linear.py

Lines changed: 137 additions & 0 deletions
@@ -357,6 +357,7 @@ def __init__(
                     dtype="float32",
                     is_bias=False,
                 )
+                self.weight_scale = None
             else:
                 self.weight_scale = self.create_parameter(
                     shape=[in_features * out_features // self.quantization_config.qlora_weight_blocksize],
@@ -496,6 +497,74 @@ def __init__(
             self.activation_scale.is_distributed = False
             self.activation_scale.stop_gradient = True
             self.group = get_activation_scale_group()
+        elif self.weight_quantize_algo in ["nf4", "fp4"]:
+            if qlora_weight_linear is None:
+                raise ImportError(
+                    "Please run the following commands to install: qlora related package first\n"
+                    "1) git clone https://github.com/PaddlePaddle/PaddleSlim \n"
+                    "2) cd PaddleSlim && pip install -e .\n"
+                    "3) cd csrc && python ./setup_cuda.py install"
+                )
+            # print(self.output_size_per_partition, in_features)
+            self.quant_weight = self.create_parameter(
+                shape=[self.output_size_per_partition * in_features // 2, 1],
+                attr=paddle.nn.initializer.Constant(value=0),
+                dtype="uint8",
+                is_bias=False,
+            )
+            self.quant_weight.is_distributed = True if self.is_mp else False
+            if self.quant_weight.is_distributed:
+                self.quant_weight.split_axis = 0
+            if self.quantization_config.qlora_weight_double_quant:
+                # quantized weight_scale
+                self.qweight_scale = self.create_parameter(
+                    shape=[
+                        in_features * self.output_size_per_partition // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="uint8",
+                    is_bias=False,
+                )
+                # double weight_scale: weight_scale of quantized weight_scale
+                self.qweight_scale.stop_gradient = True
+                self.qweight_scale.is_distributed = True if self.is_mp else False
+                if self.qweight_scale.is_distributed:
+                    self.qweight_scale.split_axis = 0
+                self.double_weight_scale = self.create_parameter(
+                    shape=[
+                        in_features
+                        * self.output_size_per_partition
+                        // self.quantization_config.qlora_weight_blocksize
+                        // self.quantization_config.qlora_weight_double_quant_block_size
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.double_weight_scale.stop_gradient = True
+                self.double_weight_scale.is_distributed = True if self.is_mp else False
+                if self.double_weight_scale.is_distributed:
+                    self.double_weight_scale.split_axis = 0
+                self.weight_scale_offset = self.create_parameter(
+                    shape=[],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale_offset.stop_gradient = True
+                self.weight_scale_offset.is_distributed = True if self.is_mp else False
+                if self.weight_scale_offset.is_distributed:
+                    self.weight_scale_offset.split_axis = 0
+            else:
+                self.weight_scale = self.create_parameter(
+                    shape=[
+                        in_features * self.output_size_per_partition // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale.stop_gradient = True
+                self.weight_scale.is_distributed = True if self.is_mp else False
+                if self.weight_scale.is_distributed:
+                    self.weight_scale.split_axis = 0
+
         else:
             raise NotImplementedError(f"Not yet support weight_quantize_algo: {self.weight_quantize_algo}")
         if bias_attr is False:
@@ -647,6 +716,74 @@ def __init__(
             self.activation_scale.is_distributed = False
             self.activation_scale.stop_gradient = True
             self.group = get_activation_scale_group(is_row=True)
+        elif self.weight_quantize_algo in ["nf4", "fp4"]:
+            if qlora_weight_linear is None:
+                raise ImportError(
+                    "Please run the following commands to install: qlora related package first\n"
+                    "1) git clone https://github.com/PaddlePaddle/PaddleSlim \n"
+                    "2) cd PaddleSlim && pip install -e .\n"
+                    "3) cd csrc && python ./setup_cuda.py install"
+                )
+            self.quant_weight = self.create_parameter(
+                shape=[out_features * self.input_size_per_partition // 2, 1],
+                attr=paddle.nn.initializer.Constant(value=0),
+                dtype="uint8",
+                is_bias=False,
+            )
+            self.quant_weight.is_distributed = True if self.is_mp else False
+            if self.quant_weight.is_distributed:
+                self.quant_weight.split_axis = 1
+            if self.quantization_config.qlora_weight_double_quant:
+                # quantized weight_scale
+                self.qweight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition * out_features // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="uint8",
+                    is_bias=False,
+                )
+                self.qweight_scale.stop_gradient = True
+                self.qweight_scale.is_distributed = True if self.is_mp else False
+                if self.qweight_scale.is_distributed:
+                    self.qweight_scale.split_axis = 0
+                # double weight_scale: weight_scale of quantized weight_scale
+                self.double_weight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition
+                        * out_features
+                        // self.quantization_config.qlora_weight_blocksize
+                        // self.quantization_config.qlora_weight_double_quant_block_size
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.double_weight_scale.stop_gradient = True
+                self.double_weight_scale.is_distributed = True if self.is_mp else False
+                if self.double_weight_scale.is_distributed:
+                    self.double_weight_scale.split_axis = 1
+                self.weight_scale_offset = self.create_parameter(
+                    shape=[],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale_offset.stop_gradient = True
+                self.weight_scale_offset.is_distributed = True if self.is_mp else False
+                if self.weight_scale_offset.is_distributed:
+                    self.weight_scale_offset.split_axis = 0
+            else:
+                self.weight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition * out_features // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+
+                self.weight_scale.stop_gradient = True
+                self.weight_scale.is_distributed = True if self.is_mp else False
+                if self.weight_scale.is_distributed:
+                    self.weight_scale.split_axis = 0
+
         else:
             raise NotImplementedError(f"Not yet support weight_quantize_algo: {self.weight_quantize_algo}")
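
These added branches are the core of the tensor-parallel QLoRA support: the packed nf4/fp4 weight stores two 4-bit codes per uint8 element, and every scale tensor is sized from the per-rank partition (output_size_per_partition in the column-parallel hunk, input_size_per_partition in the row-parallel one) rather than the full weight. A rough, illustrative restatement of the column-parallel shape arithmetic follows; the helper, its default block sizes, and the example dimensions are assumptions, not library code.

# Illustrative helper: reproduces the shape arithmetic from the diff above for
# the column-parallel case with double quantization enabled.
def column_parallel_nf4_shapes(in_features, out_features, mp_degree,
                               blocksize=64, double_quant_blocksize=256):
    out_per_rank = out_features // mp_degree        # output_size_per_partition
    n_codes = out_per_rank * in_features            # one 4-bit code per weight
    return {
        "quant_weight": [n_codes // 2, 1],          # two 4-bit codes packed per uint8
        "qweight_scale": [n_codes // blocksize],    # uint8, itself quantized
        "double_weight_scale": [n_codes // blocksize // double_quant_blocksize],
        "weight_scale_offset": [],                  # scalar float32
    }

# Example: a 4096x4096 projection split across 2 model-parallel ranks.
print(column_parallel_nf4_shapes(4096, 4096, mp_degree=2))

The row-parallel branch mirrors this arithmetic and sets split_axis = 1 on the packed weight instead of 0.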

paddleformers/quantization/quantization_utils.py

Lines changed: 2 additions & 2 deletions
@@ -197,7 +197,7 @@ def convert_to_qlora_state_dict(state_dict, name, quantization_config, dtype, we
     else:
         qweight_scale_name = name + ".qweight_scale"
         double_weight_scale_name = name + ".double_weight_scale"
-        quant_sacle_offset_name = name + ".quant_sacle_offset"
+        quant_sacle_offset_name = name + ".weight_scale_offset"
         quant_name_list += [qweight_scale_name, double_weight_scale_name, quant_sacle_offset_name]

     if all(quant_name in state_dict for quant_name in quant_name_list):
@@ -252,7 +252,7 @@ def update_loaded_state_dict_keys(state_dict, quantization_linear_list, quantiza
         activation_scale_name = name + ".activation_scale"
         qweight_scale_name = name + ".qweight_scale"
         double_weight_scale_name = name + ".double_weight_scale"
-        quant_sacle_offset_name = name + ".quant_sacle_offset"
+        quant_sacle_offset_name = name + ".weight_scale_offset"

         if quant_weight_name in state_dict and weight_scale_name in state_dict:
             continue

paddleformers/trainer/trainer.py

Lines changed: 13 additions & 2 deletions
@@ -84,6 +84,7 @@
     init_dataloader_comm_group,
 )
 from ..peft import LoKrModel, LoRAModel, PrefixModelForCausalLM, ReFTModel, VeRAModel
+from ..peft.lora import QuantizationLoRABaseLinear
 from ..quantization.quantization_linear import (
     ColumnParallelQuantizationLinear,
     QuantizationLinear,
@@ -524,7 +525,12 @@ def _wrap_amp_model(self, args, model):
             models=model,
             level=self.args.fp16_opt_level,
             dtype=self.amp_dtype,
-            excluded_layers=[QuantizationLinear, ColumnParallelQuantizationLinear, RowParallelQuantizationLinear]
+            excluded_layers=[
+                QuantizationLinear,
+                ColumnParallelQuantizationLinear,
+                RowParallelQuantizationLinear,
+                QuantizationLoRABaseLinear,
+            ]
             + self._decorate_exclude_layers(model),
         )
         # for pipeline mode and pure tensor parallel
@@ -2194,7 +2200,12 @@ def _wrap_model(self, model, training=True):
             optimizers=self.optimizer,
             level=self.args.fp16_opt_level,
             dtype=self.amp_dtype,
-            excluded_layers=[QuantizationLinear, ColumnParallelQuantizationLinear, RowParallelQuantizationLinear]
+            excluded_layers=[
+                QuantizationLinear,
+                ColumnParallelQuantizationLinear,
+                RowParallelQuantizationLinear,
+                QuantizationLoRABaseLinear,
+            ]
             + self._decorate_exclude_layers(model),
         )
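
Adding QuantizationLoRABaseLinear to excluded_layers keeps the quantized LoRA wrapper out of AMP casting, just like the existing quantization linears. A self-contained sketch of the mechanism (the toy model is an assumption; only the paddle.amp.decorate keywords mirror the calls above):

import paddle

class TinyModel(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.proj = paddle.nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)

# Layer types listed in excluded_layers are left in their original dtype under
# O2 decoration; everything else is cast to the AMP dtype.
model = paddle.amp.decorate(
    models=TinyModel(),
    level="O2",
    dtype="bfloat16",
    excluded_layers=[paddle.nn.Linear],  # stand-in for QuantizationLoRABaseLinear etc.
)
print(model.proj.weight.dtype)  # expected to remain float32, since Linear is excluded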

paddleformers/transformers/conversion_utils.py

Lines changed: 3 additions & 1 deletion
@@ -67,10 +67,12 @@ def add_quant_mapping(name_action_mappings, quantization_config):
         post_quantize = quantization_config.weight_quantize_algo in [
             "weight_only_int4",
             "weight_only_int8",
+            "nf4",
         ]
     elif isinstance(quantization_config.weight_quantize_algo, dict):
         post_quantize = any(
-            key in ["weight_only_int4", "weight_only_int8"] for key in quantization_config.weight_quantize_algo.keys()
+            key in ["weight_only_int4", "weight_only_int8", "nf4"]
+            for key in quantization_config.weight_quantize_algo.keys()
         )
     else:
         post_quantize = False
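
With this change, "nf4" also counts as a post-quantize algorithm when add_quant_mapping builds the tensor-parallel name-action mapping; the model_utils.py hunk below extends the same check with "fp4" at load time. A standalone paraphrase of the predicate (hypothetical helper, not the library code):

# Hypothetical paraphrase of the branch above; weight_quantize_algo is either a
# single algorithm string or a dict whose keys are algorithm names.
def needs_post_quantize(weight_quantize_algo) -> bool:
    post_algos = {"weight_only_int4", "weight_only_int8", "nf4"}
    if isinstance(weight_quantize_algo, str):
        return weight_quantize_algo in post_algos
    if isinstance(weight_quantize_algo, dict):
        return any(key in post_algos for key in weight_quantize_algo)
    return False

assert needs_post_quantize("nf4") is True
assert needs_post_quantize({"nf4": ["qkv_proj"]}) is True       # example dict form
assert needs_post_quantize("some_other_algo") is False          # placeholder name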

paddleformers/transformers/model_utils.py

Lines changed: 3 additions & 1 deletion
@@ -2055,10 +2055,12 @@ def _load_pretrained_model(
             post_quantize = config.quantization_config.weight_quantize_algo in [
                 "weight_only_int4",
                 "weight_only_int8",
+                "nf4",
+                "fp4",
             ]
         elif isinstance(config.quantization_config.weight_quantize_algo, dict):
             post_quantize = any(
-                key in ["weight_only_int4", "weight_only_int8"]
+                key in ["weight_only_int4", "weight_only_int8", "nf4", "fp4"]
                 for key in config.quantization_config.weight_quantize_algo.keys()
             )
         else:
