@@ -357,6 +357,7 @@ def __init__(
                 dtype="float32",
                 is_bias=False,
             )
+            self.weight_scale = None
         else:
             self.weight_scale = self.create_parameter(
                 shape=[in_features * out_features // self.quantization_config.qlora_weight_blocksize],
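
For orientation on the weight_scale shape above: block-wise QLoRA quantization stores one float32 scale per qlora_weight_blocksize weight elements, so a weight with in_features * out_features entries needs in_features * out_features // blocksize scales. A minimal sketch of that sizing arithmetic, with illustrative dimensions (plain Python, not PaddleNLP API):

# Sketch (not part of the diff): block-wise scale sizing for QLoRA.
# The dimensions and blocksize below are illustrative assumptions.
in_features, out_features = 4096, 4096
qlora_weight_blocksize = 64  # one float32 scale per 64 quantized weights

num_elements = in_features * out_features
num_scales = num_elements // qlora_weight_blocksize  # matches the shape above
print(num_scales)  # 262144 scales for a 4096 x 4096 weight
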
@@ -496,6 +497,74 @@ def __init__(
             self.activation_scale.is_distributed = False
             self.activation_scale.stop_gradient = True
             self.group = get_activation_scale_group()
+        elif self.weight_quantize_algo in ["nf4", "fp4"]:
+            if qlora_weight_linear is None:
+                raise ImportError(
+                    "Please run the following commands to install the qlora related packages first:\n"
+                    "1) git clone https://github.com/PaddlePaddle/PaddleSlim\n"
+                    "2) cd PaddleSlim && pip install -e .\n"
+                    "3) cd csrc && python ./setup_cuda.py install"
+                )
+            # print(self.output_size_per_partition, in_features)
+            self.quant_weight = self.create_parameter(
+                shape=[self.output_size_per_partition * in_features // 2, 1],
+                attr=paddle.nn.initializer.Constant(value=0),
+                dtype="uint8",
+                is_bias=False,
+            )
+            self.quant_weight.is_distributed = True if self.is_mp else False
+            if self.quant_weight.is_distributed:
+                self.quant_weight.split_axis = 0
+            if self.quantization_config.qlora_weight_double_quant:
+                # quantized weight_scale
+                self.qweight_scale = self.create_parameter(
+                    shape=[
+                        in_features * self.output_size_per_partition // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="uint8",
+                    is_bias=False,
+                )
+                self.qweight_scale.stop_gradient = True
+                self.qweight_scale.is_distributed = True if self.is_mp else False
+                if self.qweight_scale.is_distributed:
+                    self.qweight_scale.split_axis = 0
+                # double weight_scale: weight_scale of the quantized weight_scale
+                self.double_weight_scale = self.create_parameter(
+                    shape=[
+                        in_features
+                        * self.output_size_per_partition
+                        // self.quantization_config.qlora_weight_blocksize
+                        // self.quantization_config.qlora_weight_double_quant_block_size
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.double_weight_scale.stop_gradient = True
+                self.double_weight_scale.is_distributed = True if self.is_mp else False
+                if self.double_weight_scale.is_distributed:
+                    self.double_weight_scale.split_axis = 0
+                self.weight_scale_offset = self.create_parameter(
+                    shape=[],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale_offset.stop_gradient = True
+                self.weight_scale_offset.is_distributed = True if self.is_mp else False
+                if self.weight_scale_offset.is_distributed:
+                    self.weight_scale_offset.split_axis = 0
+            else:
+                self.weight_scale = self.create_parameter(
+                    shape=[
+                        in_features * self.output_size_per_partition // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale.stop_gradient = True
+                self.weight_scale.is_distributed = True if self.is_mp else False
+                if self.weight_scale.is_distributed:
+                    self.weight_scale.split_axis = 0
+
         else:
             raise NotImplementedError(f"Not yet support weight_quantize_algo: {self.weight_quantize_algo}")
         if bias_attr is False:
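
To make the double-quantization parameters above concrete: qweight_scale holds the per-block float32 scales re-quantized to 8 bits, double_weight_scale holds a second-level float32 scale per group of scales, and weight_scale_offset is the scalar mean subtracted before re-quantizing. A hedged NumPy round-trip sketch of that idea (illustrative only; the actual kernels live in PaddleSlim's csrc extension, and its exact scheme may differ):

import numpy as np

# Sketch (not part of the diff): double quantization of per-block scales.
# Group size and quantization scheme are assumptions for illustration.
rng = np.random.default_rng(0)
weight_scale = rng.uniform(0.01, 0.2, size=1024).astype("float32")  # per-block fp32 scales
group = 256  # plays the role of qlora_weight_double_quant_block_size

weight_scale_offset = weight_scale.mean()                       # scalar, shape []
centered = (weight_scale - weight_scale_offset).reshape(-1, group)
double_weight_scale = np.abs(centered).max(axis=1) / 127.0      # one fp32 per group
qweight_scale = np.clip(
    np.round(centered / double_weight_scale[:, None]) + 128, 0, 255
).astype("uint8")                                               # 8-bit storage, as above

# Dequantize: recover approximate fp32 scales from the uint8 storage.
recovered = (qweight_scale.astype("float32") - 128) * double_weight_scale[:, None] + weight_scale_offset
print(np.abs(recovered.ravel() - weight_scale).max())  # small reconstruction error
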
@@ -647,6 +716,74 @@ def __init__(
             self.activation_scale.is_distributed = False
             self.activation_scale.stop_gradient = True
             self.group = get_activation_scale_group(is_row=True)
+        elif self.weight_quantize_algo in ["nf4", "fp4"]:
+            if qlora_weight_linear is None:
+                raise ImportError(
+                    "Please run the following commands to install the qlora related packages first:\n"
+                    "1) git clone https://github.com/PaddlePaddle/PaddleSlim\n"
+                    "2) cd PaddleSlim && pip install -e .\n"
+                    "3) cd csrc && python ./setup_cuda.py install"
+                )
+            self.quant_weight = self.create_parameter(
+                shape=[out_features * self.input_size_per_partition // 2, 1],
+                attr=paddle.nn.initializer.Constant(value=0),
+                dtype="uint8",
+                is_bias=False,
+            )
+            self.quant_weight.is_distributed = True if self.is_mp else False
+            if self.quant_weight.is_distributed:
+                self.quant_weight.split_axis = 1
+            if self.quantization_config.qlora_weight_double_quant:
+                # quantized weight_scale
+                self.qweight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition * out_features // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="uint8",
+                    is_bias=False,
+                )
+                self.qweight_scale.stop_gradient = True
+                self.qweight_scale.is_distributed = True if self.is_mp else False
+                if self.qweight_scale.is_distributed:
+                    self.qweight_scale.split_axis = 0
+                # double weight_scale: weight_scale of the quantized weight_scale
+                self.double_weight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition
+                        * out_features
+                        // self.quantization_config.qlora_weight_blocksize
+                        // self.quantization_config.qlora_weight_double_quant_block_size
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.double_weight_scale.stop_gradient = True
+                self.double_weight_scale.is_distributed = True if self.is_mp else False
+                if self.double_weight_scale.is_distributed:
+                    self.double_weight_scale.split_axis = 1
+                self.weight_scale_offset = self.create_parameter(
+                    shape=[],
+                    dtype="float32",
+                    is_bias=False,
+                )
+                self.weight_scale_offset.stop_gradient = True
+                self.weight_scale_offset.is_distributed = True if self.is_mp else False
+                if self.weight_scale_offset.is_distributed:
+                    self.weight_scale_offset.split_axis = 0
+            else:
+                self.weight_scale = self.create_parameter(
+                    shape=[
+                        self.input_size_per_partition * out_features // self.quantization_config.qlora_weight_blocksize
+                    ],
+                    dtype="float32",
+                    is_bias=False,
+                )
+
+                self.weight_scale.stop_gradient = True
+                self.weight_scale.is_distributed = True if self.is_mp else False
+                if self.weight_scale.is_distributed:
+                    self.weight_scale.split_axis = 0
+
         else:
             raise NotImplementedError(f"Not yet support weight_quantize_algo: {self.weight_quantize_algo}")
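
Taken together, the two hunks give roughly a 4x reduction over fp16 storage: nf4/fp4 packs two 4-bit weights per uint8 byte (the // 2 in the quant_weight shapes), and double quantization shrinks the scale overhead from 4 bytes to about 1 byte per block. A back-of-the-envelope sketch with assumed shard sizes and blocksizes (plain Python, nothing here is PaddleNLP API):

# Sketch (not part of the diff): memory accounting for one nf4 shard.
# All dimensions and blocksizes below are illustrative assumptions.
out_features, input_size_per_partition = 4096, 4096
blocksize = 64          # qlora_weight_blocksize
double_blocksize = 256  # qlora_weight_double_quant_block_size

n = out_features * input_size_per_partition
quant_weight_bytes = n // 2                      # two 4-bit values per uint8 byte
qweight_scale_bytes = n // blocksize             # uint8 quantized per-block scales
double_scale_bytes = 4 * (n // blocksize // double_blocksize)  # fp32 scale-of-scales

total = quant_weight_bytes + qweight_scale_bytes + double_scale_bytes
print(f"nf4 + double quant: {total / 2**20:.2f} MiB vs fp16: {2 * n / 2**20:.2f} MiB")
# -> roughly 8.25 MiB vs 32.00 MiB for a 4096 x 4096 shard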