
Commit 3ef14dc

Adding LoKrModel Class to paddle.peft library (#9269)
* passing pre-commit
* removing tp and pp logic for single gpu training
* add disable_lokr attribute in lokr_layer
* refine comments
* add lokr tests and modified layer bug
* add lokrtests
* add lokrtests
* add lokr_argument.json
* add integration test, fix bugs based on tests.
* refactor lora_dim to lokr_dim
* no inference
* add more tests
* resolve merge conflict
* add more randtests
* pass isort check(maybe)
1 parent 8d8a42c commit 3ef14dc

File tree

15 files changed: +1297 −2 lines changed
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+{
+    "model_name_or_path": "meta-llama/Meta-Llama-3-8B",
+    "dataset_name_or_path": "./data",
+    "output_dir": "./checkpoints/lokr_ckpts",
+    "lokr": true,
+    "per_device_train_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "num_train_epochs": 1,
+    "learning_rate": 2e-05,
+    "lr_scheduler_type": "linear",
+    "attention_probs_dropout_prob": 0,
+    "hidden_dropout_prob": 0,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "steps",
+    "save_steps": 500,
+    "src_length": 512,
+    "max_length": 512,
+    "bf16": true,
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": false,
+    "load_best_model_at_end": false,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": false,
+    "save_total_limit": 100,
+    "fp16_opt_level": "O2",
+    "sharding": "stage2",
+    "zero_padding": false,
+    "use_flash_attention": false,
+    "unified_checkpoint": true
+}
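As a minimal sketch of how this argument JSON is consumed: `run_finetune.py` (modified later in this commit) accepts the JSON path as its first command-line argument. The snippet below simply loads the file and surfaces the LoKr-specific switches; the path is an assumed placement (the commit message names the file lokr_argument.json, but its directory is not shown here).

```python
import json

# Hypothetical location; this diff does not show where the JSON is placed in the repo.
config_path = "./config/llama/lokr_argument.json"

with open(config_path, "r") as f:
    args = json.load(f)

# The keys that switch on LoKr fine-tuning in run_finetune.py (see the diff below):
print("LoKr enabled:", args.get("lokr", False))
print("LoKr rank (lokr_dim, defaults to 8 if unset):", args.get("lokr_dim", 8))
```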

llm/docs/finetune.md

Lines changed: 2 additions & 0 deletions
@@ -130,6 +130,8 @@ python merge_lora_params.py \
 - `neftune_noise_alpha`: NEFT alpha parameter. Defaults to 5.0.
 - `vera`: Whether to enable the VeRA fine-tuning strategy. Defaults to False.
 - `vera_rank`: The rank value used in the VeRA algorithm. Defaults to 8.
+- `lokr`: Whether to enable the LoKr fine-tuning strategy. Defaults to False.
+- `lokr_dim`: The rank value used in the LoKr algorithm. Defaults to 8.
 - `use_long_sequence_strategies`: Whether to use long-sequence extension strategies. Defaults to False.
 - `strategy_type`: The type of long-sequence extension strategy. Defaults to None.
 - `strategy_name`: The specific name of the long-sequence extension strategy. Defaults to None.

llm/run_finetune.py

Lines changed: 17 additions & 0 deletions
@@ -36,6 +36,8 @@
 )
 from paddlenlp.metrics import BLEU, Rouge1, Rouge2, RougeL
 from paddlenlp.peft import (
+    LoKrConfig,
+    LoKrModel,
     LoRAConfig,
     LoRAModel,
     PrefixConfig,
@@ -451,6 +453,21 @@ def neft_post_hook(module, input, output):

         model.print_trainable_parameters()

+    if model_args.lokr:
+        if model_args.lokr_path is None:
+            target_modules = get_lora_target_modules(model)
+            lokr_config = LoKrConfig(
+                target_modules=target_modules,
+                lokr_dim=model_args.lokr_dim,
+                dtype=dtype,
+                base_model_name_or_path=model_args.model_name_or_path,
+            )
+            model = LoKrModel(model, lokr_config)
+        else:
+            model = LoKrModel.from_pretrained(model=model, lokr_path=model_args.lokr_path)
+
+    # For debugging purposes, you can print the model to see which layers were transformed into LoKr layers.
+    # print(model)
     if model_args.reft:
         intervention_dtype = dtype
         intervention_params = {
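To make the new branch above concrete, here is a minimal, hedged sketch of wrapping a base model with LoKr outside the training script, using only the LoKrConfig fields introduced by this commit. The checkpoint name comes from the argument JSON above; the target-module patterns and lokr_alpha value are illustrative assumptions (the script itself derives target modules via get_lora_target_modules).

```python
from paddlenlp.peft import LoKrConfig, LoKrModel
from paddlenlp.transformers import AutoModelForCausalLM

# Base checkpoint taken from the argument JSON above.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", dtype="bfloat16")

lokr_config = LoKrConfig(
    # Assumed target pattern; run_finetune.py derives it per model instead.
    target_modules=[".*q_proj.*", ".*v_proj.*"],
    lokr_dim=8,        # rank of the LoKr adapter factors (maps to --lokr_dim)
    lokr_alpha=16.0,   # assumed value; scaling = lokr_alpha / lokr_dim
    dtype="bfloat16",
    base_model_name_or_path="meta-llama/Meta-Llama-3-8B",
)
model = LoKrModel(model, lokr_config)

# Reloading an adapter checkpoint later, mirroring the else-branch in run_finetune.py:
# model = LoKrModel.from_pretrained(model=model, lokr_path="./checkpoints/lokr_ckpts")
```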

llm/tools/merge_lokr_params.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+import paddle
+
+from paddlenlp.peft import LoKrConfig, LoKrModel
+from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from paddlenlp.utils.env import CONFIG_NAME
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name_or_path", default=None, help="The directory of the pretrained model.")
+    parser.add_argument("--lokr_path", default="", help="The directory of the LoKr parameters.")
+    parser.add_argument(
+        "--merge_lokr_model_path",
+        default="",
+        help="The directory of the merged parameters.",
+    )
+    parser.add_argument("--device", type=str, default="gpu", help="Device")
+    parser.add_argument(
+        "--low_gpu_mem", type=bool, default=True, help="Whether to use low gpu memory. Default to True"
+    )
+    return parser.parse_args()
+
+
+def weight_process(name, lokr_config, state_dict):
+    weight = state_dict.pop(name + ".weight")
+    use_w1 = (name + ".lokr_w1") in state_dict
+    use_w2 = (name + ".lokr_w2") in state_dict
+    if use_w1:
+        lokr_w1 = state_dict.pop(name + ".lokr_w1")
+    else:
+        lokr_w1_a = state_dict.pop(name + ".lokr_w1_a")
+        lokr_w1_b = state_dict.pop(name + ".lokr_w1_b")
+    if use_w2:
+        lokr_w2 = state_dict.pop(name + ".lokr_w2")
+    else:
+        lokr_w2_a = state_dict.pop(name + ".lokr_w2_a")
+        lokr_w2_b = state_dict.pop(name + ".lokr_w2_b")
+
+    scaling = lokr_config.lokr_alpha / lokr_config.lokr_dim
+
+    adapter_weight = (
+        scaling
+        * paddle.kron(lokr_w1 if use_w1 else lokr_w1_a @ lokr_w1_b, lokr_w2 if use_w2 else lokr_w2_a @ lokr_w2_b).T
+    )
+    state_dict[name + ".weight"] = weight + adapter_weight
+
+
+def merge():
+    args = parse_arguments()
+    paddle.set_device(args.device)
+
+    lokr_config = LoKrConfig.from_pretrained(args.lokr_path)
+    if lokr_config.base_model_name_or_path is None:
+        if args.model_name_or_path is None:
+            raise ValueError("We cannot find a valid model_name_or_path.")
+        else:
+            lokr_config.base_model_name_or_path = args.model_name_or_path
+
+    if os.path.isfile(os.path.join(args.lokr_path, CONFIG_NAME)):
+        config = AutoConfig.from_pretrained(args.lokr_path)
+    elif args.model_name_or_path is not None:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        raise ValueError(
+            f"We cannot find config.json in lokr_path: {args.lokr_path} or find a valid model_name_or_path."
+        )
+    config.dtype = lokr_config.dtype
+    if (
+        lokr_config.dtype == "bfloat16" or config.quantization_config.weight_quantize_algo in ["nf4", "fp4"]
+    ) and args.device == "cpu":
+        raise ValueError("We cannot apply a bfloat16 or nf4/fp4 LoKr merge on cpu.")
+
+    # Wrapping this with device_guard() would cause SVD decomposition to fail.
+    model = AutoModelForCausalLM.from_pretrained(
+        lokr_config.base_model_name_or_path,
+        config=config,
+        low_cpu_mem_usage=True,
+    )
+    model = LoKrModel.from_pretrained(model=model, lokr_path=args.lokr_path, lokr_config=lokr_config)
+
+    model.eval()
+    model_state_dict = model.model.state_dict()
+    lokr_name_list = []
+
+    for key in model_state_dict.keys():
+        if "lokr" in key:
+            lokr_name_list.append(key.split(".lokr")[0])
+
+    lokr_name_list = list(set(lokr_name_list))
+    for name in lokr_name_list:
+        weight_process(name, lokr_config, model_state_dict)
+
+    model.model.save_pretrained(args.merge_lokr_model_path, state_dict=model_state_dict)
+    tokenizer = AutoTokenizer.from_pretrained(lokr_config.base_model_name_or_path)
+    tokenizer.save_pretrained(args.merge_lokr_model_path)
+
+
+if __name__ == "__main__":
+    merge()
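weight_process() rebuilds each full-rank update as a scaled Kronecker product of the (possibly factored) W1 and W2 blocks and adds it to the dense weight. Below is a self-contained sketch of that arithmetic with arbitrary illustrative shapes, not taken from the commit:

```python
import paddle

# Illustrative shapes only: a Linear weight of size (in=64, out=128).
lokr_dim = 8
lokr_alpha = 16.0

# W1 kept dense, W2 factored into two rank-`lokr_dim` matrices, as in weight_process().
lokr_w1 = paddle.randn([16, 8])
lokr_w2_a = paddle.randn([8, lokr_dim])
lokr_w2_b = paddle.randn([lokr_dim, 8])

scaling = lokr_alpha / lokr_dim
# kron of (16x8) and (8x8) gives a (128x64) update, i.e. (out_features, in_features);
# the .T matches paddle.nn.Linear's weight layout of (in_features, out_features).
adapter_weight = scaling * paddle.kron(lokr_w1, lokr_w2_a @ lokr_w2_b).T

base_weight = paddle.randn([64, 128])  # (in_features, out_features)
merged = base_weight + adapter_weight
print(adapter_weight.shape, merged.shape)  # [64, 128] [64, 128]
```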

llm/utils/argument.py

Lines changed: 7 additions & 0 deletions
@@ -223,6 +223,13 @@ class ModelArgument:
     vera: bool = field(default=False, metadata={"help": "Whether to use vera technique"})
     vera_rank: int = field(default=8, metadata={"help": "Vera attention dimension"})

+    # LoKr-related parameters
+    lokr: bool = field(default=False, metadata={"help": "Whether to use LoKr technique"})
+    lokr_path: str = field(
+        default=None, metadata={"help": "Initialize lokr state dict and apply customized lokr config"}
+    )
+    lokr_dim: int = field(default=8, metadata={"help": "Rank of the LoKr adapter matrix"})
+
     # prefix tuning related parameters
     prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"})
     prefix_path: str = field(default=None, metadata={"help": "Initialize prefix state dict."})
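These fields plug into PaddleNLP's dataclass-based argument parsing. Below is a minimal, hedged sketch assuming `PdArgumentParser` mirrors the Hugging Face `HfArgumentParser` API; the `MiniLoKrArgs` dataclass is a trimmed stand-in for `ModelArgument`, not part of the commit.

```python
from dataclasses import dataclass, field

from paddlenlp.trainer import PdArgumentParser


@dataclass
class MiniLoKrArgs:
    # Trimmed copy of the new ModelArgument fields, for illustration only.
    lokr: bool = field(default=False, metadata={"help": "Whether to use LoKr technique"})
    lokr_path: str = field(default=None, metadata={"help": "Initialize lokr state dict"})
    lokr_dim: int = field(default=8, metadata={"help": "Rank of the LoKr adapter matrix"})


parser = PdArgumentParser(MiniLoKrArgs)
# Equivalent to passing --lokr true --lokr_dim 4 on the command line.
(args,) = parser.parse_args_into_dataclasses(["--lokr", "true", "--lokr_dim", "4"])
print(args.lokr, args.lokr_dim, args.lokr_path)  # True 4 None
```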

paddlenlp/peft/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+from .lokr import LoKrConfig, LoKrModel
 from .lora import LoRAConfig, LoRAModel
 from .prefix import PrefixConfig, PrefixModelForCausalLM
 from .reft import ReFTModel

paddlenlp/peft/lokr/__init__.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .lokr_config import LoKrConfig
+from .lokr_layers import LoKrLinear
+from .lokr_model import LoKrModel
+
+__all__ = ["LoKrConfig", "LoKrModel", "LoKrLinear"]

paddlenlp/peft/lokr/lokr_config.py

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from dataclasses import asdict, dataclass, field
+from typing import List, Optional, Union
+
+from ...utils.env import LOKR_CONFIG_NAME
+
+
+@dataclass
+class LoKrConfig:
+    """
+    This is the configuration class to store the configuration of a [`LoKrModel`].
+    Convention of LoKrModel: W1 is referred to as the scaling matrix and W2 as the adapter matrix.
+    Args:
+        target_modules (`Union[List[str], str]`): The names of the modules to apply LoKr to.
+        trainable_modules (`List[str]`): The names of the modules to train when applying LoKr.
+        lokr_alpha (`float`): The alpha parameter for LoKr scaling.
+        merge_weight (`bool`):
+            Whether to merge the weights of the LoKr layers with the base transformer model in `eval` mode.
+    """
+
+    base_model_name_or_path: Optional[str] = field(
+        default=None, metadata={"help": "The name of the base model to use."}
+    )
+    target_modules: Optional[Union[List[str], str]] = field(
+        default=None,
+        metadata={
+            "help": "List of module names or regex expression of the module names to replace with LoKr."
+            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
+        },
+    )
+    trainable_modules: Optional[List[str]] = field(
+        default=None,
+        metadata={
+            "help": "List of module names or regex expression of the module names to train when applying LoKr."
+            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
+        },
+    )
+    trainable_bias: Optional[str] = field(
+        default=None, metadata={"help": "Define trainable bias parameters for the LoKr model."}
+    )
+    lokr_dim: int = field(default=8, metadata={"help": "Rank of the LoKr adapter matrix"})
+    factor: int = field(default=-1, metadata={"help": "Determine the decomposition size of LoKr matrices"})
+    decompose_both: bool = field(
+        default=False,
+        metadata={"help": "Determine whether to decompose both the scaling matrix and the adapter matrix"},
+    )
+    lokr_alpha: float = field(
+        default=0.0, metadata={"help": "Determine the scaling of the adapter weight, following the LoKr convention"}
+    )
+    merge_weight: bool = field(
+        default=False, metadata={"help": "Merge the weights of the original model and the LoKr model"}
+    )
+    tensor_parallel_degree: int = field(default=-1, metadata={"help": "-1 means tensor parallelism is not used"})
+    dtype: Optional[str] = field(default=None, metadata={"help": "The data type of the tensors"})
+
+    @property
+    def __dict__(self):
+        return asdict(self)
+
+    def to_dict(self):
+        return self.__dict__
+
+    @property
+    def scaling(self):
+        if not (self.lokr_alpha or self.lokr_dim):
+            return 1.0
+        return self.lokr_alpha / self.lokr_dim
+
+    def save_pretrained(self, save_directory):
+        r"""
+        This method saves the configuration of your adapter model in a directory.
+        Args:
+            save_directory (`str`):
+                The directory where the configuration will be saved.
+        """
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        output_dict = self.__dict__
+        output_dict["scaling"] = self.scaling
+        output_path = os.path.join(save_directory, LOKR_CONFIG_NAME)
+
+        # save it
+        with open(output_path, "w") as writer:
+            writer.write(json.dumps(output_dict, indent=2, sort_keys=True))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        This method loads the configuration of your adapter model from a directory.
+        Args:
+            pretrained_model_name_or_path (`str`):
+                The directory or the hub-id where the configuration is saved.
+            **kwargs:
+                Additional keyword arguments passed along to the child class initialization.
+        """
+        if os.path.isfile(os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME)):
+            config_file = os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME)
+        else:
+            raise ValueError(f"Can't find lokr_config.json at '{pretrained_model_name_or_path}'")
+
+        loaded_attributes = cls.from_json_file(config_file)
+        loaded_attributes.pop("scaling", None)
+
+        config = cls(**kwargs)
+
+        for key, value in loaded_attributes.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+
+        return config
+
+    @classmethod
+    def from_json_file(cls, path_json_file):
+        r"""
+        Loads a configuration file from a json file.
+        Args:
+            path_json_file (`str`):
+                The path to the json file.
+        """
+        with open(path_json_file, "r") as file:
+            json_object = json.load(file)
+
+        return json_object
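A short usage sketch of the configuration API defined above, exercising only methods present in this file; the adapter directory and target-module pattern are arbitrary examples, not taken from the commit:

```python
from paddlenlp.peft import LoKrConfig

config = LoKrConfig(
    target_modules=[".*q_proj.*", ".*v_proj.*"],  # illustrative pattern
    lokr_dim=8,
    lokr_alpha=16.0,
    dtype="bfloat16",
)
print(config.scaling)  # 2.0, i.e. lokr_alpha / lokr_dim

# Round-trip through disk: save_pretrained writes lokr_config.json,
# from_pretrained reads it back and drops the derived "scaling" entry.
config.save_pretrained("./lokr_adapter")  # example directory
restored = LoKrConfig.from_pretrained("./lokr_adapter")
assert restored.lokr_dim == 8 and restored.lokr_alpha == 16.0
```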
