diff --git a/examples/fdkt/__init__.py b/examples/fdkt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/fdkt/fdkt.py b/examples/fdkt/fdkt.py new file mode 100644 index 0000000..7e4e0e9 --- /dev/null +++ b/examples/fdkt/fdkt.py @@ -0,0 +1,164 @@ +import yaml +from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_fdkt_runner +from fate_client.pipeline.components.fate.nn.algo_params import FDKTTrainingArguments +from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate.reader import Reader +from fate_client.pipeline.components.fate.nn.torch import nn, optim +from typing import Union, Dict +import argparse + +def main(config="../../config.yaml", param: Union[Dict, str] = None, namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + if isinstance(param, str): + param = yaml.safe_load(param) + # Load the configuration file + parties = config.parties + guest = parties.guest[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter) + pipeline.bind_local_path(path=param["datasets"]["slm_data_path"], namespace=param["data"]["guest"]["namespace"], + name=param["data"]["guest"]["name"]) + + def get_llm_conf(): + embedding_model = LLMModelLoader( + "embedding_transformer.st_model", + "SentenceTransformerModel", + model_name_or_path=param['llm']['embedding_model_path'] + ) + + dataset = LLMDatasetLoader( + "flex_dataset", + "FlexDataset", + tokenizer_name_or_path=param['llm']['pretrained_path'], + need_preprocess=True, + dataset_name="yelp_review", + data_part="train.json", + load_from="json", + few_shot_num_per_label=1, + ) + + training_args = FDKTTrainingArguments( + sample_num_per_cluster=4, + filter_prompt_max_length=2 ** 14, + filter_generation_config=dict( + max_tokens=3000, + ), + use_cpu=param['slm']['training_args']['use_cpu'], + aug_generation_config=dict( + max_tokens=3000, + temperature=0.8, + top_p=0.9, + ), + aug_prompt_num=200, + ) + + inference_inst_conf = dict( + module_name="fate_llm.algo.fdkt.inference_inst", + item_name="api_init", + kwargs=dict( + api_url=param['client']['api_url'], + model_name=param['llm']['pretrained_path'], + api_key=param['client']['api_key'] + ) + ) + + return get_config_of_fdkt_runner( + training_args=training_args, + embedding_model=embedding_model, + dataset=dataset, + inference_inst_conf=inference_inst_conf, + ) + + + def get_slm_conf(): + slm_model = LLMModelLoader( + "hf_model", + "HFAutoModelForCausalLM", + pretrained_model_name_or_path=param['slm']['pretrained_path'], + torch_dtype="bfloat16", + ) + + tokenizer = LLMDataFuncLoader( + "tokenizers.cust_tokenizer", + "get_tokenizer", + tokenizer_name_or_path=param['slm']['pretrained_path'], + pad_token_id=50256 + ) + + training_args = FDKTTrainingArguments( + use_cpu=param['slm']['training_args']['use_cpu'], + device_id=1, + num_train_epochs=param['slm']['training_args']['num_train_epochs'], + per_device_train_batch_size=param['slm']['training_args']['per_device_train_batch_size'], + slm_generation_batch_size=param['slm']['training_args']['slm_generation_batch_size'], + seq_num_for_single_category=param['slm']['training_args']['seq_num_for_single_category'], + slm_generation_config=param['slm']['training_args']['slm_generation_config'], + ) + + dataset = LLMDatasetLoader( + "flex_dataset", + "FlexDataset", + 
tokenizer_name_or_path=param['slm']['pretrained_path'], + need_preprocess=True, + dataset_name="yelp_review", + data_part="train", + load_from="json", + select_num=2000, + few_shot_num_per_label=1, + ) + + optimizer = optim.Adam(lr=0.01) + + return get_config_of_fdkt_runner( + model=slm_model, + tokenizer=tokenizer, + training_args=training_args, + dataset=dataset, + optimizer=optimizer, + data_collator=LLMDataFuncLoader( + "data_collator.cust_data_collator", + "get_seq2seq_data_collator", + label_pad_token_id=50256, + tokenizer_name_or_path=param['slm']['pretrained_path'], + pad_token_id=50256, + ), + ) + + + + reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest)) + reader_0.guest.task_parameters( + namespace=param["data"]["guest"]["namespace"], + name=param["data"]["guest"]["name"] + ) + + homo_nn_0 = HomoNN( + 'homo_nn_0', + train_data=reader_0.outputs["output_data"], + runner_module="fdkt_runner", + runner_class="FDKTRunner", + ) + + homo_nn_0.arbiter.task_parameters( + runner_conf=get_llm_conf() + ) + + homo_nn_0.guest.task_parameters( + runner_conf=get_slm_conf() + ) + + pipeline.add_tasks([reader_0, homo_nn_0]) + pipeline.conf.set("task", dict(engine_run={"cores": 1})) + + pipeline.compile() + pipeline.fit() + +if __name__ == "__main__": + parser = argparse.ArgumentParser("FDKT JOB") + parser.add_argument("-c", "--config", type=str, help="Path to config file", default="./config.yaml") + parser.add_argument("-p", "--param", type=str, help="Path to parameter file", default="./fdkt_config.yaml") + args = parser.parse_args() + main(args.config, args.param) diff --git a/examples/fdkt/fdkt_config.yaml b/examples/fdkt/fdkt_config.yaml new file mode 100644 index 0000000..0e8bc51 --- /dev/null +++ b/examples/fdkt/fdkt_config.yaml @@ -0,0 +1,97 @@ +data: + guest: + namespace: experiment + name: slm_train + host: + namespace: experiment + name: slm_train + + +datasets: + slm_data_path: "train.json" # should be absolute path + +# Inference initialization configuration + +client: + api_url: "http://127.0.0.1:9999/v1" + api_key: "demo" + +# LLM Configuration +llm: + pretrained_path: "Sheared-LLaMa-1.3B" # Please add your model path + embedding_model_path: "all-mpnet-base-v2" # Please add your model path + + dataset: + tokenizer_name_or_path: "Sheared-LLaMa-1.3B" # Please add your model path + need_preprocess: true + dataset_name: "yelp_review" + data_part: "train.json" + load_from: "json" + few_shot_num_per_label: 1 + + training_args: + sample_num_per_cluster: 4 + filter_prompt_max_length: 16384 + filter_generation_config: + max_tokens: 3000 + use_cpu: false + aug_generation_config: + max_tokens: 3000 + temperature: 0.8 + top_p: 0.9 + aug_prompt_num: 200 + + inference_inst_conf: + module_name: "fate_llm.algo.fdkt.inference_inst" + item_name: "api_init" + kwargs: + api_url: "http://127.0.0.1:9999/v1/" + model_name: "Sheared-LLaMa-1.3B" + api_key: "demo" + +# SLM Configuration +slm: + pretrained_path: "gpt2" # Please add your model path + data_path: "train.json" # Please add your datasets path + + model: + torch_dtype: "bfloat16" + + tokenizer: + tokenizer_name_or_path: "gpt2" # Please add your model path + pad_token_id: 50256 + + training_args: + use_cpu: false + device_id: 1 + num_train_epochs: 1 + per_device_train_batch_size: 2 + slm_generation_batch_size: 32 + seq_num_for_single_category: 200 + slm_generation_config: + max_new_tokens: 256 + do_sample: true + temperature: 1.0 + top_k: 50 + top_p: 0.9 + repetition_penalty: 1.0 + pad_token_id: 50256 + + dataset: + 
tokenizer_name_or_path: "gpt2" # Please add your model path + need_preprocess: true + dataset_name: "yelp_review" + data_part: "train" + load_from: "json" + select_num: 2000 + few_shot_num_per_label: 1 + + optimizer: + type: "Adam" + params: + lr: 0.01 + + data_collator: + label_pad_token_id: 50256 + tokenizer_name_or_path: "gpt2" # Please add your model path + pad_token_id: 50256 diff --git a/examples/fdkt/fdkt_test_quick_start_zh.md b/examples/fdkt/fdkt_test_quick_start_zh.md new file mode 100644 index 0000000..fe1aeb0 --- /dev/null +++ b/examples/fdkt/fdkt_test_quick_start_zh.md @@ -0,0 +1,184 @@ +#### fdkt_fate_test_quick_start_zh + +##### 1. 数据准备 + +###### Dataset: Yelp +从 Yelp 数据集中处理并采样了“健康”子域的[数据](https://arxiv.org/abs/1509.01626),数据集可以从[此处](https://www.yelp.com/dataset)下载。下载数据集后,执行以下命令解压下载的数据集。 + +```shell +tar -xvf yelp_dataset.tar +``` +以下代码将对“健康”子域的 5000 条数据行进行采样,并将在文件夹“./processed_data/Health/train.json”下生成训练数据 +~~~ +# Health数据处理脚本,稍微修改即可执行,执行需在fate环境下执行 +import os +import json +import sys +import random +from pathlib import Path +random.seed(42) + + +base_dir = "./" +business_data_path = os.path.join(base_dir, 'yelp_academic_dataset_business.json') +review_data_path = os.path.join(base_dir, 'yelp_academic_dataset_review.json') + +business_data_file = open(business_data_path, 'r') +review_data_file = open(review_data_path, 'r') + +categories_list = ['Restaurants', 'Shopping', 'Arts', 'Health'] +business_dic = {} +data_dict = {} +for category in categories_list: + business_dic[category] = set() + data_dict[category] = [] + + +def get_categories(categories): + return_list = [] + for category in categories_list: + if category in categories: + return_list.append(category) + return return_list + + +for line in business_data_file.readlines(): + dic = json.loads(line) + if 'categories' in dic.keys() and dic['categories'] is not None: + category = get_categories(dic['categories']) + if len(category) == 1: + business_dic[category[0]].add(dic['business_id']) + +# for category in categories_list: +for line in review_data_file.readlines(): + dic = json.loads(line) + if 'business_id' in dic.keys() and dic['business_id'] is not None: + for category in categories_list: + if dic['business_id'] in business_dic[category]: + if dic['text'] is not None and dic['stars'] is not None: + data_dict[category].append({'text': dic['text'], 'stars': dic['stars']}) + break + +train_data_path = os.path.join('processed_data', "Health", 'train.json') +os.makedirs(Path(train_data_path).parent, exist_ok=True) +train_data_file = open(train_data_path, 'w') +data_list = data_dict["Health"] + +sample_data_dict = dict() + +for data in data_list: + star = int(data["stars"]) + if star not in sample_data_dict: + sample_data_dict[star] = [] + + sample_data_dict[star].append(data) + +data_list = [] +star_keys = list(sample_data_dict.keys()) +for star in star_keys: + sample_data = sample_data_dict[star][:1000] + random.shuffle(sample_data) + data_list.extend(sample_data) + +random.shuffle(data_list) +json.dump(data_list, train_data_file, indent=4) +train_data_file.close() +~~~ + +​ 将数据集路径与名称和命名空间绑定。记住自己的数据集保存路径的路径。 + +~~~ +flow table bind --namespace experiment --name slm_train --path path_to_save/train.json +~~~ + +##### 2. 
配置文件修改
+
+test_fdkt_llmsuite.yaml 仅需修改数据与模型的实际路径
+
+~~~
+data:
+  - file: path_to_save/train.json # 实际需要放置的路径
+    table_name: slm_train
+    namespace: experiment
+    role: guest_0
+  - file: path_to_save/train.json # 实际需要放置的路径
+    table_name: slm_train
+    namespace: experiment
+    role: host_0
+fdkt:
+  fdkt:
+    pretrained: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+    script: "./fdkt.py"
+    conf: "./fdkt_config.yaml"
+~~~
+
+fdkt_config.yaml 仅需修改模型的实际路径
+
+~~~
+# fdkt_config.yaml 开头片段
+data:
+  guest:
+    namespace: experiment
+    name: slm_train
+  host:
+    namespace: experiment
+    name: slm_train
+
+# 配置路径
+# LLM Configuration
+llm:
+  pretrained_path: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+  embedding_model_path: "all-mpnet-base-v2" # 模型放置的实际路径
+
+  dataset:
+    tokenizer_name_or_path: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+
+# SLM Configuration
+slm:
+  pretrained_path: "gpt2" # 模型放置的实际路径
+  data_path: "train.json" # 数据放置的实际路径
+
+  tokenizer:
+    tokenizer_name_or_path: "gpt2" # 模型放置的实际路径
+
+  dataset:
+    tokenizer_name_or_path: "gpt2" # 模型放置的实际路径
+    need_preprocess: true
+    dataset_name: "yelp_review"
+    data_part: "train"
+    load_from: "json"
+    select_num: 2000
+    few_shot_num_per_label: 1
+
+  data_collator:
+    label_pad_token_id: 50256
+    tokenizer_name_or_path: "gpt2" # 模型放置的实际路径
+    pad_token_id: 50256
+~~~
+
+##### 3. 运行
+~~~
+# 创建 vllm 环境
+python -m venv vllm_venv
+source vllm_venv/bin/activate
+pip install vllm==0.4.3
+pip install numpy==1.26.4 # numpy >= 2.0.0 will raise error, so reinstall numpy<2.0.0
+
+# Sheared-LLaMa-1.3B 为本地 llm 模型保存路径
+export CUDA_VISIBLE_DEVICES=1,2
+nohup python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 9999 --model Sheared-LLaMa-1.3B --dtype=half --enforce-eager --api-key demo --device cuda -tp 2 &
+~~~
+~~~
+# 环境准备
+cd $fate_base
+source bin/init_env.sh
+
+# 命令执行
+fate_test llmsuite -i fate_llm/examples/fdkt/ --yes
+~~~
+
+**注意:** fdkt 暂不支持数据评估;如您环境中为 torch==1.13.1,需升级至 torch==2.3.1。
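+
+在提交任务前,可以先用下面的脚本(示意用法)确认 vllm 服务已正常启动:api_url、api_key 需与 fdkt_config.yaml 中 client 段以及启动 vllm 时的参数保持一致,/v1/models 为 vLLM 的 OpenAI 兼容接口。
+
+~~~
+# vllm 服务自检脚本(示意用法,在 fate 环境下执行)
+import requests
+
+api_url = "http://127.0.0.1:9999/v1"  # 与 fdkt_config.yaml 中 client.api_url 保持一致
+api_key = "demo"                      # 与启动 vllm 时的 --api-key 保持一致
+
+resp = requests.get(f"{api_url}/models", headers={"Authorization": f"Bearer {api_key}"}, timeout=10)
+resp.raise_for_status()
+# 正常情况下会打印已加载的模型,例如 Sheared-LLaMa-1.3B
+print([m["id"] for m in resp.json().get("data", [])])
+~~~
+
+##### 4. 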
问题定位 + +​ 任务运行失败,报错,会在当前执行命令的目录下生成一个logs目录,找到对应的任务,检查stdout,或者exception.log检查报错原因。 + diff --git a/examples/fdkt/test_fdkt_llmsuite.yaml b/examples/fdkt/test_fdkt_llmsuite.yaml new file mode 100644 index 0000000..4008104 --- /dev/null +++ b/examples/fdkt/test_fdkt_llmsuite.yaml @@ -0,0 +1,14 @@ +data: + - file: examples/data/train.json # Please add your datasets path + table_name: slm_train + namespace: experiment + role: guest_0 + - file: examples/data/train.json # Please add your datasets path + table_name: slm_train + namespace: experiment + role: host_0 +fdkt: + fdkt: + pretrained: "Sheared-LLaMa-1.3B" # Please add your model path + script: "./fdkt.py" + conf: "./fdkt_config.yaml" diff --git a/examples/fedmkt/fedmkt.py b/examples/fedmkt/fedmkt.py index 5c3dd9e..19d7e8a 100644 --- a/examples/fedmkt/fedmkt.py +++ b/examples/fedmkt/fedmkt.py @@ -9,32 +9,44 @@ import yaml from typing import Union, Dict -def main(config="./config.yaml", param: Union[Dict, str] = None): +def main(config="../../config.yaml", param: Union[Dict, str] = None, namespace=""): if isinstance(config, str): - with open(config, 'r') as f: - config = yaml.safe_load(f) - + config = test_utils.load_job_config(config) if isinstance(param, str): param = yaml.safe_load(param) - - guest = config['parties']['guest'][0] # replace with actual guest party ID - host = config['parties']['host'][0] # replace with actual host party ID - arbiter = config['parties']['arbiter'][0] # replace with actual arbiter party ID + + # load config + parties = config.parties + guest = parties.guest[0] # replace with actual guest party ID + host = parties.host[0] # replace with actual host party ID + arbiter = parties.arbiter[0] # replace with actual arbiter party ID - process_data_output_dir = config['paths']['process_data_output_dir'] - llm_pretrained_path = config['paths']['llm_pretrained_path'] - slm_pretrained_paths = config['paths']['slm_pretrained_paths'] - vocab_mapping_directory = config['paths']['vocab_mapping_directory'] + process_data_output_dir = param['paths']['process_data_output_dir'] + llm_pretrained_path = param['paths']['llm_pretrained_path'] + slm_0_pretrained_path = param['paths']['slm_0_pretrained_path'] + slm_1_pretrained_path = param['paths']['slm_1_pretrained_path'] + llm_slm_pairs = [ + (llm_pretrained_path, slm_0_pretrained_path), + (llm_pretrained_path, slm_1_pretrained_path) + ] + vocab_mapping_directory = param['paths']['vocab_mapping_directory'] slm_to_llm_vocab_mapping_paths = [ - vocab_mapping_directory + "/" + path for path in config['paths']['slm_to_llm_vocab_mapping_paths'] + vocab_mapping_directory + "/" + path for path in param['paths']['slm_to_llm_vocab_mapping_paths'] ] llm_to_slm_vocab_mapping_paths = [ - vocab_mapping_directory + "/" + path for path in config['paths']['llm_to_slm_vocab_mapping_paths'] + vocab_mapping_directory + "/" + path for path in param['paths']['llm_to_slm_vocab_mapping_paths'] + ] + slm_pretrained_paths = [slm_0_pretrained_path, slm_1_pretrained_path] + slm_models = param['models']['slm_models'] + slm_lora_target_modules = [ + ["q_proj", "v_proj"], + ["c_attn"] + ] + slm_models = [ + ("pellm.opt", "OPT"), + ("pellm.gpt2", "GPT2CLM") ] - - slm_models = config['models']['slm_models'] - slm_lora_target_modules = config['lora_config']['slm_lora_target_modules'] def get_llm_conf(): lora_config = LoraConfig( @@ -210,20 +222,20 @@ def get_slm_conf(slm_idx): save_trainable_weights_only=True, data_collator=data_collator ) - + pipeline = FateFlowPipeline().set_parties(guest=guest, 
arbiter=arbiter, host=host) pipeline.bind_local_path(path=process_data_output_dir, namespace="experiment", name="arc_challenge") reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest, host=host)) reader_0.guest.task_parameters( - namespace=config['data']['guest']['namespace'], - name=config['data']['guest']['name'] + namespace=param['data']['guest']['namespace'], + name=param['data']['guest']['name'] ) - reader_0.hosts[[0, 1, 2]].task_parameters( - namespace=config['data']['host']['namespace'], - name=config['data']['host']['name'] + reader_0.hosts[0].task_parameters( + namespace=param['data']['host']['namespace'], + name=param['data']['host']['name'] ) - + homo_nn_0 = HomoNN( 'nn_0', train_data=reader_0.outputs["output_data"], @@ -238,7 +250,7 @@ def get_slm_conf(slm_idx): homo_nn_0.guest.task_parameters( runner_conf=get_slm_conf(slm_idx=0) ) - + for idx in range(1): homo_nn_0.hosts[idx].task_parameters( runner_conf=get_slm_conf(slm_idx=idx + 1) @@ -253,6 +265,7 @@ def get_slm_conf(slm_idx): pipeline.compile() pipeline.fit() + return llm_pretrained_path if __name__ == "__main__": parser = argparse.ArgumentParser("LLMSUITE PIPELINE JOB") diff --git a/examples/fedmkt/fedmkt_config.yaml b/examples/fedmkt/fedmkt_config.yaml index 597bee6..b1372c1 100644 --- a/examples/fedmkt/fedmkt_config.yaml +++ b/examples/fedmkt/fedmkt_config.yaml @@ -1,3 +1,11 @@ +# Data configuration +data: + guest: + namespace: experiment + name: arc_challenge + host: + namespace: experiment + name: arc_challenge # fedmkt_config.yaml # Configuration for Lora @@ -63,20 +71,17 @@ training: # Paths configuration paths: - process_data_output_dir: "" - llm_pretrained_path: "Llama-2-7b-hf" - slm_pretrained_paths: - - "opt-1.3b" - - "gpt2" - vocab_mapping_directory: "" + process_data_output_dir: "examples/data/arc_challenge" # Please add your datasets path + llm_pretrained_path: "Sheared-LLaMa-1.3B" # Please add your mdoel path + slm_0_pretrained_path: "opt-1.3b" # Please add your mdoel path + slm_1_pretrained_path: "gpt2" # Please add your mdoel path + vocab_mapping_directory: "vocab_mapping_datas" # Please add your vocab_mapping datasets path slm_to_llm_vocab_mapping_paths: - "opt_to_llama.json" - "gpt2_to_llama.json" - - "llama_small_to_llama.json" llm_to_slm_vocab_mapping_paths: - "llama_to_opt.json" - "llama_to_gpt2.json" - - "llama_to_llama_small" # Models configuration models: @@ -84,16 +89,7 @@ models: - ["pellm.opt", "OPT"] - ["pellm.gpt2", "GPT2CLM"] -# Data configuration -data: - guest: - namespace: "experiment" - name: "arc_challenge" - host: - namespace: "experiment" - name: "arc_challenge" - # Example: Additional custom configuration custom_config: some_param: "value" - another_param: 123 + another_param: 123 \ No newline at end of file diff --git a/examples/fedmkt/fedmkt_fate_test_quick_start_zh.md b/examples/fedmkt/fedmkt_fate_test_quick_start_zh.md new file mode 100644 index 0000000..c83701f --- /dev/null +++ b/examples/fedmkt/fedmkt_fate_test_quick_start_zh.md @@ -0,0 +1,111 @@ +#### fedmkt_fate_test_quick_start_zh + +##### 1. 
数据准备
+
+数据集:ARC-Challenge 是一个真实的小学级别多选题科学问题数据集,旨在推动高级问答领域的研究。
+
+您可以参考以下链接了解有关 [ARC-Challenge](https://huggingface.co/datasets/allenai/ai2_arc) 的更多详细信息。
+
+从 huggingface 下载 ARC-Challenge 数据集并将其分成五个部分:“common”部分作为公共数据集,其余四个部分用于各客户端 slm 的训练。
+
+~~~
+# ARC-Challenge数据处理脚本,稍微修改即可执行,执行需在fate环境下执行
+import datasets
+
+data = datasets.load_dataset("ai2_arc", "ARC-Challenge", download_mode="force_redownload", ignore_verifications=True)
+train_data = data.pop("train")
+
+seed = 123
+n = train_data.shape[0]
+client_num = 4
+process_data_output_dir = ""  # processed data saved directory should be specified, it will be used later
+
+client_data_num = n // (client_num + 1)
+
+for i in range(client_num):
+    splits = train_data.train_test_split(train_size=client_data_num, shuffle=True, seed=seed)
+    client_name = f"client_{i}"
+    data[client_name] = splits["train"]
+    train_data = splits["test"]
+
+if train_data.shape[0] == client_data_num:
+    data["common"] = train_data
+else:
+    data["common"] = train_data.train_test_split(
+        train_size=client_data_num, shuffle=True, seed=seed
+    )["train"]
+
+data.save_to_disk(process_data_output_dir)
+~~~
+
+将数据集路径与名称和命名空间绑定。记住自己的数据集保存路径。
+
+~~~
+flow table bind --namespace experiment --name arc_challenge --path path_to_save/arc_challenge
+~~~
+
+##### 2. 配置文件修改
+
+test_fedmkt_llmsuite.yaml 仅需修改数据与模型的实际路径
+
+~~~
+data:
+  - file: path_to_save/arc_challenge # 实际需要放置的路径
+    table_name: arc_challenge
+    namespace: experiment
+    role: guest_0
+  - file: path_to_save/arc_challenge # 实际需要放置的路径
+    table_name: arc_challenge
+    namespace: experiment
+    role: host_0
+fedmkt_lora_vs_zero_shot:
+  fedmkt_lora:
+    pretrained: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+    script: "./fedmkt.py"
+    conf: "./fedmkt_config.yaml"
+    peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory/"
+    tasks:
+      - "arc_challenge"
+  llama_zero_shot:
+    pretrained: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+    tasks:
+      - "arc_challenge"
+~~~
+
+fedmkt_config.yaml 仅需修改模型的实际路径
+
+~~~
+# fedmkt_config.yaml 开头片段
+data:
+  guest:
+    namespace: experiment
+    name: arc_challenge
+  host:
+    namespace: experiment
+    name: arc_challenge
+
+# 配置路径
+paths:
+  process_data_output_dir: "examples/data/arc_challenge" # 数据放置的实际路径
+  llm_pretrained_path: "Sheared-LLaMa-1.3B" # 模型放置的实际路径
+  slm_0_pretrained_path: "opt-1.3b" # 模型放置的实际路径
+  slm_1_pretrained_path: "gpt2" # 模型放置的实际路径
+  vocab_mapping_directory: "vocab_mapping_datas" # vocab_mapping数据放置的实际路径
+~~~
+
+##### 3. 运行
+
+~~~
+# 环境准备
+cd $fate_base
+source bin/init_env.sh
+
+# 命令执行
+fate_test llmsuite -i fate_llm/examples/fedmkt/ --yes
+~~~
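+
+在提交 fate_test 任务前,可以先用下面的脚本(示意用法,路径需替换为第 1 步中实际的保存目录)确认处理后的 ARC-Challenge 数据集能够被 datasets 正常加载:
+
+~~~
+# 可选的自检脚本(示意用法),确认第 1 步保存的数据集可以被正常读取
+from datasets import load_from_disk
+
+process_data_output_dir = "path_to_save/arc_challenge"  # 替换为实际保存路径
+data = load_from_disk(process_data_output_dir)
+# 预期包含 common、client_0 ~ client_3 以及原始的 validation/test 等切分
+print({name: split.num_rows for name, split in data.items()})
+~~~
+
+##### 4. 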
问题定位 + +​ 任务运行失败,报错,会在当前执行命令的目录下生成一个logs目录,找到对应的任务,检查stdout,或者exception.log检查报错原因。 + diff --git a/examples/fedmkt/test_fedmkt_llmsuit.yaml b/examples/fedmkt/test_fedmkt_llmsuit.yaml deleted file mode 100644 index 1516809..0000000 --- a/examples/fedmkt/test_fedmkt_llmsuit.yaml +++ /dev/null @@ -1,14 +0,0 @@ -data: - - file: - table_name: arc_challenge - namespace: experiment - role: guest_0 - - file: - table_name: arc_challenge - namespace: experiment - role: host_0 -bloom_lora_vs_zero_shot: - gpt2_fedmkt: - pretrained: "gpt2" - script: "./fedmkt.py" - conf: "./fedmkt_config.yaml" \ No newline at end of file diff --git a/examples/fedmkt/test_fedmkt_llmsuite.yaml b/examples/fedmkt/test_fedmkt_llmsuite.yaml new file mode 100644 index 0000000..c293ecc --- /dev/null +++ b/examples/fedmkt/test_fedmkt_llmsuite.yaml @@ -0,0 +1,21 @@ +data: + - file: examples/data/arc_c # Please add your datasets path + table_name: arc_challenge + namespace: experiment + role: guest_0 + - file: examples/data/arc_c # Please add your datasets path + table_name: arc_challenge + namespace: experiment + role: host_0 +fedmkt_lora_vs_zero_shot: + gpt2_fedmkt: + pretrained: "gpt2" # Please add your model path + script: "./fedmkt.py" + conf: "./fedmkt_config.yaml" + peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" + tasks: + - "arc_challenge" + gpt2_zero_shot: + pretrained: "gpt2" # Please add your model path + tasks: + - "arc_challenge" \ No newline at end of file diff --git a/examples/offsite_tuning/offsite_tuning.py b/examples/offsite_tuning/offsite_tuning.py index 73f6884..bf87de0 100644 --- a/examples/offsite_tuning/offsite_tuning.py +++ b/examples/offsite_tuning/offsite_tuning.py @@ -5,74 +5,78 @@ from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader -from fate_client.pipeline.components.fate.nn.torch.base import Sequential from fate_client.pipeline.components.fate.nn.torch import nn +from typing import Union, Dict -def load_params(file_path): - """Load and parse the YAML params file.""" - with open(file_path, 'r') as f: - params = yaml.safe_load(f) - return params - -def setup_pipeline(params): - """Set up the pipeline using the provided parameters.""" - guest = params['pipeline']['guest'] - arbiter = params['pipeline']['arbiter'] - pretrained_model_path = params['paths']['pretrained_model_path'] +def main(config="../../config.yaml", param: Union[Dict, str] = None, namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + if isinstance(param, str): + param = yaml.safe_load(param) + # Load the configuration file + parties = config.parties + guest = parties.guest[0] + arbiter = parties.arbiter[0] + pretrained_model_path = param["pretrained_model_path"] + # Create pipeline pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter) - reader = Reader("reader_0", runtime_parties=dict(guest=guest)) - reader.guest.task_parameters( - namespace=params['pipeline']['namespace'], - name=params['pipeline']['name'] + # Set up the data reader + reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest)) + reader_0.guest.task_parameters( + namespace=param["data"]["guest"]["namespace"], + name=param["data"]["guest"]["name"] ) + # Load the LLM model 
client_model = LLMModelLoader( - module_name=params['models']['client']['module_name'], - item_name=params['models']['client']['item_name'], + module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel', model_name_or_path=pretrained_model_path, - emulator_layer_num=params['models']['client']['emulator_layer_num'], - adapter_top_layer_num=params['models']['client']['adapter_top_layer_num'], - adapter_bottom_layer_num=params['models']['client']['adapter_bottom_layer_num'] + emulator_layer_num=param["model_config"]["emulator_layer_num"], + adapter_top_layer_num=param["model_config"]["adapter_top_layer_num"], + adapter_bottom_layer_num=param["model_config"]["adapter_bottom_layer_num"] ) server_model = LLMModelLoader( - module_name=params['models']['server']['module_name'], - item_name=params['models']['server']['item_name'], + module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel', model_name_or_path=pretrained_model_path, - emulator_layer_num=params['models']['server']['emulator_layer_num'], - adapter_top_layer_num=params['models']['server']['adapter_top_layer_num'], - adapter_bottom_layer_num=params['models']['server']['adapter_bottom_layer_num'] + emulator_layer_num=param["model_config"]["emulator_layer_num"], + adapter_top_layer_num=param["model_config"]["adapter_top_layer_num"], + adapter_bottom_layer_num=param["model_config"]["adapter_bottom_layer_num"] ) + # Load the dataset and data processor dataset = LLMDatasetLoader( - module_name=params['dataset']['module_name'], - item_name=params['dataset']['item_name'], - tokenizer_name_or_path=params['dataset']['tokenizer_name_or_path'], - select_num=params['dataset']['select_num'] - ) + module_name='qa_dataset', item_name='QaDataset', + tokenizer_name_or_path=pretrained_model_path, + select_num=100 + ) data_collator = LLMDataFuncLoader( - module_name=params['data_collator']['module_name'], - item_name=params['data_collator']['item_name'], - tokenizer_name_or_path=params['data_collator']['tokenizer_name_or_path'] - ) + module_name='data_collator.cust_data_collator', + item_name='get_seq2seq_data_collator', + tokenizer_name_or_path=pretrained_model_path + ) + # DeepSpeed config + ds_config = param["deepspeed_config"] + # Training parameter settings train_args = Seq2SeqTrainingArguments( - per_device_train_batch_size=params['training']['batch_size'], - learning_rate=params['training']['learning_rate'], - disable_tqdm=False, - num_train_epochs=params['training']['num_train_epochs'], - logging_steps=params['training']['logging_steps'], + per_device_train_batch_size=param["training"]["batch_size"], + learning_rate=param["training"]["lr"], + disable_tqdm=param["training"]["disable_tqdm"], + num_train_epochs=param["training"]["num_train_epochs"], + logging_steps=param["training"]["logging_steps"], logging_strategy='steps', - dataloader_num_workers=4, + dataloader_num_workers=param["training"]["dataloader_num_workers"], use_cpu=False, - deepspeed=params['training']['deepspeed'], # Add DeepSpeed config here + deepspeed=ds_config, remove_unused_columns=False, fp16=True ) + # Set the configuration of the client and server models client_conf = get_conf_of_ot_runner( model=client_model, dataset=dataset, @@ -91,33 +95,30 @@ def setup_pipeline(params): aggregate_model=False ) - homo_nn = HomoNN( + # Set up the HomoNN component + homo_nn_0 = HomoNN( 'nn_0', - train_data=reader.outputs["output_data"], + train_data=reader_0.outputs["output_data"], runner_module="offsite_tuning_runner", runner_class="OTRunner" - ) + ) + 
homo_nn_0.guest.task_parameters(runner_conf=client_conf) + homo_nn_0.arbiter.task_parameters(runner_conf=server_conf) - homo_nn.guest.task_parameters(runner_conf=client_conf) - homo_nn.arbiter.task_parameters(runner_conf=server_conf) - - # If using Eggroll, you can add this line to submit your job - homo_nn.guest.conf.set("launcher_name", "deepspeed") - - pipeline.add_tasks([reader, homo_nn]) - pipeline.conf.set("task", dict(engine_run=params['pipeline']['engine_run'])) + # Using DeepSpeed + homo_nn_0.guest.conf.set("launcher_name", "deepspeed") + # Build a task pipeline + pipeline.add_tasks([reader_0, homo_nn_0]) + pipeline.conf.set("task", dict(engine_run={"cores": 1})) pipeline.compile() pipeline.fit() - -def main(config_file, param_file): - params = load_params(param_file) - setup_pipeline(params) + return pretrained_model_path if __name__ == "__main__": - parser = argparse.ArgumentParser("LLMSUITE Offsite-tuning JOB") + parser = argparse.ArgumentParser("LLMSUITE PIPELINE JOB") parser.add_argument("-c", "--config", type=str, - help="Path to config file", default="./config.yaml") + help="config file", default="./config.yaml") parser.add_argument("-p", "--param", type=str, - help="Path to parameter file", default="./test_offsite_tuning_llmsuite.yaml") + help="config file for params", default="./offsite_tuning_config.yaml") args = parser.parse_args() - main(args.config, args.param) + main(args.config, args.param) \ No newline at end of file diff --git a/examples/offsite_tuning/offsite_tuning_config.yaml b/examples/offsite_tuning/offsite_tuning_config.yaml index deb79cf..6156031 100644 --- a/examples/offsite_tuning/offsite_tuning_config.yaml +++ b/examples/offsite_tuning/offsite_tuning_config.yaml @@ -1,67 +1,50 @@ -# params.yaml +data: + guest: + namespace: experiment + name: sciq + host: + namespace: experiment + name: sciq -paths: - pretrained_model_path: 'gpt2' - -pipeline: - guest: '9999' - arbiter: '9999' - namespace: 'experiment' - name: 'sciq' - engine_run: - cores: 1 +# Model path +pretrained_model_path: "gpt2" # Please add your model path +# Training parameters training: batch_size: 1 - learning_rate: 5e-5 + lr: 5e-5 num_train_epochs: 1 logging_steps: 10 - deepspeed: - train_micro_batch_size_per_gpu: 1 - optimizer: - type: "Adam" - params: - lr: 5e-5 - torch_adam: true - adam_w_mode: false - fp16: - enabled: true - gradient_accumulation_steps: 1 - zero_optimization: - stage: 2 - allgather_partitions: true - allgather_bucket_size: 1e8 - overlap_comm: true - reduce_scatter: true - reduce_bucket_size: 1e8 - contiguous_gradients: true - offload_optimizer: - device: "cpu" - offload_param: - device: "cpu" - -models: - client: - module_name: 'offsite_tuning.gpt2' - item_name: 'GPT2LMHeadSubModel' - emulator_layer_num: 11 - adapter_top_layer_num: 2 - adapter_bottom_layer_num: 2 - - server: - module_name: 'offsite_tuning.gpt2' - item_name: 'GPT2LMHeadMainModel' - emulator_layer_num: 11 - adapter_top_layer_num: 2 - adapter_bottom_layer_num: 2 + disable_tqdm: false + dataloader_num_workers: 4 -dataset: - module_name: 'qa_dataset' - item_name: 'QaDataset' - tokenizer_name_or_path: 'gpt2' - select_num: 100 +# LLM model configuration +model_config: + emulator_layer_num: 11 + adapter_top_layer_num: 2 + adapter_bottom_layer_num: 2 -data_collator: - module_name: 'data_collator.cust_data_collator' - item_name: 'get_seq2seq_data_collator' - tokenizer_name_or_path: 'gpt2' +# DeepSpeed ​​Configuration +deepspeed_config: + train_micro_batch_size_per_gpu: 1 + optimizer: + type: "Adam" + params: + lr: 
5e-5
+      torch_adam: true
+      adam_w_mode: false
+  fp16:
+    enabled: true
+  gradient_accumulation_steps: 1
+  zero_optimization:
+    stage: 2
+    allgather_partitions: true
+    allgather_bucket_size: 1e8
+    overlap_comm: true
+    reduce_scatter: true
+    reduce_bucket_size: 1e8
+    contiguous_gradients: true
+    offload_optimizer:
+      device: "cpu"
+    offload_param:
+      device: "cpu"
diff --git a/examples/offsite_tuning/offsite_tuning_fate_test_quick_start_zh.md b/examples/offsite_tuning/offsite_tuning_fate_test_quick_start_zh.md
new file mode 100644
index 0000000..5f51d93
--- /dev/null
+++ b/examples/offsite_tuning/offsite_tuning_fate_test_quick_start_zh.md
@@ -0,0 +1,103 @@
+#### offsite_tuning_fate_test_quick_start_zh
+
+##### 1. 数据准备
+
+准备 QA 数据集,本样例中使用 sciq 数据集。可以使用 qa_dataset.py 中提供的工具对 sciq 数据集进行分词,并保存分词结果。记住将 save_path 修改为自己的路径。数据处理脚本如下:
+
+~~~
+# sciq数据处理脚本,稍微修改即可执行,执行需在fate环境下执行
+import os
+from fate_llm.dataset.qa_dataset import tokenize_qa_dataset
+from fate_llm.dataset.qa_dataset import QaDataset
+from transformers import AutoTokenizer
+
+tokenizer_name_or_path = 'gpt2'  # 实际模型放置路径
+
+if 'llama' in tokenizer_name_or_path:
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token="<unk>", bos_token="<s>", eos_token="</s>", add_eos_token=True)
+    tokenizer.pad_token = tokenizer.eos_token
+else:
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+if 'gpt2' in tokenizer_name_or_path:
+    tokenizer.pad_token = tokenizer.eos_token
+
+
+# bind data path to name & namespace
+save_path = '/example/data/sciq'  # 实际需要放置的路径,可自定义
+rs = tokenize_qa_dataset('sciq', tokenizer, save_path, seq_max_len=600)
+
+ds = QaDataset(tokenizer_name_or_path=tokenizer_name_or_path)
+ds.load(save_path)
+
+print(len(ds))  # train set length
+print(ds[0]['input_ids'].__len__())  # first sample length
+~~~
+
+将数据集路径与名称和命名空间绑定。记住自己的数据集保存路径。
+
+~~~
+flow table bind --namespace experiment --name sciq --path /example/data/sciq
+~~~
+
+##### 2. 配置文件修改
+
+test_offsite_tuning_sub_llmsuite.yaml 仅需修改数据与模型的实际路径
+
+~~~
+data:
+  - file: /example/data/sciq # 实际需要放置的路径
+    table_name: sciq
+    namespace: experiment
+    role: guest_0
+  - file: /example/data/sciq # 实际需要放置的路径
+    table_name: sciq
+    namespace: experiment
+    role: host_0
+offsite_tuning_lora_vs_zero_shot:
+  gpt2_ot:
+    pretrained: "gpt2" # 模型放置的实际路径
+    script: "./offsite_tuning.py"
+    conf: "./offsite_tuning_config.yaml"
+    loader: "ot"
+    loader_conf:
+      module_name: "offsite_tuning.gpt2"
+      item_name: "GPT2LMHeadSubModel"
+      emulator_layer_num: 11
+      adapter_top_layer_num: 2
+      adapter_bottom_layer_num: 2
+    model_weights_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory/pytorch_model.bin"
+    tasks:
+      - "sciq"
+  gpt2_zero_shot:
+    pretrained: "gpt2" # 模型放置的实际路径
+    tasks:
+      - "sciq"
+~~~
+
+offsite_tuning_config.yaml 仅需修改模型的实际路径
+
+~~~
+# offsite_tuning_config.yaml 开头片段
+data:
+  guest:
+    namespace: experiment
+    name: sciq
+  host:
+    namespace: experiment
+    name: sciq
+
+# 模型路径
+pretrained_model_path: "gpt2" # 模型放置的实际路径
+~~~
+
+##### 3. 运行
+
+~~~
+# 环境准备
+cd $fate_base
+source bin/init_env.sh
+
+# 命令执行
+fate_test llmsuite -i fate_llm/examples/offsite_tuning/ --yes
+~~~
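+
+offsite_tuning_config.yaml 中启用了 deepspeed 与 fp16,提交任务前可以先用下面的脚本(示意用法)确认本机 GPU 环境可用:
+
+~~~
+# 可选的本地自检脚本(示意用法):fp16 + deepspeed 需要可用的 GPU
+import torch
+
+assert torch.cuda.is_available(), "未检测到可用 GPU,deepspeed + fp16 任务无法运行"
+print("cuda device count:", torch.cuda.device_count())
+print("bf16 supported:", torch.cuda.is_bf16_supported())
+~~~
+
+##### 4. 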
问题定位 + +​ 任务运行失败,报错,会在当前执行命令的目录下生成一个logs目录,找到对应的任务,检查stdout,或者exception.log检查报错原因。 + diff --git a/examples/offsite_tuning/test_offsite_tuning_llmsuite.yaml b/examples/offsite_tuning/test_offsite_tuning_llmsuite.yaml deleted file mode 100644 index 0cdd007..0000000 --- a/examples/offsite_tuning/test_offsite_tuning_llmsuite.yaml +++ /dev/null @@ -1,14 +0,0 @@ -data: - - file: - table_name: sciq - namespace: experiment - role: guest_0 - - file: - table_name: sciq - namespace: experiment - role: host_0 -bloom_lora_vs_zero_shot: - gpt2_ot: - pretrained: "gpt2" - script: "./offsite_tuning.py" - conf: "./offsite_tuning_config.yaml" \ No newline at end of file diff --git a/examples/offsite_tuning/test_offsite_tuning_main_llmsuite.yaml b/examples/offsite_tuning/test_offsite_tuning_main_llmsuite.yaml new file mode 100644 index 0000000..0865a4e --- /dev/null +++ b/examples/offsite_tuning/test_offsite_tuning_main_llmsuite.yaml @@ -0,0 +1,29 @@ +data: + - file: /examples/pellm/sciq_t # Please add your datasets path + table_name: sciq + namespace: experiment + role: guest_0 + - file: /examples/pellm/sciq_t # Please add your datasets path + table_name: sciq + namespace: experiment + role: host_0 +offsite_tuning_gpt2_vs_zero_shot: + offsite_tuning_gpt2: + pretrained: "gpt2" # Please add your model path + script: "./offsite_tuning.py" + conf: "./offsite_tuning_config.yaml" + loader: "ot" + loader_conf: + module_name: "offsite_tuning.gpt2" + item_name: "GPT2LMHeadMainModel" + emulator_layer_num: 11 + adapter_top_layer_num: 2 + adapter_bottom_layer_num: 2 + requires_untar: "{{fate_base}}/fate_flow/model/{{job_id}}/0/arbiter/{{party_id}}/{{model_task_name}}" + model_weights_format: "{{fate_base}}/fate_flow/model/{{job_id}}/0/arbiter/{{party_id}}/{{model_task_name}}/output_models/pytorch_model.bin" + tasks: + - "sciq" + gpt2_zero_shot: + pretrained: "gpt2" # Please add your model path + tasks: + - "sciq" \ No newline at end of file diff --git a/examples/offsite_tuning/test_offsite_tuning_sub_llmsuite.yaml b/examples/offsite_tuning/test_offsite_tuning_sub_llmsuite.yaml new file mode 100644 index 0000000..2b7e291 --- /dev/null +++ b/examples/offsite_tuning/test_offsite_tuning_sub_llmsuite.yaml @@ -0,0 +1,29 @@ +data: + - file: /examples/pellm/sciq_t # Please add your datasets path + table_name: sciq + namespace: experiment + role: guest_0 + - file: /examples/pellm/sciq_t # Please add your datasets path + table_name: sciq + namespace: experiment + role: host_0 +offsite_tuning_lora_vs_zero_shot: + gpt2_ot: + pretrained: "gpt2" # Please add your model path + script: "./offsite_tuning.py" + conf: "./offsite_tuning_config.yaml" + loader : "ot" + loader_conf: + module_name: "offsite_tuning.gpt2" + item_name: "GPT2LMHeadSubModel" + emulator_layer_num: 11 + adapter_top_layer_num: 2 + adapter_bottom_layer_num: 2 + model_weights_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory/pytorch_model.bin" + tasks: + - "sciq" + gpt2_zero_shot: + pretrained: "gpt2" # Please add your model path + tasks: + - "sciq" + diff --git a/examples/pdss/__init__.py b/examples/pdss/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/pdss/pdss.py b/examples/pdss/pdss.py new file mode 100644 index 0000000..da3353d --- /dev/null +++ b/examples/pdss/pdss.py @@ -0,0 +1,157 @@ +import argparse +import yaml +from fate_client.pipeline.components.fate.reader import Reader +from fate_client.pipeline import FateFlowPipeline +from 
fate_client.pipeline.components.fate.nn.loader import Loader +from fate_client.pipeline.components.fate.homo_nn import HomoNN +from fate_client.pipeline.utils import test_utils +from typing import Union, Dict + +def main(config="../../config.yaml", param: Union[Dict, str] = None, namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + if isinstance(param, str): + param = yaml.safe_load(param) + # Load the configuration file + parties = config.parties + guest = parties.guest[0] + arbiter = parties.arbiter[0] + pretrained_model_path = param["model"]["pretrained_model_name_or_path"] + + # Create pipeline + pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter) + + # Set up the data reader + reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest)) + reader_0.guest.task_parameters( + namespace="experiment", + name="arc_easy" + ) + + # Model loading configuration + model_conf = Loader( + module_name='fate_llm.model_zoo.hf_model', + item_name='HFAutoModelForCausalLM', + pretrained_model_name_or_path=param["model"]["pretrained_model_name_or_path"] + ).to_dict() + + # Data Processor Configuration + data_collator_conf = Loader( + module_name='fate_llm.data.data_collator.pdss_collator', + item_name='get_prefix_data_collator', + tokenizer_name_or_path=param["data_collator"]["tokenizer_name_or_path"] + ).to_dict() + + # Client reasoning initialization configuration + infer_init_conf_client = dict( + module_name='fate_llm.algo.inferdpt.init.default_init', + item_name='InferDPTAPIClientInit', + kwargs=dict( + api_url=param["inference"]["client"]["api_url"], + api_model_name=param["inference"]["client"]["api_model_name"], + api_key=param["inference"]["client"]["api_key"], + inferdpt_kit_path=param["inference"]["client"]["inferdpt_kit_path"] + ) + ) + + # Server inference initialization configuration + infer_init_conf_server = dict( + module_name='fate_llm.algo.inferdpt.init.default_init', + item_name='InferDPTAPIServerInit', + kwargs=dict( + api_url=param["inference"]["server"]["api_url"], + api_model_name=param["inference"]["server"]["api_model_name"], + api_key=param["inference"]["server"]["api_key"] + ) + ) + + # Dataset configuration + dataset_conf = dict( + module_name='fate_llm.dataset.pdss_dataset', + item_name='PrefixDataset', + kwargs=dict( + tokenizer_path=param["dataset"]["tokenizer_path"], + predict_input_template=param["dataset"]["predict_input_template"], + predict_output_template=param["dataset"]["predict_output_template"], + rationale_input_template=param["dataset"]["rationale_input_template"], + rationale_output_template=param["dataset"]["rationale_output_template"], + max_input_length=param["dataset"]["max_input_length"], + max_target_length=param["dataset"]["max_target_length"], + split_key=param["dataset"]["split_key"] + ) + ) + + # Encoding and decoding templates + encoder_prompt = param["template"]["encoder_prompt"] + decoder_prompt = param["template"]["decoder_prompt"] + instruction_prompt = param["template"]["instruction_prompt"] + + # Inference parameters + remote_inference_kwargs = param["inference_params"]["remote"] + local_inference_kwargs = param["inference_params"]["local"] + + # DeepSpeed ​​Configuration + ds_config = param["deepspeed_config"] + + # Training parameters + training_args_dict = dict( + per_device_train_batch_size=param["training"]["batch_size"], + gradient_accumulation_steps=param["training"]["gradient_accumulation_steps"], + logging_steps=param["training"]["logging_steps"], + 
max_steps=param["training"]["max_steps"], + deepspeed=ds_config, + fp16=True, + log_level=param["training"]["log_level"] + ) + + # Set the configuration of the client and server models + client_conf = dict( + model_conf=model_conf, + dataset_conf=dataset_conf, + training_args_conf=training_args_dict, + data_collator_conf=data_collator_conf, + mode=param["mode"], + infer_inst_init_conf=infer_init_conf_client, + encode_template=encoder_prompt, + instruction_template=instruction_prompt, + decode_template=decoder_prompt, + remote_inference_kwargs=remote_inference_kwargs, + local_inference_kwargs=local_inference_kwargs, + perturb_doc_key='perturbed_doc', + perturbed_response_key='perturbed_response', + result_key='infer_result' + ) + + server_conf = dict( + infer_inst_init_conf=infer_init_conf_server, + mode=param["mode"] + ) + + # Initialize HomoNN component + homo_nn_0 = HomoNN( + 'nn_0', + train_data=reader_0.outputs["output_data"], + runner_module="pdss_runner", + runner_class="PDSSRunner" + ) + + homo_nn_0.guest.task_parameters(runner_conf=client_conf) + homo_nn_0.arbiter.task_parameters(runner_conf=server_conf) + + homo_nn_0.guest.conf.set("launcher_name", "deepspeed") + + pipeline.add_tasks([reader_0, homo_nn_0]) + pipeline.conf.set("task", dict(engine_run={"cores": 1})) + pipeline.compile() + pipeline.fit() + + return pretrained_model_path + +if __name__ == "__main__": + parser = argparse.ArgumentParser("LLMSUITE PIPELINE JOB") + parser.add_argument("-c", "--config", type=str, + help="config file", default="./config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./pdss_config.yaml") + args = parser.parse_args() + main(args.config, args.param) \ No newline at end of file diff --git a/examples/pdss/pdss_config.yaml b/examples/pdss/pdss_config.yaml new file mode 100644 index 0000000..ad8ce13 --- /dev/null +++ b/examples/pdss/pdss_config.yaml @@ -0,0 +1,118 @@ +data: + guest: + namespace: experiment + name: arc_easy + host: + namespace: experiment + name: arc_easy + +# Model configuration +model: + pretrained_model_name_or_path: "Qwen-14B" # Please add your model path + +# Data Processor Configuration +data_collator: + tokenizer_name_or_path: "Qwen-14B" # Please add your model path + +# Inference initialization configuration +inference: + client: + api_url: "http://127.0.0.1:9999/v1" + api_model_name: "Qwen-14B" # Please add your model path + api_key: "demo" + inferdpt_kit_path: "/examples/pdss" # Please add your inferdpt_kit_path + server: + api_url: "http://127.0.0.1:9999/v1" + api_model_name: "Qwen-14B" # Please add your model path + api_key: "demo" + +# Dataset configuration +dataset: + tokenizer_path: "Qwen-14B" # Please add your model path + predict_input_template: | + Predict: + Question:{{question}} + Choices:{{choices.text}} + predict_output_template: | + {{choices.text[choices.label.index(answerKey)]}} + rationale_input_template: | + Explain: + Question:{{question}} + Choices:{{choices.text}} + rationale_output_template: | + {{infer_result}} + max_input_length: 64 + max_target_length: 64 + split_key: "test" + +# Encoding and decoding templates +template: + encoder_prompt: | + {{question}} + Choices:{{choices.text}} + decoder_prompt: | + Select Answer from Choices and explain it in "Rationale" with few words. Please refer to the example to write the rationale.Use to finish your rationale. + + Example(s): + Question:George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat? 
+ Choices:['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion'] + Rationale:Friction between two surfaces generates heat due to the conversion of kinetic energy into thermal energy. Dry palms produce the most heat when rubbed together as they create higher friction compared to wet or lubricated palms, which reduce friction. Therefore, the answer is 'dry palms'. + + Question:{{perturbed_doc}} + Rationale:{{perturbed_response | replace('\n', '')}} + + Please explain: + Question:{{question}} + Choices:{{choices.text}} + instruction_prompt: | + <|im_start|>system + You are a helpful assistant<|im_end|> + <|im_start|>user + Select Answer from Choices and explain it in "Rationale" with few words. Please refer to the example to write the rationale. + Use to finish your rationale. + + Example(s): + Question:Which factor will most likely cause a person to develop a fever? + Choices:['a leg muscle relaxing after exercise', 'a bacterial population in the bloodstream', 'several viral particles on the skin', 'carbohydrates being digested in the stomach'] + Rationale:A bacterial infection in the bloodstream triggers the immune system to respond, therefore often causing a fever as the body tries to fight off the bacteria. Therefore, the answer is 'a bacterial population in the bloodstream' + + Please explain: + Question:{{perturbed_doc}} + Rationale: + <|im_end|> + <|im_start|>assistant + +# Inference parameters +inference_params: + remote: + stop: ['<|im_end|>', '', '\n', '\n\n', '.\n\n\n\n\n', '<|end_of_text|>', '>\n\n\n'] + temperature: 0.01 + max_tokens: 256 + local: + stop: ['<|im_end|>', '', '\n', '\n\n', '.\n\n\n\n\n', '<|end_of_text|>', '>\n\n\n'] + temperature: 0.01 + max_tokens: 256 + +# DeepSpeed ​​Configuration +deepspeed_config: + train_micro_batch_size_per_gpu: 1 + gradient_accumulation_steps: 8 + optimizer: + type: "AdamW" + params: + lr: 5e-5 + fp16: + enabled: true + zero_optimization: + stage: 0 + +# Training parameters +training: + batch_size: 1 + gradient_accumulation_steps: 8 + logging_steps: 10 + max_steps: 30 + log_level: "info" + +# Mode Configuration +mode: "infer_and_train" diff --git a/examples/pdss/pdss_fate_test_quick_start_zh.md b/examples/pdss/pdss_fate_test_quick_start_zh.md new file mode 100644 index 0000000..dbe1a84 --- /dev/null +++ b/examples/pdss/pdss_fate_test_quick_start_zh.md @@ -0,0 +1,130 @@ +#### pdss_fate_test_quick_start_zh + +##### 1. 数据准备 + +​ 准备QA数据集,本样例中使用arc_easy数据集。可以使用qa_dataset.py中提供的工具对arc_easy数据集进行分词,并保存分词结果。记住将save_path修改为自己的路径。数据处理脚本如下 + +~~~ +# arc_easy数据处理脚本,稍微修改即可执行,执行需在fate环境下执行 +from datasets +import load_dataset +dataset = load_dataset("arc_easy") +dataset.save_to_disk('path_to_save/arc_easy') + +from fate_llm.dataset.pdss_dataset import PrefixDataset + +pds = PrefixDataset( + tokenizer_path='/data/models/Qwen-14B/', + predict_input_template="""Predict: +Question:{{question}} +Choices:{{choices.text}} +Answer: + """, + predict_output_template="""{{choices.text[choices.label.index(answerKey)]}}""", + rationale_input_template="""Explain: +Question:{{question}} +Choices:{{choices.text}} +Rationale: + """, + rationale_output_template="""{{infer_result}}""", + max_input_length=128, + max_target_length=128, + split_key='train' + ) + +pds.load('path_to_save/arc_easy') +~~~ + +​ 将数据集路径与名称和命名空间绑定。记住自己的数据集保存路径的路径。 + +~~~ +flow table bind --namespace experiment --name arc_easy --path path_to_save/arc_easy +~~~ + +##### 2. 
配置文件修改 + +​ test_pdss_llmsuite.yaml 仅需修改数据与模型的实际路径 + +~~~ +data: + - file: path_to_save/arc_easy #实际需要放置的路径 + table_name: arc_easy + namespace: experiment + role: guest_0 + - file: path_to_save/arc_easy #实际需要放置的路径 + table_name: arc_easy + namespace: experiment + role: host_0 +pdss_lora_vs_zero_shot: + pdss_lora: + pretrained: "Qwen" # 模型放置的实际路径 + script: "./pdss.py" + conf: "./pdss_config.yaml" + loader: "pdss" + model_weights_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory/pytorch_model.bin" + tasks: + - "arc_easy" + bloom_zero_shot: + pretrained: "Qwen-14B" # 模型放置的实际路径 + tasks: + - "arc_easy" +~~~ + + pdss_config.yaml 仅需修改下模型的实际路径 + +~~~ +# pdss_config.yaml 开头片段 +data: + guest: + namespace: experiment + name: arc_easy + host: + namespace: experiment + name: arc_easy + +# 模型配置 +model: + pretrained_model_name_or_path: "Qwen-14B" # 模型放置的实际路径 + +data_collator: + tokenizer_name_or_path: "Qwen-14B" # 模型的tokenizer放置的实际路径 + +inference: + client: + api_url: "http://127.0.0.1:9999/v1" + api_model_name: "Qwen-14B" # 模型放置的实际路径 + api_key: "demo" + inferdpt_kit_path: "/examples/pdss" # inferdpt_kit_path数据放置的实际路径际路径 + server: + api_url: "http://127.0.0.1:9999/v1" + api_model_name: "Qwen-14B" # 模型放置的实际路径 + api_key: "demo" + +~~~ + +##### 3. 运行 + +~~~ +# 创建 vllm 环境 +python -m venv vllm_venv +source vllm_venv/bin/activate +pip install vllm==0.4.3 +pip install numpy==1.26.4 # numpy >= 2.0.0 will raise error, so reinstall numpy<2.0.0 + +# Qwen1.5-0.5为本地llm模型保存路径 +export CUDA_VISIBLE_DEVICES=1,2 +nohup python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 9999 --model Qwen1.5-0.5 --dtype=half --enforce-eager --api-key demo --device cuda -tp 2 & +~~~ +~~~ +# 环境准备 +cd $fate_base +source fate/bin/init_env.sh + +# 命令执行 +fate_test llmsuite -i fate_llm/examples/pdss/ --yes +~~~ + +##### 4. 问题定位 + +​ 任务运行失败,报错,会在当前执行命令的目录下生成一个logs目录,找到对应的任务,检查stdout,或者exception.log检查报错原因。 + diff --git a/examples/pdss/test_pdss_llmsuite.yaml b/examples/pdss/test_pdss_llmsuite.yaml new file mode 100644 index 0000000..6909fef --- /dev/null +++ b/examples/pdss/test_pdss_llmsuite.yaml @@ -0,0 +1,24 @@ +data: + - file: examples/data/arc_e # Please add your datasets path + table_name: arc_easy + namespace: experiment + role: guest_0 + - file: examples/data/arc_e # Please add your datasets path + table_name: arc_easy + namespace: experiment + role: host_0 +pdss_qwen_vs_zero_shot: + pdss_qwen: + pretrained: "Qwen-14B" # Please add your model path + script: "./pdss.py" + conf: "./pdss_config.yaml" + loader: "pdss" + model_weights_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory/pytorch_model.bin" + tasks: + - "arc_easy" + Qwen_zero_shot: + pretrained: "Qwen-14B" # Please add your model path + tasks: + - "arc_easy" + + diff --git a/examples/pellm/pellm_fate_test_quick_start_zh.md b/examples/pellm/pellm_fate_test_quick_start_zh.md new file mode 100644 index 0000000..ec2f0b6 --- /dev/null +++ b/examples/pellm/pellm_fate_test_quick_start_zh.md @@ -0,0 +1,73 @@ +#### pellm_fate_test_quick_start_zh + +##### 1. 数据准备 + +​ 用的是一个广告测试生成的数据集,可以从以下链接下载数据集并将其放置在fate_llm/examples/data文件夹中, 同时 $fate_base/fate_llm/python/evaluate/tasks/advertise_gen下也要放置数据train.json, dev.json +​ [data_link_1](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view) + +##### 2. 
配置文件修改 + +​ test_pellm_llmsuite.yaml仅需修改数据与模型的实际路径 + +~~~ +# fate_llm/examples/pellm/test_pellm_llmsuite.yaml + +data: + - file: examples/data/AdvertiseGen/train.json # 下载数据放置的实际路径 + table_name: ad + namespace: experiment + role: guest_0 + - file: examples/data/AdvertiseGen/train.json # 下载数据放置的实际路径 + table_name: ad + namespace: experiment + role: host_0 +bloom_lora_vs_zero_shot: + bloom_lora: + pretrained: "bloom-560m" # 模型放置的实际路径 + script: "./test_bloom_lora.py" + conf: "./bloom_lora_config.yaml" + peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" + tasks: + - "advertise-gen" + bloom_zero_shot: + pretrained: "bloom-560m" + tasks: + - "advertise-gen" +~~~ + + bloom_lora_config.yaml仅需修改模型路径 + +~~~ +# bloom_lora_config.yaml开头片段 + +data: + guest: + namespace: experiment + name: ad + host: + namespace: experiment + name: ad +epoch: 1 +batch_size: 4 +lr: 5e-4 +pretrained_model_path: bloom-560m # 模型放置的实际路径 +~~~ + +##### 3. 运行 + +~~~ +# 环境准备 +cd /data/projects/fate/ +source /data/projects/fate/bin/init_env.sh + +# 命令执行 +fate_test llmsuite -i fate_llm/examples/pellm/ --yes +~~~ + +##### 4. 问题定位 + +​ 任务运行失败,报错,会在当前执行命令的目录下生成一个logs目录,找到对应的任务,检查stdout,或者exception.log检查报错原因。 + + + + diff --git a/examples/pellm/test_pellm_llmsuite.yaml b/examples/pellm/test_pellm_llmsuite.yaml index c9bb689..0fc43d2 100644 --- a/examples/pellm/test_pellm_llmsuite.yaml +++ b/examples/pellm/test_pellm_llmsuite.yaml @@ -1,21 +1,21 @@ data: - - file: examples/data/AdvertiseGen/train.json + - file: examples/data/AdvertiseGen/train.json # Please add your datasets path table_name: ad namespace: experiment role: guest_0 - - file: examples/data/AdvertiseGen/train.json + - file: examples/data/AdvertiseGen/train.json # Please add your datasets path table_name: ad namespace: experiment role: host_0 bloom_lora_vs_zero_shot: bloom_lora: - pretrained: "bloom-560m" + pretrained: "bloom-560m" # Please add your model path script: "./test_bloom_lora.py" conf: "./bloom_lora_config.yaml" peft_path_format: "{{fate_base}}/fate_flow/model/{{job_id}}/guest/{{party_id}}/{{model_task_name}}/0/output/output_model/model_directory" tasks: - "advertise-gen" bloom_zero_shot: - pretrained: "bloom-560m" + pretrained: "bloom-560m" # Please add your model path tasks: - "advertise-gen" \ No newline at end of file diff --git a/python/fate_llm/algo/inferdpt/init/default_init.py b/python/fate_llm/algo/inferdpt/init/default_init.py index 0050bfd..7a61adc 100644 --- a/python/fate_llm/algo/inferdpt/init/default_init.py +++ b/python/fate_llm/algo/inferdpt/init/default_init.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import os + from fate_llm.algo.inferdpt.init._init import InferInit from fate_llm.inference.api import APICompletionInference from fate_llm.algo.inferdpt import inferdpt @@ -24,15 +26,30 @@ class InferDPTAPIClientInit(InferInit): api_url = '' api_model_name = '' - api_key = 'EMPTY' + api_key = os.environ.get("FATE_LLM_API_KEY", "") inferdpt_kit_path = '' - eps = 3.0 + eps = os.environ.get("FATE_LLM_INFERDPT_EPS", 3.0) - def __init__(self, ctx): + def __init__(self, ctx, api_url=None, api_model_name=None, inferdpt_kit_path=None, api_key=None, eps=None): super().__init__(ctx) self.ctx = ctx + if api_url is not None: + self.api_url = api_url + if api_model_name is not None: + self.api_model_name = api_model_name + if inferdpt_kit_path is not None: + self.inferdpt_kit_path = inferdpt_kit_path + if api_key is not None: + self.api_key = api_key + if eps is not None: + self.eps = eps + def get_inst(self)-> InferDPTClient: + if self.api_key is None: + raise ValueError("Please set the environment variable FATE_LLM_API_KEY for llm querying") + if self.api_model_name is None or self.api_url is None: + raise ValueError(f"api_model_name={self.api_model_name} and api_url={self.api_url} should not be None") inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) kit = InferDPTKit.load_from_path(self.inferdpt_kit_path) inferdpt_client = inferdpt.InferDPTClient(self.ctx, kit, inference, epsilon=self.eps) @@ -43,13 +60,22 @@ class InferDPTAPIServerInit(InferInit): api_url = '' api_model_name = '' - api_key = 'EMPTY' + api_key = os.environ.get("FATE_LLM_API_KEY", "") - def __init__(self, ctx): + def __init__(self, ctx, api_url, api_model_name, api_key=None): super().__init__(ctx) self.ctx = ctx + self.api_url = api_url + self.api_model_name = api_model_name + + if api_key is not None: + self.api_key = api_key - def get_inst(self)-> InferDPTServer: + def get_inst(self) -> InferDPTServer: + if self.api_key is None: + raise ValueError("Please set the environment variable FATE_LLM_API_KEY for llm querying") + if self.api_model_name is None or self.api_url is None: + raise ValueError(f"api_model_name={self.api_model_name} and api_url={self.api_url} should not be None") inference = APICompletionInference(api_url=self.api_url, model_name=self.api_model_name, api_key=self.api_key) inferdpt_server = inferdpt.InferDPTServer(self.ctx,inference_inst=inference) - return inferdpt_server + return inferdpt_server \ No newline at end of file diff --git a/python/fate_llm/evaluate/scripts/eval_cli.py b/python/fate_llm/evaluate/scripts/eval_cli.py index 5f5a0ad..750a96d 100644 --- a/python/fate_llm/evaluate/scripts/eval_cli.py +++ b/python/fate_llm/evaluate/scripts/eval_cli.py @@ -28,6 +28,15 @@ from ..utils._io import echo from ..utils._parser import LlmSuite +from typing import List, Tuple, Union, Optional +from transformers import AutoTokenizer +from lm_eval.models.huggingface import HFLM +import torch +import transformers +from fate_llm.evaluate.utils.model_tools import load_by_loader_OT, load_by_loader_PDSS +from fate_llm.evaluate.utils import llm_evaluator +from transformers import AutoModelForCausalLM + @click.command('evaluate') @click.option('-i', '--include', required=True, type=click.Path(exists=True), help='Path to model and metrics conf') @@ -67,6 +76,42 @@ def run_evaluate(ctx, include, eval_config, result_output, **kwargs): # run_suite_eval(suite, eval_config_dict, result_output) run_suite_eval(suite, eval_config, result_output) +class CustomLM(HFLM): + 
def __init__(self, pretrained: torch.nn.Module, + model_path: str, + tokenizer: Optional[Union[str, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]] = None, + rank=0, world_size=1, **kwargs): + + super().__init__(pretrained=model_path, **kwargs) + + self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + self._model = pretrained.to(self._device) + + if tokenizer is None: + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + else: + self.tokenizer = tokenizer + + if self.tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + + self._rank = rank + self._world_size = world_size + self.batch_size_per_gpu = 4 + + self._config = model_path + self._max_length = self._config.max_length if hasattr(self._config, 'max_length') else 1024 + self._logits_cache = None + + def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[Tuple[float, bool]]: + + return [(0.0, True) for _ in requests] + + def generate_until(self, requests: List[Tuple[str, str]]) -> List[str]: + + return ["Generated text" for _ in requests] + def run_job_eval(job, eval_conf): job_eval_conf = {} if isinstance(eval_conf, dict): @@ -74,7 +119,6 @@ def run_job_eval(job, eval_conf): elif eval_conf is not None and os.path.exists(eval_conf): with open(eval_conf, 'r') as f: job_eval_conf.update(yaml.safe_load(f)) - # echo.echo(f"Evaluating job: {job.job_name} with tasks: {job.tasks}") if job.eval_conf_path: # job-level eval conf takes priority @@ -84,12 +128,25 @@ def run_job_eval(job, eval_conf): if job.loader: if job.peft_path: model = load_by_loader(loader_name=job.loader, - loader_conf_path=loader_conf_path, + # loader_conf_path=loader_conf_path, peft_path=job.peft_path) - else: - model = load_by_loader(loader_name=job.loader, - loader_conf_path=loader_conf_path) - result = evaluate(model=model, tasks=job.tasks, include_path=job.include_path, **job_eval_conf) + + result = evaluate(model=model, tasks=job.tasks, include_path=job.include_path, **job_eval_conf) + if job.model_weights_format: + if job.loader == 'ot': + loaded_model = load_by_loader_OT(trained_weights=job.model_weights_format, loader_conf=job.loader_conf_path, model_path=job.pretrained_model_path) + if job.loader == 'pdss': + loaded_model = load_by_loader_PDSS(trained_weights=job.model_weights_format,model_path=job.pretrained_model_path) + + loaded_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + custom_lm = CustomLM(pretrained=loaded_model,model_path=job.pretrained_model_path) + + llm_evaluator.init_tasks() + #result = llm_evaluator.evaluate(model=gpt2_lm, tasks="sciq") + result = llm_evaluator.evaluate(model=custom_lm, tasks=job.tasks) + + else: # feed in pretrained & peft path job_eval_conf["model_args"]["pretrained"] = job.pretrained_model_path diff --git a/python/fate_llm/evaluate/utils/_parser.py b/python/fate_llm/evaluate/utils/_parser.py index 057e66b..ae5318a 100644 --- a/python/fate_llm/evaluate/utils/_parser.py +++ b/python/fate_llm/evaluate/utils/_parser.py @@ -19,12 +19,11 @@ import typing from pathlib import Path - class LlmJob(object): def __init__(self, job_name: str, script_path: Path=None, conf_path: Path=None, model_task_name: str=None, - pretrained_model_path: Path=None, peft_path: Path=None, + pretrained_model_path: Path=None, peft_path: Path=None, model_weights: Path=None, eval_conf_path: Path=None, loader: str=None, loader_conf_path: Path=None, - tasks: typing.List[str]=None, include_path: Path=None, peft_path_format: 
str=None): + tasks: typing.List[str]=None, include_path: Path=None, peft_path_format: str=None, model_weights_format: str=None,requires_untar: str=None): self.job_name = job_name self.script_path = script_path self.conf_path = conf_path @@ -38,7 +37,9 @@ def __init__(self, job_name: str, script_path: Path=None, conf_path: Path=None, self.include_path = include_path self.evaluate_only = self.script_path is None self.peft_path_format = peft_path_format - + self.model_weights_format = model_weights_format + self.model_weights = model_weights + self.requires_untar = requires_untar class LlmPair(object): def __init__( @@ -55,6 +56,7 @@ def __init__( self.pairs = pairs self.path = path self.dataset = dataset + self.suite_name = Path(self.path).stem self._final_status = {} @staticmethod @@ -92,31 +94,52 @@ def load(path: Path): if peft_path and not os.path.isabs(peft_path): peft_path = path.parent.joinpath(peft_path).resolve() + model_weights = job_configs.get("weights", None) + if model_weights and not os.path.isabs(model_weights): + model_weights = path.parent.joinpath(model_weights).resolve() + + requires_untar = job_configs.get("untar", None) + if requires_untar and not os.path.isabs(requires_untar): + requires_untar = path.parent.joinpath(requires_untar).resolve() + eval_conf_path = job_configs.get("eval_conf", None) if eval_conf_path and not os.path.isabs(eval_conf_path): eval_conf_path = path.parent.joinpath(eval_conf_path).resolve() loader = job_configs.get("loader", None) - if job_configs.get("loader_conf"): - loader_conf_path = path.parent.joinpath(job_configs["loader_conf"]).resolve() + # loader_conf + loader_conf = job_configs.get("loader_conf", None) + if isinstance(loader_conf, dict): + loader_conf_data = loader_conf + loader_conf_path = None + elif isinstance(loader_conf, str): + loader_conf_path = path.parent.joinpath(loader_conf).resolve() + loader_conf_data = None else: - loader_conf_path = "" + loader_conf_data = None + loader_conf_path = None + tasks = job_configs.get("tasks", []) include_path = job_configs.get("include_path", "") if include_path and not os.path.isabs(include_path): include_path = path.parent.joinpath(job_configs["include_path"]).resolve() - peft_path_format = job_configs.get("peft_path_format", "{{fate_base}}/fate_flow/model/{{job_id}}/" - "guest/{{party_id}}/{{model_task_name}}/0/" - "output/output_model/model_directory") + peft_path_format = job_configs.get("peft_path_format",None) + model_weights_format = job_configs.get("model_weights_format",None) + + requires_untar = job_configs.get("requires_untar",None) + jobs.append( LlmJob( job_name=job_name, script_path=script_path, conf_path=conf_path, model_task_name=model_task_name, pretrained_model_path=pretrained_model_path, peft_path=peft_path, eval_conf_path=eval_conf_path, - loader=loader, loader_conf_path=loader_conf_path, tasks=tasks, include_path=include_path, - peft_path_format=peft_path_format + loader=loader, loader_conf_path=loader_conf_data, tasks=tasks, include_path=include_path, + peft_path_format=peft_path_format, + model_weights_format=model_weights_format, + model_weights=model_weights, + requires_untar=requires_untar ) ) @@ -127,7 +150,7 @@ def load(path: Path): ) suite = LlmSuite(pairs=pairs, path=path) return suite - + def update_status( self, pair_name, job_name, job_id=None, status=None, exception_id=None, time_elapsed=None, event=None ): @@ -138,3 +161,5 @@ def update_status( def get_final_status(self): return self._final_status + + \ No newline at end of file diff --git 
a/python/fate_llm/evaluate/utils/model_tools.py b/python/fate_llm/evaluate/utils/model_tools.py index 05874a5..53c0521 100644 --- a/python/fate_llm/evaluate/utils/model_tools.py +++ b/python/fate_llm/evaluate/utils/model_tools.py @@ -16,8 +16,12 @@ import os from transformers import AutoModel, AutoTokenizer +from transformers import AutoModelForCausalLM from lm_eval.models.huggingface import HFLM - +from fate_llm.model_zoo.offsite_tuning.gpt2 import GPT2LMHeadMainModel,GPT2LMHeadSubModel +from ..utils._io import echo +import torch +import importlib def load_model_from_path(model_path, peft_path=None, peft_config=None, model_args=None): model_args = model_args or {} @@ -27,7 +31,6 @@ def load_model_from_path(model_path, peft_path=None, peft_config=None, model_arg else: raise ValueError(f"given model path is not valid, please check: {model_path}") else: - import torch from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = AutoModel.from_pretrained(model_path, trust_remote_code=True) @@ -48,4 +51,48 @@ def load_model(model_path, peft_path=None, model_args=None): def load_by_loader(loader_name=None, loader_conf_path=None, peft_path=None): #@todo: find loader fn & return loaded model - pass \ No newline at end of file + pass + +def load_by_loader_OT(trained_weights=None, loader_conf=None, model_path=None): + + if not isinstance(loader_conf, dict): + raise ValueError("loader_conf must be a dictionary") + + module_name = loader_conf.get('module_name') + item_name = loader_conf.get('item_name') + + if not module_name or not item_name: + raise ValueError("loader_conf must contain 'module_name' and 'item_name' keys") + + module_name = 'fate_llm.model_zoo.' + module_name + + try: + module = importlib.import_module(module_name) + except ImportError as e: + raise ImportError(f"Failed to import module {module_name}: {e}") + + try: + ModelClass = getattr(module, item_name) + except AttributeError as e: + raise AttributeError(f"Module {module_name} does not have a class named {item_name}: {e}") + + model_init_params = {key: value for key, value in loader_conf.items() if key not in ['module_name', 'item_name']} + model_init_params['model_name_or_path'] = model_path + + model = ModelClass(**model_init_params) + + state_dict = torch.load(trained_weights, map_location='cuda') + model.load_state_dict(state_dict) + + return model + +def load_by_loader_PDSS(trained_weights=None, model_path=None): + + model_weights_path = trained_weights + + model = AutoModelForCausalLM.from_pretrained(model_path) + + state_dict = torch.load(model_weights_path, map_location='cuda') + model.load_state_dict(state_dict) + + return model \ No newline at end of file diff --git a/python/fate_llm/runner/pdss_runner.py b/python/fate_llm/runner/pdss_runner.py index 423dbf3..deea37b 100644 --- a/python/fate_llm/runner/pdss_runner.py +++ b/python/fate_llm/runner/pdss_runner.py @@ -112,7 +112,7 @@ def _get_infer_inst(self, init_conf): if init_conf is None: return None loader = Loader.from_dict(init_conf) - init_inst = loader.load_item()(self.get_context()) + init_inst = loader.load_item()(self.get_context(), **init_conf.get("kwargs", {})) assert isinstance(init_inst, InferInit), 'Need a InferInit class for initialization, but got {}'.format(type(init_inst)) infer_inst = init_inst.get_inst() logger.info('inferdpt inst loaded')
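
For context on the last hunk above: `_get_infer_inst` now splats the `kwargs` entry of `init_conf` into the `InferInit` subclass constructor, so the inferdpt endpoint can be configured per job instead of relying only on class-level defaults and environment variables. Below is a minimal sketch of what such a conf might look like, assuming the same `module_name`/`item_name`/`kwargs` layout used by the other loader confs in this patch; the URL, model name, and kit path are placeholders.

~~~
# Illustrative only -- placeholder endpoint, model name and kit path.
init_conf = dict(
    module_name="fate_llm.algo.inferdpt.init.default_init",
    item_name="InferDPTAPIClientInit",
    kwargs=dict(
        api_url="http://127.0.0.1:8000/v1",
        api_model_name="your-model-name",
        inferdpt_kit_path="/path/to/inferdpt_kit",
        eps=3.0,
    ),
)

# The patched runner then effectively does:
#   loader = Loader.from_dict(init_conf)
#   init_inst = loader.load_item()(ctx, **init_conf.get("kwargs", {}))
# which resolves to InferDPTAPIClientInit(ctx, api_url=..., api_model_name=..., ...);
# any argument left as None falls back to the FATE_LLM_API_KEY /
# FATE_LLM_INFERDPT_EPS environment variables read at class definition time.
~~~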
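Elsewhere in this patch, `load_by_loader_OT` in model_tools.py expects `loader_conf` to be a dict carrying `module_name` (resolved relative to `fate_llm.model_zoo.`) and `item_name`; every other key is forwarded to the model constructor together with `model_name_or_path`, and the trained state dict is then loaded into the instance. A hedged usage sketch follows; the checkpoint path and layer counts are placeholders (they must match the trained job), and `GPT2LMHeadMainModel` is simply the offsite-tuning class already imported by the module.

~~~
# Illustrative only -- the .pth path and layer counts are placeholders.
import torch
from fate_llm.evaluate.utils.model_tools import load_by_loader_OT

loader_conf = {
    "module_name": "offsite_tuning.gpt2",   # expanded to fate_llm.model_zoo.offsite_tuning.gpt2
    "item_name": "GPT2LMHeadMainModel",
    # remaining keys are passed straight to the model constructor
    "emulator_layer_num": 4,
    "adapter_top_layer_num": 2,
    "adapter_bottom_layer_num": 2,
}

model = load_by_loader_OT(
    trained_weights="/path/to/offsite_tuning_weights.pth",
    loader_conf=loader_conf,
    model_path="gpt2",                       # becomes model_name_or_path
)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
~~~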
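The new `CustomLM` wrapper in eval_cli.py lets an already-loaded in-memory model be driven by lm_eval, mirroring the code path added to `run_job_eval`. The sketch below is illustrative only: the GPT-2 checkpoint and the sciq task are placeholder choices, and the import path simply reflects where the class is defined in this patch.

~~~
# Illustrative only -- model and task are placeholders.
import torch
from transformers import AutoModelForCausalLM
from fate_llm.evaluate.scripts.eval_cli import CustomLM
from fate_llm.evaluate.utils import llm_evaluator

loaded_model = AutoModelForCausalLM.from_pretrained("gpt2")
loaded_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Wrap the in-memory model; tokenizer and config are re-read from the
# original pretrained path given as model_path.
custom_lm = CustomLM(pretrained=loaded_model, model_path="gpt2")

llm_evaluator.init_tasks()
result = llm_evaluator.evaluate(model=custom_lm, tasks="sciq")
~~~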