diff --git a/.gitignore b/.gitignore index 1ee937f0cffa..5f29895be914 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,4 @@ autogen/ #fp8 ops/csrc/fp8/deep_gemm/include/cutlass ops/csrc/fp8/deep_gemm/include/cute -.ccls-cache +.ccls-cache \ No newline at end of file diff --git a/slm/pipelines/examples/contrastive_training/README.md b/slm/pipelines/examples/contrastive_training/README.md index d3148786ec06..c33ed2941b6e 100644 --- a/slm/pipelines/examples/contrastive_training/README.md +++ b/slm/pipelines/examples/contrastive_training/README.md @@ -1,17 +1,23 @@ # 向量检索模型训练 -推荐安装 gpu 版本的[PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/conda/linux-conda.html),以 cuda12.3的 paddle 为例,安装命令如下: +推荐安装 gpu 版本的[PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/conda/linux-conda.html),以 cuda11.8的 paddle 为例,安装命令如下: ``` -conda install nccl -c conda-forge -conda install paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ -c conda-forge -``` -安装其他依赖: -``` -pip install git+https://github.com/PaddlePaddle/PaddleNLP.git@develop -pip install -r requirements.txt +# 创建一个名为 paddle_env 的新环境,并激活 +conda create --name paddle_env python=3.10 +conda activate paddle_env + +# 安装 paddlenlp develop版本 +pip install --pre --upgrade paddlenlp -f https://www.paddlepaddle.org.cn/whl/paddlenlp.html + +# 安装 paddlepaddle-gpu nightly版本 +pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ + +#安装其他依赖: +pip install -r slm/pipelines/examples/contrastive_training/requirements.txt ``` + 下载 DuReader-Retrieval 中文数据集: ``` cd data @@ -45,7 +51,7 @@ python train.py --do_train \ python -m paddle.distributed.launch --gpus "0,1,2,3" train.py --do_train \ --model_name_or_path rocketqa-zh-base-query-encoder \ --output_dir rocketqa-zh-base-query-encoder-duretrieval \ - --train_data ./data/dual.train.json \ + --train_data ./data/dureader_dual.train.jsonl \ --overwrite_output_dir \ --fine_tune_type sft \ --sentence_pooling_method cls \ @@ -132,7 +138,7 @@ python evaluation/benchmarks.py --model_type bert \ --passage_max_length 512 \ ``` 可配置参数包括: -- `model_type`: 模型的类似,可选 bert 或 roberta 等等 +- `model_type`: 模型的类型,可选 bert 或 roberta 等等 - `query_model`: query 向量模型的路径 - `passage_model`: passage 向量模型的路径 - `query_max_length`: query 的最大长度 @@ -179,7 +185,7 @@ python -u evaluation/eval_mteb.py \ - `add_bos_token`:是否添加起始符,0表示不添加,1表示添加 - `add_eos_token`:是否添加结束符,0表示不添加,1表示添加 -# MTEB 评估 +## MTEB 评估 [MTEB](https://github.com/embeddings-benchmark/mteb) 是一个大规模文本嵌入评测基准,包含了丰富的向量检索评估任务和数据集。 本仓库主要面向其中的英文检索任务(Retrieval),并额外支持针对 MSMARCO-Title 的评估。 @@ -261,6 +267,39 @@ MTEB-Retrieval 数据集, NDCG@10分数: | LLARA-passage | 52.48 | 47.51 | 26.13 | 37.26 | 44.12 | 81.09 | 43.98 | 69.17 | 45.49 | 37.07 | 61.76 | 82.29 | 17.30 | 76.07 | 36.73 | 81.30 | +## 压缩 + +### 模型删层 +模型剪枝脚本 `shortgpt_prune.py`,用于评估并移除大语言模型中重要性较低的层,以生成一个更小、更高效的模型。该脚本采用“块影响”度量来计算层的重要性,并直接在内存中完成剪枝和保存,流程高效。 + +#### 使用方法 + +通过以下命令执行剪枝脚本。可指定原始模型、输出路径、要剪枝的层数以及模型中transformer层的路径。 + +以repllama-v1-7b-lora-passage为例: +```bash +python shortgpt_prune.py \ + --model_name_or_path castorini/repllama-v1-7b-lora-passage \ + --output_model_path ./pruned-repllama-v1-7b-lora-passage \ + --n_prune_layers 6 \ + --layers_path "llama.layers" +``` + +以NV-Embed-v1为例: +```bash +python shortgpt_prune.py \ + --model_name_or_path nvidia/NV-Embed-v1 \ + --output_model_path /pruned-NV-Embed-v1_pruned_26 \ + --n_prune_layers 6 \ + --layers_path "layers" +``` +可配置参数包括: +- 
`--model_name_or_path`: 原始模型的名称或本地路径。 +- `--output_model_path`: 剪枝后模型的保存路径。 +- `--n_prune_layers`: 希望移除的层数。脚本会自动找出最不重要的N层。 +- `--layers_path`: 模型对象中指向transformer层列表的点分隔路径(例如repllama为`"llama.layers"`, llama为`"model.layers"`)。 + +可用output_model_path路径中的模型跑评估[评估部分的代码](#评估) ## Reference @@ -281,3 +320,5 @@ MTEB-Retrieval 数据集, NDCG@10分数: [8] Yingqi Qu, Yuchen Ding, Jing Liu, Kai Liu, Ruiyang Ren, Wayne Xin Zhao, Daxiang Dong, Hua Wu, Haifeng Wang: RocketQA: An Optimized Training Approach to Dense Passage Retrieval for Open-Domain Question Answering. NAACL 2021 [9] Ruiyang Ren, Yingqi Qu, Jing Liu, Wayne Xin Zhao, Qiaoqiao She, Hua Wu, Haifeng Wang, Ji-Rong Wen: RocketQAv2: A Joint Training Method for Dense Passage Retrieval and Passage Re-ranking. EMNLP 2021 + +[10] Xin Men, Mingyu Xu, Qingyu Zhang, Bingning Wang, Hongyu Lin, Yaojie Lu, Xianpei Han, Weipeng Chen: Shortgpt: Layers in large language models are more redundant than you expect. ACL Findings 2025 \ No newline at end of file diff --git a/slm/pipelines/examples/contrastive_training/evaluation/benchmarks.py b/slm/pipelines/examples/contrastive_training/evaluation/benchmarks.py index 62f00a11570f..327604e3f87f 100644 --- a/slm/pipelines/examples/contrastive_training/evaluation/benchmarks.py +++ b/slm/pipelines/examples/contrastive_training/evaluation/benchmarks.py @@ -21,7 +21,7 @@ from datasets import load_dataset from mteb.abstasks import AbsTaskRetrieval -from prediction import Eval_modle +from prediction import Eval_model csv.field_size_limit(500 * 1024 * 1024) @@ -51,7 +51,7 @@ def __init__( pooling_mode="mean_tokens", **kwargs, ): - self.query_model = Eval_modle( + self.query_model = Eval_model( model=query_model, max_seq_len=max_seq_len, batch_size=batch_size, diff --git a/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.py b/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.py index eb676d23f8f2..5813a1117880 100644 --- a/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.py +++ b/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.py @@ -29,7 +29,7 @@ class MSMARCOTITLE(AbsTaskRetrieval): metadata = TaskMetadata( dataset={ - "corpus_path": "Tevatron/msmarco-passage-corpus", + "corpus_path": "Tevatron/msmarco-passage-corpus-new", "path": "mteb/msmarco", "revision": "c5a29a104738b98a9e76336939199e264163d4a0", }, @@ -53,6 +53,9 @@ class MSMARCOTITLE(AbsTaskRetrieval): bibtex_citation=None, n_samples=None, avg_character_length=None, + modalities=["text"], + sample_creation="created", + descriptive_stats={}, ) def load_data(self, **kwargs): diff --git a/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.sh b/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.sh index 262496a3f122..52e146381bf7 100644 --- a/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.sh +++ b/slm/pipelines/examples/contrastive_training/evaluation/eval_mteb.sh @@ -12,18 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
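+# Evaluates the embedding models listed in MODELS_TO_RUN on the MTEB retrieval
+# tasks listed in TASKS, calling evaluation/eval_mteb.py once per (model, task)
+# pair. Assumed to be launched from the contrastive_training directory, since
+# evaluation/eval_mteb.py is referenced by a relative path, e.g.:
+#   bash evaluation/eval_mteb.sh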
+#!/bin/bash -for task in "ArguAna" "ClimateFEVER" "DBPedia" "FEVER" "FiQA2018" "HotpotQA" "MSMARCO" "NFCorpus" "NQ" "QuoraRetrieval" "SCIDOCS" "SciFact" "Touche2020" "TRECCOVID" "CQADupstackAndroidRetrieval" "CQADupstackEnglishRetrieval" "CQADupstackGamingRetrieval" "CQADupstackGisRetrieval" "CQADupstackMathematicaRetrieval" "CQADupstackPhysicsRetrieval" "CQADupstackProgrammersRetrieval" "CQADupstackStatsRetrieval" "CQADupstackTexRetrieval" "CQADupstackUnixRetrieval" "CQADupstackWebmastersRetrieval" "CQADupstackWordpressRetrieval" "MSMARCOTITLE" -do +# --- Script Configuration --- +# Exit immediately if a command exits with a non-zero status. +set -e - # 1. RocketQA V1 - python3.10 -u eval_mteb.py \ - --corpus_model_name_or_path rocketqa-en-base-v1/passage_model \ - --query_model_name_or_path rocketqa-en-base-v1/query_model \ +# Define the list of all tasks (datasets) to be evaluated. +# TASKS=( +# "ArguAna" "ClimateFEVER" "DBPedia" "FEVER" "FiQA2018" "HotpotQA" "MSMARCO" "NFCorpus" "NQ" "QuoraRetrieval" +# "SCIDOCS" "SciFact" "Touche2020" "TRECCOVID" "CQADupstackAndroidRetrieval" "CQADupstackEnglishRetrieval" +# "CQADupstackGamingRetrieval" "CQADupstackGisRetrieval" "CQADupstackMathematicaRetrieval" "CQADupstackPhysicsRetrieval" +# "CQADupstackProgrammersRetrieval" "CQADupstackStatsRetrieval" "CQADupstackTexRetrieval" "CQADupstackUnixRetrieval" +# "CQADupstackWebmastersRetrieval" "CQADupstackWordpressRetrieval" "MSMARCOTITLE" +# ) + +TASKS=("ArguAna" "SCIDOCS" "FEVER") + + +# You can uncomment the models you wish to evaluate. +# MODELS_TO_RUN=("RocketQA-V1" "RocketQA-V2" "BGE" "RepLLaMA" "NV-Embed-v1" "BGE-EN-ICL" "LLARA-passage") +MODELS_TO_RUN=("BGE") + + +# =================================================================================== +# 🚀 1. RocketQA V1 +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " RocketQA-V1 " ]]; then + echo "===== Running Evaluation for Model: RocketQA V1 =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 -u evaluation/eval_mteb.py \ + --corpus_model_name_or_path rocketqa-v1-marco-para-encoder \ + --query_model_name_or_path rocketqa-v1-marco-query-encoder \ --model_flag RocketQA-V1 \ --output_folder en_results/rocketqa-en-base-v1 \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --query_instruction "" \ --document_instruction "" \ --max_seq_length 512 \ @@ -31,15 +56,24 @@ do --dtype "float32" \ --padding_side right \ --pooling_method "cls" + done +fi + - # 2. RocketQA V2 - python3.10 -u eval_mteb.py \ - --corpus_model_name_or_path rocketqa-en-base-v2/passage_model \ - --query_model_name_or_path rocketqa-en-base-v2/query_model \ +# =================================================================================== +# 🚀 2. 
RocketQA V2 +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " RocketQA-V2 " ]]; then + echo "===== Running Evaluation for Model: RocketQA V2 =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 -u evaluation/eval_mteb.py \ + --corpus_model_name_or_path rocketqav2-en-marco-para-encoder \ + --query_model_name_or_path rocketqav2-en-marco-query-encoder \ --model_flag RocketQA-V2 \ --output_folder en_results/rocketqa-en-base-v2 \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --query_instruction "" \ --document_instruction "" \ --max_seq_length 512 \ @@ -47,27 +81,45 @@ do --dtype "float32" \ --padding_side right \ --pooling_method "cls" + done +fi + - # 3. BGE - python3.10 eval_mteb.py \ +# =================================================================================== +# 🎯 3. BGE (BAAI/bge-large-en-v1.5) +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " BGE " ]]; then + echo "===== Running Evaluation for Model: BGE (bge-large-en-v1.5) =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 evaluation/eval_mteb.py \ --base_model_name_or_path BAAI/bge-large-en-v1.5 \ - --output_folder en_results/bge-large-en-v1.5 \ + --output_folder en_results/bge-large-en-v1.5_2 \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --document_instruction 'Represent this sentence for searching relevant passages: ' \ --pooling_method mean \ --max_seq_length 512 \ --eval_batch_size 32 \ --padding_side right \ --add_bos_token 0 \ - --add_eos_token 0 + --add_eos_token 0 + done +fi - # 4. RepLLaMA - python3.10 eval_mteb.py \ + +# =================================================================================== +# 🦙 4. RepLLaMA +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " RepLLaMA " ]]; then + echo "===== Running Evaluation for Model: RepLLaMA =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 evaluation/eval_mteb.py \ --base_model_name_or_path castorini/repllama-v1-7b-lora-passage \ --output_folder en_results/repllama-v1-7b-lora-passage \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --query_instruction 'query: ' \ --document_instruction 'passage: ' \ --pooling_method last \ @@ -76,22 +128,40 @@ do --padding_side right \ --add_bos_token 0 \ --add_eos_token 1 + done +fi + - # 5. NV-Embed-v1 - python3.10 eval_mteb.py \ +# =================================================================================== +# Nvidia 5. 
NV-Embed-v1 +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " NV-Embed-v1 " ]]; then + echo "===== Running Evaluation for Model: NV-Embed-v1 =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 evaluation/eval_mteb.py \ --base_model_name_or_path nvidia/NV-Embed-v1 \ --output_folder en_results/nv-embed-v1 \ --query_instruction "Given a claim, find documents that refute the claim" \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --eval_batch_size 8 + done +fi + - # 6. BGE-EN-ICL - python3.10 eval_mteb.py \ +# =================================================================================== +# 🎯 6. BGE-EN-ICL +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " BGE-EN-ICL " ]]; then + echo "===== Running Evaluation for Model: BGE-EN-ICL =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 evaluation/eval_mteb.py \ --base_model_name_or_path BAAI/bge-en-icl \ --output_folder en_results/bge-en-icl \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --query_instruction $' Given a scientific claim, retrieve documents that support or refute the claim.\n' \ --max_seq_length 512 \ --eval_batch_size 32 \ @@ -99,18 +169,31 @@ do --padding_side left \ --add_bos_token 1 \ --add_eos_token 1 + done +fi - # 7. LLARA-passage - python3.10 eval_mteb.py \ + +# =================================================================================== +# 🦙 7. LLARA-passage +# =================================================================================== +if [[ " ${MODELS_TO_RUN[*]} " =~ " LLARA-passage " ]]; then + echo "===== Running Evaluation for Model: LLARA-passage =====" + for task in "${TASKS[@]}"; do + echo "--- Task: $task ---" + python3.10 evaluation/eval_mteb.py \ --base_model_name_or_path BAAI/LLARA-passage \ --output_folder en_results/llara-passage \ --task_name "$task" \ - --task_split $(if [[ "$task" == *"MSMARCO"* ]]; then echo "dev"; else echo "test"; fi) \ + --task_split $([[ "$task" == *"MSMARCO"* ]] && echo "dev" || echo "test") \ --eval_batch_size 8 \ --pooling_method last_8 \ --model_flag llara \ --add_bos_token 1 \ --add_eos_token 0 \ --max_seq_length 532 + done +fi + + -done \ No newline at end of file +echo "All specified evaluations are complete." \ No newline at end of file diff --git a/slm/pipelines/examples/contrastive_training/evaluation/prediction.py b/slm/pipelines/examples/contrastive_training/evaluation/prediction.py index 9cbbfb7edc3d..8311a2efafdd 100644 --- a/slm/pipelines/examples/contrastive_training/evaluation/prediction.py +++ b/slm/pipelines/examples/contrastive_training/evaluation/prediction.py @@ -11,20 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
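+# Eval_model wraps paddlenlp.transformers.BiEncoderModel and is used by
+# evaluation/benchmarks.py to batch-encode queries and passages into embeddings.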
-import os -import sys - import numpy as np import paddle from paddlenlp.data import DataCollatorWithPadding -from paddlenlp.transformers import AutoTokenizer - -sys.path.append(os.path.abspath(".")) -from models.modeling import BiEncoderModel +from paddlenlp.transformers import AutoTokenizer, BiEncoderModel -class Eval_modle: +class Eval_model: def __init__( self, model: str = None, @@ -45,10 +39,8 @@ def _construct_model(self): Construct the inference model for the predictor. """ if self.model_type in ["bert", "roberta", "ernie"]: - self._model = BiEncoderModel.from_pretrained( - model_name_or_path=self.model, - normalized=True, - sentence_pooling_method="cls", + self._model = BiEncoderModel( + model_name_or_path=self.model, normalized=True, sentence_pooling_method="cls", dtype="float32" ) print(f"loading checkpoints {self.model}") else: @@ -138,7 +130,7 @@ def _run_model(self, inputs, **kwargs): with paddle.no_grad(): for batch_inputs in inputs["batches"]: batch_inputs = self._collator(batch_inputs) - token_embeddings = self._model.encode(batch_inputs) + token_embeddings = self._model.encode(batch_inputs, self._model.model) all_feats.append(token_embeddings.detach().cpu().numpy()) return all_feats diff --git a/slm/pipelines/examples/contrastive_training/requirements.txt b/slm/pipelines/examples/contrastive_training/requirements.txt index 67d35be1ac5b..af6ff0e518f2 100644 --- a/slm/pipelines/examples/contrastive_training/requirements.txt +++ b/slm/pipelines/examples/contrastive_training/requirements.txt @@ -1,2 +1,4 @@ mteb==1.12.80 -typer==0.9.0 \ No newline at end of file +typer==0.9.0 +h11==0.16.0 +requests[socks] \ No newline at end of file diff --git a/slm/pipelines/examples/contrastive_training/shortgpt_prune.py b/slm/pipelines/examples/contrastive_training/shortgpt_prune.py new file mode 100644 index 000000000000..3ad30180688d --- /dev/null +++ b/slm/pipelines/examples/contrastive_training/shortgpt_prune.py @@ -0,0 +1,355 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import math +import os +import re +from collections import OrderedDict +from typing import List + +import paddle +from datasets import load_dataset +from paddle.io import DataLoader +from tqdm import tqdm + +from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer, NVEncodeModel + + +# ===================================================================================== +# 1. block_influence +# ===================================================================================== +def block_influence( + input_hidden_state: paddle.Tensor, + output_hidden_state: paddle.Tensor, + angular: bool = False, +) -> paddle.Tensor: + """ + Calculates block influence between input and output hidden states. 
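+
+    For every token position, the cosine similarity between the layer's input and
+    output hidden state is computed; the influence is 1 - cosine similarity, or
+    arccos(sim) / pi when ``angular`` is True. A low score means the layer changes
+    its input very little, which marks it as a candidate for pruning.
+
+    Args:
+        input_hidden_state: Hidden states entering the block, shape [batch, seq_len, hidden_dim].
+        output_hidden_state: Hidden states produced by the block, same shape.
+        angular: If True, return the angular distance instead of 1 - cosine similarity.
+
+    Returns:
+        A 1-D tensor with one influence value per token position.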
+ """ + _, _, d = input_hidden_state.shape + input_hidden_state = paddle.reshape(input_hidden_state, [-1, d]) + output_hidden_state = paddle.reshape(output_hidden_state, [-1, d]) + + norm_input = paddle.norm(input_hidden_state, p=2, axis=-1, keepdim=True) + norm_output = paddle.norm(output_hidden_state, p=2, axis=-1, keepdim=True) + + sim = paddle.matmul(input_hidden_state, output_hidden_state, transpose_y=True) / (norm_input * norm_output) + sim = paddle.diag(sim).astype("float32").nan_to_num(nan=0.5) + + if angular: + return paddle.acos(sim) / math.pi + return 1 - sim + + +# ===================================================================================== +# 2. ShortGPT +# ===================================================================================== +class ShortGPT: + """ + A class to evaluate layer importance in LLMs using PaddlePaddle. + """ + + def __init__(self, model_name: str, layers_path: str): + print(f"Loading tokenizer for '{model_name}'...") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.tokenizer.pad_token = self.tokenizer.eos_token + + print(f"Loading model '{model_name}' with PaddlePaddle backend...") + if "NV-Embed" in model_name: + self.model = NVEncodeModel.from_pretrained( + model_name, tokenizer_path=model_name, query_instruction="", document_instruction="" + ) + else: + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype=paddle.float16) + + self.model.eval() + print("Model loaded successfully for importance evaluation.") + + try: + path_parts = layers_path.split(".") # e.g., 'llama.layers' -> ['llama', 'layers'] + + self.base_model_for_call = self.model + # 遍历路径中除了最后 'layers' 之外的部分 (e.g., 'llama') + for part in path_parts[:-1]: + self.base_model_for_call = getattr(self.base_model_for_call, part) + + # 从基础模型中获取 'layers' 列表 + self.layers = getattr(self.base_model_for_call, path_parts[-1]) + print(f"Successfully located base model for evaluation call: {type(self.base_model_for_call)}") + print(f"Successfully located {len(self.layers)} layers.") + + except AttributeError: + raise AttributeError(f"Could not find layers at path '{layers_path}' in the model architecture.") + + self.importances = [0.0 for _ in self.layers] + + def compute_bi(self, hiddens: List[paddle.Tensor]): + """ + Computes and accumulates block influence scores from hidden states. + """ + n = 1 + for i in range(len(hiddens) - n): + layer_index = i + if layer_index < len(self.importances): + in_hidden = hiddens[i] + out_hidden = hiddens[i + n] + self.importances[layer_index] += block_influence(in_hidden, out_hidden).sum().item() + + @paddle.no_grad() + def eval_importance(self, prompts: List[str], model_name: str, stride: int = 256): + """ + Evaluates the importance of model layers on given prompts. 
+ """ + prompt_tokens = self.tokenizer(prompts, padding=True, return_attention_mask=True, return_tensors="pd") + input_ids = prompt_tokens.input_ids + attn_mask = prompt_tokens.attention_mask + + max_prompt_len = max(len(t) for t in input_ids) + + for start in range(0, max_prompt_len, stride): + seq_ids = (attn_mask.sum(axis=-1) > start).nonzero().squeeze() + seq_ids = seq_ids.unsqueeze(0) if seq_ids.ndim == 0 else seq_ids + + if seq_ids.shape[0] == 0: + continue + + inputs = input_ids[seq_ids, start : start + stride] + attn = attn_mask[seq_ids, start : start + stride] + + if "NV-Embed" in model_name: + outputs = self.base_model_for_call.m_forward( + input_ids=inputs, attention_mask=attn, output_hidden_states=True, return_dict=True + ) + else: + outputs = self.base_model_for_call( + input_ids=inputs, attention_mask=attn, output_hidden_states=True, return_dict=True + ) + + if outputs.hidden_states: + self.compute_bi(outputs.hidden_states) + + +def load_model_weights(model_folder_path: str) -> OrderedDict: + print(f"Attempting to load model weights from FOLDER: '{model_folder_path}'...") + + # 1. Ensure the path is a valid directory + if not os.path.isdir(model_folder_path): + raise NotADirectoryError(f"The provided path is not a valid directory: '{model_folder_path}'") + + state_dict = OrderedDict() + index_path = os.path.join(model_folder_path, "model_state.pdparams.index.json") + + # 2. Check for the presence of a sharded index file + if os.path.isfile(index_path): + # Case A: Sharded model format detected (index file found) + print("Sharded model format detected (index file found).") + with open(index_path, "r", encoding="utf-8") as f: + index_data = json.load(f) + + shard_files = sorted(list(set(index_data["weight_map"].values()))) + print(f"Found {len(shard_files)} shard(s).") + + for shard_file in shard_files: + shard_path = os.path.join(model_folder_path, shard_file) + if not os.path.exists(shard_path): + raise FileNotFoundError(f"Shard file '{shard_file}' listed in index not found at '{shard_path}'") + + print(f" > Loading shard: {shard_file}") + shard_state_dict = paddle.load(shard_path, return_numpy=True) + state_dict.update(shard_state_dict) + del shard_state_dict + print("All weight shards loaded successfully.") + + else: + # Case B: No index file found; look for a single .pdparams file + print("No index file found. Searching for a single .pdparams file inside the folder...") + pdparams_files = [f for f in os.listdir(model_folder_path) if f.endswith(".pdparams")] + + if len(pdparams_files) == 1: + # Found exactly one .pdparams file + single_file_path = os.path.join(model_folder_path, pdparams_files[0]) + print(f" > Loading single parameters file: {pdparams_files[0]}") + state_dict = paddle.load(single_file_path, return_numpy=True) + print("Single weight file loaded successfully.") + elif len(pdparams_files) > 1: + raise ValueError( + f"Ambiguous model files. Multiple .pdparams files found in '{model_folder_path}' " + "but no 'model_state.pdparams.index.json' to specify order." + ) + else: # len(pdparams_files) == 0 + raise FileNotFoundError(f"No .pdparams files found in the directory '{model_folder_path}'.") + + return state_dict + + +# ===================================================================================== +# 3. 
Prune and Save +# ===================================================================================== +def prune_and_save_model_in_memory( + model, + tokenizer, + new_model_path, + layers_to_delete, + layers_path_str, +): + """ + Prunes and saves a model directly from the in-memory model object. + """ + print("=" * 50) + print("PART 2: Starting In-Memory Model Pruning and Saving") + print("=" * 50) + os.makedirs(new_model_path, exist_ok=True) + + # Step 1: Get state_dict directly from the in-memory model + print("Getting state_dict directly from the in-memory model...") + state_dict = model.state_dict() + + # Step 2: Iterate, filter, and rename weights + print("Processing weights: removing specified layers and re-indexing...") + escaped_layers_path = layers_path_str.replace(".", r"\.") + layer_pattern = re.compile(rf"^{escaped_layers_path}\.(\d+)\.") + new_state_dict = OrderedDict() + + for key, value in state_dict.items(): + match = layer_pattern.match(key) + if not match: + new_state_dict[key] = value + continue + + layer_idx = int(match.group(1)) + if layer_idx in layers_to_delete: + continue + + num_layers_deleted_before = sum(1 for deleted_idx in layers_to_delete if deleted_idx < layer_idx) + new_layer_idx = layer_idx - num_layers_deleted_before + old_prefix = f"{layers_path_str}.{layer_idx}." + new_prefix = f"{layers_path_str}.{new_layer_idx}." + new_key = key.replace(old_prefix, new_prefix, 1) + new_state_dict[new_key] = value + + print(f"Processing complete. Removed {len(layers_to_delete)} layer(s): {sorted(list(layers_to_delete))}.") + + # Step 3: Get and modify the configuration from the model object + print("Updating configuration file...") + config = model.config.to_dict() + + # Fix: Convert non-serializable paddle data types to strings + for key, value in config.items(): + if type(value).__name__ == "DataType": + config[key] = str(value).split(".")[-1] + + if "num_hidden_layers" in config: + original_num_layers = config["num_hidden_layers"] + new_num_layers = original_num_layers - len(layers_to_delete) + config["num_hidden_layers"] = new_num_layers + print(f" - Number of layers changed from {original_num_layers} to {new_num_layers}.") + + new_config_path = os.path.join(new_model_path, "config.json") + with open(new_config_path, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + print(f"New config saved to '{new_config_path}'.") + + # Step 4: Save the new weights and tokenizer + print("Saving pruned weights...") + new_weights_path = os.path.join(new_model_path, "model_state.pdparams") + paddle.save(new_state_dict, new_weights_path) + print(f"Pruned weights saved to '{new_weights_path}'.") + + print("Saving tokenizer files...") + tokenizer.save_pretrained(new_model_path) + print(f"Tokenizer files saved to '{new_model_path}'.") + + print("\n🎉 Pruning process completed successfully!") + print(f"Pruned model has been saved to '{new_model_path}'") + + +def main(): + parser = argparse.ArgumentParser( + description="Calculate layer importance, prune, and save a new PaddlePaddle model." + ) + parser.add_argument( + "--model_name_or_path", + type=str, + required=True, + help="Path or HuggingFace name of the source PaddlePaddle model.", + ) + parser.add_argument( + "--output_model_path", type=str, required=True, help="Path to save the new, pruned model directory." + ) + parser.add_argument( + "--layers_path", type=str, required=True, help="Dot-separated path to the layers list (e.g., 'llama.layers')." 
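+        # e.g. "llama.layers" for repllama, "model.layers" for llama (see README)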
+ ) + parser.add_argument( + "--n_prune_layers", type=int, required=True, help="The number of layers to identify and prune." + ) + parser.add_argument( + "--dataset_name", + type=str, + default="emozilla/pg19", + help="Name of the Hugging Face dataset for calibration. Default: 'emozilla/pg19'.", + ) + parser.add_argument( + "--dataset_split", + type=str, + default="validation", + help="The split of the dataset to use. Default: 'validation'.", + ) + args = parser.parse_args() + + # --- PART 1: Calculate Layer Importance --- + print("=" * 50) + print("PART 1: Calculating Layer Importance") + print("=" * 50) + print(f"Loading '{args.dataset_split}' split from '{args.dataset_name}' dataset for calibration...") + try: + data = load_dataset(args.dataset_name, split=args.dataset_split) + except Exception as e: + print(f"Failed to load dataset. Error: {e}") + print( + "Please ensure the dataset name and split are correct and you have internet access for Hugging Face datasets." + ) + return + + dataloader = DataLoader(data, batch_size=1, shuffle=False) + + short_model = ShortGPT(model_name=args.model_name_or_path, layers_path=args.layers_path) + + for batch in tqdm(dataloader, desc="Evaluating Layer Importance"): + if "text" not in batch: + raise ValueError("Dataset must contain a 'text' column.") + prompts = batch["text"] + short_model.eval_importance(prompts=prompts, model_name=args.model_name_or_path, stride=256) + + prune_order = sorted(range(len(short_model.importances)), key=lambda i: short_model.importances[i]) + layers_to_delete = set(prune_order[: args.n_prune_layers]) + + print("\n--- Importance Calculation Complete ---") + print(f"Calculated importances: {[f'{v:.2f}' for v in short_model.importances]}") + print(f"Pruning order (least to most important): {prune_order}") + print(f"Will delete the {args.n_prune_layers} least important layers: {sorted(list(layers_to_delete))}") + + # --- PART 2: Perform In-Memory Pruning and Saving --- + prune_and_save_model_in_memory( + model=short_model.model, + tokenizer=short_model.tokenizer, + new_model_path=args.output_model_path, + layers_to_delete=layers_to_delete, + layers_path_str=args.layers_path, + ) + + +if __name__ == "__main__": + main() diff --git a/slm/pipelines/examples/contrastive_training/train.py b/slm/pipelines/examples/contrastive_training/train.py index c18263dff95d..40ada0c70a07 100644 --- a/slm/pipelines/examples/contrastive_training/train.py +++ b/slm/pipelines/examples/contrastive_training/train.py @@ -109,6 +109,7 @@ def main(): use_inbatch_neg=training_args.use_inbatch_neg, matryoshka_dims=training_args.matryoshka_dims if training_args.use_matryoshka else None, matryoshka_loss_weights=training_args.matryoshka_loss_weights if training_args.use_matryoshka else None, + dtype=dtype, ) if training_args.fix_position_embedding: @@ -129,13 +130,13 @@ def main(): if training_args.fine_tune_type == "lora": if any([x in model_args.model_name_or_path for x in ["llama", "baichuan", "NV-Embed"]]): target_modules = [ - ".*q_proj.*", - ".*k_proj.*", - ".*v_proj.*", - ".*o_proj.*", - ".*down_proj.*", - ".*up_proj.*", - ".*gate_proj.*", + ".*q_proj$", + ".*k_proj$", + ".*v_proj$", + ".*o_proj$", + ".*down_proj$", + ".*up_proj$", + ".*gate_proj$", ] else: raise ValueError("need to specify the target modules for LoRA fine-tuning.")