+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import json
 import math
 import os
 import re
 from collections import OrderedDict
-from shutil import copyfile
-from typing import List, Optional
+from typing import List

-import numpy as np
 import paddle
 from datasets import load_dataset
 from paddle.io import DataLoader
-from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer, NVEncodeModel
 from tqdm import tqdm

+from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer, NVEncodeModel
+
+
 # =====================================================================================
 # 1. block_influence
 # =====================================================================================
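For context on the scoring in the hunk below: block influence is one minus the row-wise cosine similarity between a layer's input and output hidden states, so a layer that barely transforms its input scores near zero and becomes a pruning candidate. A minimal standalone sketch with toy tensors (not part of this diff; names are illustrative):

```python
import paddle

h_in = paddle.rand([4, 8])   # toy hidden states entering a layer (4 tokens, dim 8)
h_out = paddle.rand([4, 8])  # toy hidden states leaving the same layer

norm_in = paddle.norm(h_in, p=2, axis=-1, keepdim=True)
norm_out = paddle.norm(h_out, p=2, axis=-1, keepdim=True)

# Pairwise cosine similarities; matching input/output pairs sit on the diagonal.
sim = paddle.matmul(h_in, h_out, transpose_y=True) / (norm_in * norm_out)
sim = paddle.diag(sim)

bi = 1 - sim  # higher score -> the layer changed its input more
print(bi.shape)  # [4], one score per token position
```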
@@ -33,19 +47,21 @@ def block_influence(
     norm_output = paddle.norm(output_hidden_state, p=2, axis=-1, keepdim=True)

     sim = paddle.matmul(input_hidden_state, output_hidden_state, transpose_y=True) / (norm_input * norm_output)
-    sim = paddle.diag(sim).astype('float32').nan_to_num(nan=0.5)
+    sim = paddle.diag(sim).astype("float32").nan_to_num(nan=0.5)

     if angular:
         return paddle.acos(sim) / math.pi
     return 1 - sim

+
 # =====================================================================================
-# 2. ShortGPT 
+# 2. ShortGPT
 # =====================================================================================
 class ShortGPT:
     """
     A class to evaluate layer importance in LLMs using PaddlePaddle.
     """
+
     def __init__(self, model_name: str, layers_path: str):
         print(f"Loading tokenizer for '{model_name}'...")
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -54,36 +70,30 @@ def __init__(self, model_name: str, layers_path: str):
         print(f"Loading model '{model_name}' with PaddlePaddle backend...")
         if "NV-Embed" in model_name:
             self.model = NVEncodeModel.from_pretrained(
-                model_name,
-                tokenizer_path=model_name,
-                query_instruction="",
-                document_instruction=""
-            )
-        else:
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                dtype=paddle.float16
+                model_name, tokenizer_path=model_name, query_instruction="", document_instruction=""
             )
-
+        else:
+            self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype=paddle.float16)
+
         self.model.eval()
         print("Model loaded successfully for importance evaluation.")
-
+
         try:
-            path_parts = layers_path.split('.')  # e.g., 'llama.layers' -> ['llama', 'layers']
+            path_parts = layers_path.split(".")  # e.g., 'llama.layers' -> ['llama', 'layers']

             self.base_model_for_call = self.model
             # Walk each part of the path except the trailing 'layers' (e.g., 'llama')
             for part in path_parts[:-1]:
                 self.base_model_for_call = getattr(self.base_model_for_call, part)
-
+
             # Fetch the 'layers' list from the base model
             self.layers = getattr(self.base_model_for_call, path_parts[-1])
             print(f"Successfully located base model for evaluation call: {type(self.base_model_for_call)}")
             print(f"Successfully located {len(self.layers)} layers.")

         except AttributeError:
             raise AttributeError(f"Could not find layers at path '{layers_path}' in the model architecture.")
-
+
         self.importances = [0.0 for _ in self.layers]

     def compute_bi(self, hiddens: List[paddle.Tensor]):
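For reviewers, a self-contained toy version of what `compute_bi` does with the offset `n` (assumed to be 1 here; the loop head sits outside the hunk below): `hiddens[i]` is the state entering layer `i`, `hiddens[i + n]` the state leaving it, and the summed distance is accumulated per layer:

```python
from typing import List

import paddle


def accumulate_bi(hiddens: List[paddle.Tensor], importances: List[float], n: int = 1) -> None:
    # hiddens[i] enters layer i, hiddens[i + n] leaves it; accumulate 1 - cosine.
    for i in range(len(hiddens) - n):
        if i < len(importances):
            cos = paddle.nn.functional.cosine_similarity(hiddens[i], hiddens[i + n], axis=-1)
            importances[i] += float((1 - cos).sum())


# 3 layers -> 4 hidden states (embedding output plus one per layer).
states = [paddle.rand([2, 5, 8]) for _ in range(4)]
scores = [0.0, 0.0, 0.0]
accumulate_bi(states, scores)
print(scores)
```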
@@ -95,20 +105,15 @@ def compute_bi(self, hiddens: List[paddle.Tensor]):
             layer_index = i
             if layer_index < len(self.importances):
                 in_hidden = hiddens[i]
-                out_hidden = hiddens[i+n]
-                self.importances[layer_index] += block_influence(
-                    in_hidden,
-                    out_hidden
-                ).sum().item()
+                out_hidden = hiddens[i + n]
+                self.importances[layer_index] += block_influence(in_hidden, out_hidden).sum().item()

     @paddle.no_grad()
     def eval_importance(self, prompts: List[str], model_name: str, stride: int = 256):
         """
         Evaluates the importance of model layers on given prompts.
         """
-        prompt_tokens = self.tokenizer(
-            prompts, padding=True, return_attention_mask=True, return_tensors='pd'
-        )
+        prompt_tokens = self.tokenizer(prompts, padding=True, return_attention_mask=True, return_tensors="pd")
         input_ids = prompt_tokens.input_ids
         attn_mask = prompt_tokens.attention_mask

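The windowing in the next hunk deserves a note: prompts are scored in `stride`-sized windows, and at each offset only the sequences that still have real tokens (`attn_mask.sum() > start`) are kept. A runnable sketch of just the selection logic, with a toy mask:

```python
import paddle

attn_mask = paddle.to_tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]])  # two padded sequences
stride = 4

for start in range(0, attn_mask.shape[1], stride):
    # Keep only sequences that still have unpadded tokens at this offset.
    seq_ids = (attn_mask.sum(axis=-1) > start).nonzero().squeeze()
    seq_ids = seq_ids.unsqueeze(0) if seq_ids.ndim == 0 else seq_ids
    print(start, seq_ids.numpy())  # offset 0 keeps both rows; offset 4 keeps only row 0
```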
@@ -117,32 +122,27 @@ def eval_importance(self, prompts: List[str], model_name: str, stride: int = 256):
         for start in range(0, max_prompt_len, stride):
             seq_ids = (attn_mask.sum(axis=-1) > start).nonzero().squeeze()
             seq_ids = seq_ids.unsqueeze(0) if seq_ids.ndim == 0 else seq_ids
-
+
             if seq_ids.shape[0] == 0:
                 continue

-            inputs = input_ids[seq_ids, start:start+stride]
-            attn = attn_mask[seq_ids, start:start+stride]
+            inputs = input_ids[seq_ids, start : start + stride]
+            attn = attn_mask[seq_ids, start : start + stride]

             if "NV-Embed" in model_name:
                 outputs = self.base_model_for_call.m_forward(
-                    input_ids=inputs,
-                    attention_mask=attn,
-                    output_hidden_states=True,
-                    return_dict=True
-                )
+                    input_ids=inputs, attention_mask=attn, output_hidden_states=True, return_dict=True
+                )
             else:
                 outputs = self.base_model_for_call(
-                    input_ids=inputs,
-                    attention_mask=attn,
-                    output_hidden_states=True,
-                    return_dict=True
+                    input_ids=inputs, attention_mask=attn, output_hidden_states=True, return_dict=True
                 )
-
+
             if outputs.hidden_states:
                 self.compute_bi(outputs.hidden_states)
-
-def load_model_weights(model_folder_path: str) -> OrderedDict:
+
+
+def load_model_weights(model_folder_path: str) -> OrderedDict:
     print(f"Attempting to load model weights from FOLDER: '{model_folder_path}'...")

     # 1. Ensure the path is a valid directory
@@ -156,7 +156,7 @@ def load_model_weights(model_folder_path: str) -> OrderedDict:
     if os.path.isfile(index_path):
         # Case A: Sharded model format detected (index file found)
         print("Sharded model format detected (index file found).")
-        with open(index_path, 'r', encoding='utf-8') as f:
+        with open(index_path, "r", encoding="utf-8") as f:
             index_data = json.load(f)

         shard_files = sorted(list(set(index_data["weight_map"].values())))
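For context, the index file consumed above is a small JSON map from parameter names to shard files; deduplicating its values yields the shard list. A sketch with illustrative file names:

```python
import json

# Illustrative contents of a model_state.pdparams.index.json file.
index_json = '{"weight_map": {"llama.layers.0.mlp.w": "model_state-00001.pdparams", "lm_head.weight": "model_state-00002.pdparams"}}'
index_data = json.loads(index_json)

shard_files = sorted(set(index_data["weight_map"].values()))
print(shard_files)  # ['model_state-00001.pdparams', 'model_state-00002.pdparams']
```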
@@ -190,9 +190,7 @@ def load_model_weights(model_folder_path: str) -> OrderedDict:
             "but no 'model_state.pdparams.index.json' to specify order."
         )
     else:  # len(pdparams_files) == 0
-        raise FileNotFoundError(
-            f"No .pdparams files found in the directory '{model_folder_path}'."
-        )
+        raise FileNotFoundError(f"No .pdparams files found in the directory '{model_folder_path}'.")

     return state_dict

@@ -210,9 +208,9 @@ def prune_and_save_model_in_memory(
     """
     Prunes and saves a model directly from the in-memory model object.
     """
-    print("="*50)
+    print("=" * 50)
     print("PART 2: Starting In-Memory Model Pruning and Saving")
-    print("="*50)
+    print("=" * 50)
     os.makedirs(new_model_path, exist_ok=True)

     # Step 1: Get state_dict directly from the in-memory model
@@ -276,41 +274,63 @@ def prune_and_save_model_in_memory(

     print("\n🎉 Pruning process completed successfully!")
     print(f"Pruned model has been saved to '{new_model_path}'")
-


 def main():
     parser = argparse.ArgumentParser(
         description="Calculate layer importance, prune, and save a new PaddlePaddle model."
     )
-    parser.add_argument("--model_name_or_path", type=str, required=True, help="Path or HuggingFace name of the source PaddlePaddle model.")
-    parser.add_argument("--output_model_path", type=str, required=True, help="Path to save the new, pruned model directory.")
-    parser.add_argument("--layers_path", type=str, required=True, help="Dot-separated path to the layers list (e.g., 'llama.layers').")
-    parser.add_argument("--n_prune_layers", type=int, required=True, help="The number of layers to identify and prune.")
-    parser.add_argument("--dataset_name", type=str, default="emozilla/pg19", help="Name of the Hugging Face dataset for calibration. Default: 'emozilla/pg19'.")
-    parser.add_argument("--dataset_split", type=str, default="validation", help="The split of the dataset to use. Default: 'validation'.")
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        required=True,
+        help="Path or HuggingFace name of the source PaddlePaddle model.",
+    )
+    parser.add_argument(
+        "--output_model_path", type=str, required=True, help="Path to save the new, pruned model directory."
+    )
+    parser.add_argument(
+        "--layers_path", type=str, required=True, help="Dot-separated path to the layers list (e.g., 'llama.layers')."
+    )
+    parser.add_argument(
+        "--n_prune_layers", type=int, required=True, help="The number of layers to identify and prune."
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default="emozilla/pg19",
+        help="Name of the Hugging Face dataset for calibration. Default: 'emozilla/pg19'.",
+    )
+    parser.add_argument(
+        "--dataset_split",
+        type=str,
+        default="validation",
+        help="The split of the dataset to use. Default: 'validation'.",
+    )
     args = parser.parse_args()

     # --- PART 1: Calculate Layer Importance ---
-    print("="*50)
+    print("=" * 50)
     print("PART 1: Calculating Layer Importance")
-    print("="*50)
+    print("=" * 50)
     print(f"Loading '{args.dataset_split}' split from '{args.dataset_name}' dataset for calibration...")
     try:
         data = load_dataset(args.dataset_name, split=args.dataset_split)
     except Exception as e:
         print(f"Failed to load dataset. Error: {e}")
-        print("Please ensure the dataset name and split are correct and you have internet access for Hugging Face datasets.")
+        print(
+            "Please ensure the dataset name and split are correct and you have internet access for Hugging Face datasets."
+        )
         return
-
+
     dataloader = DataLoader(data, batch_size=1, shuffle=False)
-
+
     short_model = ShortGPT(model_name=args.model_name_or_path, layers_path=args.layers_path)
-
+
     for batch in tqdm(dataloader, desc="Evaluating Layer Importance"):
-        if 'text' not in batch:
+        if "text" not in batch:
             raise ValueError("Dataset must contain a 'text' column.")
-        prompts = batch['text']
+        prompts = batch["text"]
         short_model.eval_importance(prompts=prompts, model_name=args.model_name_or_path, stride=256)

     prune_order = sorted(range(len(short_model.importances)), key=lambda i: short_model.importances[i])
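Between these two hunks (unchanged, so not shown) the lowest-scoring layers are selected from `prune_order`. A toy illustration of that selection, assuming `layers_to_delete` is taken from the first `n_prune_layers` entries:

```python
importances = [0.42, 0.07, 0.31, 0.05]  # toy accumulated BI totals per layer
n_prune_layers = 2

prune_order = sorted(range(len(importances)), key=lambda i: importances[i])
layers_to_delete = sorted(prune_order[:n_prune_layers])
print(prune_order)       # [3, 1, 2, 0] -> least important first
print(layers_to_delete)  # [1, 3]
```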
@@ -327,8 +347,9 @@ def main():
         tokenizer=short_model.tokenizer,
         new_model_path=args.output_model_path,
         layers_to_delete=layers_to_delete,
-        layers_path_str=args.layers_path
+        layers_path_str=args.layers_path,
     )

+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
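With the new arguments in place, a typical invocation (script name and paths are illustrative) would be `python prune_layers.py --model_name_or_path <model> --layers_path llama.layers --n_prune_layers 4 --output_model_path ./pruned_model`, which scores layers on the pg19 validation split by default and then writes the pruned copy.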