
[New Sample] Add Qwen Series Model Computational Graph #94


Open
fangfangssj wants to merge 1 commit into develop

Conversation

@fangfangssj (Contributor) commented Aug 5, 2025

PR Category

New Sample

Description

Add the Qwen series of models; dynamic=False is set during extraction (a short note on this flag follows the script below).

Extraction script:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from graph_net.torch.extractor import extract
import os
import subprocess
import shutil
import traceback
import time
import glob
import logging

def run_nlp_model_full_graph(model_name: str, device_str: str) -> int:
    """Build the model from config, run one forward pass under the extractor, and return 1 on success, 0 on failure."""
    device = torch.device(device_str)
    print(f"\nTesting NLP model: {model_name} on {device_str}")

    try:
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token

        text = "Hello, my name is Bob. I am learning about large language models and their architectures. "
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        input_data = {key: val.to(device) for key, val in inputs.items()}
        
        # Disable the KV cache so the forward pass is traced as a single static graph.
        input_data['use_cache'] = False

        # dynamic=False: extract a static-shape graph for these fixed inputs.
        wrapped = extract(name=model_name, dynamic=False)(model).eval()
        with torch.no_grad():
            wrapped(**input_data)
        
        print(f"[OK] {model_name}: Full graph extracted.")
        return 1

    except Exception as e:
        print(f"[FAIL] {model_name}: extract error - {e}")
        return 0

# run_nlp_model_full_graph("Qwen/Qwen2.5-0.5B", "cuda:0" if torch.cuda.is_available() else "cpu")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("model_processing.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

GRAPH_NET_EXTRACT_WORKSPACE = os.getenv("GRAPH_NET_EXTRACT_WORKSPACE", "./workspace")
SUCCESS_MODELS_DIR = os.getenv("SUCCESS_MODELS_DIR", "./validated_models")
TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))

os.makedirs(SUCCESS_MODELS_DIR, exist_ok=True)
os.makedirs(GRAPH_NET_EXTRACT_WORKSPACE, exist_ok=True)

# Models to test
MODEL_LIST = [
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-3B",
    "Qwen/Qwen2.5-7B",
    "Qwen/Qwen2.5-14B",
    "Qwen/Qwen2.5-32B",
]

def clear_transformers_cache():
    """清理transformers缓存目录"""
    logger.info(f"Cleaning transformers cache: {TRANSFORMERS_CACHE}")
    
    if not os.path.exists(TRANSFORMERS_CACHE):
        logger.warning(f"Cache directory does not exist: {TRANSFORMERS_CACHE}")
        return
    
    # Delete everything inside the cache directory
    for item in os.listdir(TRANSFORMERS_CACHE):
        item_path = os.path.join(TRANSFORMERS_CACHE, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except Exception as e:
            logger.error(f"Failed to delete {item_path}. Reason: {e}")
    
    logger.info("Transformers cache cleared successfully")

def process_model(model_name):
    """处理单个模型的完整流程"""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_dir = os.path.join(GRAPH_NET_EXTRACT_WORKSPACE, model_name)
    
    try:
        logger.info(f"Starting processing for: {model_name}")
        
        # Step 1: run model extraction
        result = run_nlp_model_full_graph(model_name, device)
        
        if result == 1:
            logger.info(f"Extraction successful for {model_name}")
            
            # Step 2: run the validation command
            validation_cmd = [
                "python", "-m", "graph_net.torch.validate",
                "--model-path", model_dir
            ]
            
            logger.info(f"Validating model: {' '.join(validation_cmd)}")
            
            try:
                val_result = subprocess.run(
                    validation_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=600
                )
                
                if val_result.returncode == 0:
                    # Copy the validated graph under the model's short name (e.g. "Qwen2.5-0.5B").
                    short_name = model_name.split("/")[-1]
                    dest_path = os.path.join(SUCCESS_MODELS_DIR, short_name)
                    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
                    if os.path.exists(dest_path):
                        shutil.rmtree(dest_path)
                    
                    shutil.copytree(model_dir, dest_path)
                    
                    logger.info(f"Model validated and copied to {dest_path}")
                    return "SUCCESS", "Extraction and validation successful"
                else:
                    error_msg = f"Validation failed (code {val_result.returncode}):\n{val_result.stderr}"
                    logger.error(error_msg)
                    return "VALIDATION_FAILED", error_msg
                    
            except subprocess.TimeoutExpired:
                error_msg = "Validation command timed out (10 minutes)"
                logger.error(error_msg)
                return "VALIDATION_TIMEOUT", error_msg
                
            except Exception as e:
                error_msg = f"Validation command failed: {str(e)}\n{traceback.format_exc()}"
                logger.error(error_msg)
                return "VALIDATION_ERROR", error_msg
                
        else:
            error_msg = "Extraction returned failure code"
            logger.error(error_msg)
            return "EXTRACTION_FAILED", error_msg
            
    except Exception as e:
        error_msg = f"Critical processing error: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return "PROCESSING_ERROR", error_msg

def main():
    success_models = []
    failed_models = []
    
    logger.info("=" * 60)
    logger.info(f"Starting model processing for {len(MODEL_LIST)} models")
    logger.info("=" * 60)
    
    for i, model in enumerate(MODEL_LIST, 1):
        logger.info(f"\n{'=' * 50}")
        logger.info(f"Processing model {i}/{len(MODEL_LIST)}: {model}")
        logger.info(f"{'=' * 50}")
        
        start_time = time.time()
        
        status, message = process_model(model)
        duration = time.time() - start_time
        result = {
            "model": model,
            "status": status,
            "message": message,
            "duration": f"{duration:.2f} seconds"
        }
        
        if status == "SUCCESS":
            success_models.append(result)
            logger.info(f"✅ SUCCESS: {model} ({duration:.2f}s)")
        else:
            failed_models.append(result)
            logger.error(f"❌ FAILED: {model} - {status} ({duration:.2f}s)")
        
        logger.info("Cleaning up resources...")
        clear_transformers_cache()
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.info("CUDA cache cleared")
        
        logger.info(f"Cleanup completed for {model}")
    
    with open("success_models.txt", "w") as f:
        for result in success_models:
            f.write(f"{result['model']}\n")
    
    with open("failed_models.txt", "w") as f:
        for result in failed_models:
            f.write(f"{result['model']}: {result['status']}\n")
            f.write(f"Duration: {result['duration']}\n")
            f.write(f"Error: {result['message']}\n")
            f.write("-" * 80 + "\n")
    
    logger.info("\n" + "=" * 60)
    logger.info("Processing complete!")
    logger.info(f"Success: {len(success_models)} models")
    logger.info(f"Failed: {len(failed_models)} models")
    logger.info(f"Results saved to: model_processing.log")
    logger.info(f"Success models list: success_models.txt")
    logger.info(f"Failed models details: failed_models.txt")
    logger.info("=" * 60)

if __name__ == "__main__":
    main()
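
A note on the dynamic=False setting used in the extract call above: the assumption here (the extractor's documentation is not quoted in this PR) is that it makes the tracer capture a static-shape graph specialized to the given inputs rather than a shape-polymorphic one, analogous to the dynamic flag of torch.compile. A minimal sketch of that analogy:

import torch

# Analogy only: torch.compile's dynamic flag controls whether input shapes are
# treated as symbolic. With dynamic=False the captured graph is specialized to
# the concrete shapes seen at call time, which is what the extraction script
# relies on by feeding a single fixed-length prompt.
model = torch.nn.Linear(8, 8).eval()
compiled = torch.compile(model, dynamic=False)
with torch.no_grad():
    compiled(torch.randn(2, 8))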

Cause of the CI failure: the generated hashes are identical. The leading (head) layers of the Qwen series share the same structure. The first extraction captures the complete computational graph, but during the second extraction tracing hits torch._C._functorch.PyCapsule._vmap_increment_nesting, a function torch cannot trace, so graph capture stops there. The hash is then computed from the portion traced before the break, and since that prefix is identical across the Qwen series, every model ends up with the same hash.
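
To make the collision mechanism concrete, here is a rough sketch (the hashing below is a stand-in for illustration only, not the actual graph_net hashing code): because tracing aborts at the same untraceable call for every Qwen model, the traced prefixes are byte-for-byte identical, and identical inputs to any hash function necessarily produce identical digests.

import hashlib

# Hypothetical textual dumps of the graphs captured on the second extraction.
# Tracing stops at torch._C._functorch.PyCapsule._vmap_increment_nesting, so
# only the shared Qwen head structure is recorded, regardless of model size.
partial_graph_qwen25_0_5b = "embed_tokens -> rotary_emb -> <trace aborted>"
partial_graph_qwen25_7b = "embed_tokens -> rotary_emb -> <trace aborted>"

h1 = hashlib.sha256(partial_graph_qwen25_0_5b.encode()).hexdigest()
h2 = hashlib.sha256(partial_graph_qwen25_7b.encode()).hexdigest()
assert h1 == h2  # identical traced prefixes -> identical hashes -> CI flags duplicates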


paddle-bot bot commented Aug 5, 2025

Thanks for your contribution!

@paddle-bot bot added the contributor (External developers) label Aug 5, 2025
@fangfangssj closed this Aug 6, 2025
@fangfangssj reopened this Aug 6, 2025