
[New Sample] Add Qwen Series Model Computational Graph #94


Open
fangfangssj wants to merge 1 commit into develop

Conversation

@fangfangssj (Contributor) commented Aug 5, 2025

PR Category

New Sample

Description

Add the Qwen series of models; dynamic=False is set during extraction (a short note on this flag follows the script below).

Extraction script:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from graph_net.torch.extractor import extract
import os
import subprocess
import shutil
import traceback
import time
import glob
import logging

def run_nlp_model_full_graph(model_name: str, device_str: str) -> int:
    """Build the model from config, run one forward pass under the extractor, and return 1 on success, 0 on failure."""
    device = torch.device(device_str)
    print(f"\nTesting NLP model: {model_name} on {device_str}")

    try:
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token

        text = "Hello, my name is Bob. I am learning about large language models and their architectures. "
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        input_data = {key: val.to(device) for key, val in inputs.items()}
        
        # Disable the KV cache so the forward pass is traced as a single static graph.
        input_data['use_cache'] = False

        # dynamic=False: extract a static-shape graph for these fixed inputs.
        wrapped = extract(name=model_name, dynamic=False)(model).eval()
        with torch.no_grad():
            wrapped(**input_data)
        
        print(f"[OK] {model_name}: Full graph extracted.")
        return 1

    except Exception as e:
        print(f"[FAIL] {model_name}: extract error - {e}")
        return 0

# run_nlp_model_full_graph("Qwen/Qwen2.5-0.5B", "cuda:0" if torch.cuda.is_available() else "cpu")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("model_processing.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

GRAPH_NET_EXTRACT_WORKSPACE = os.getenv("GRAPH_NET_EXTRACT_WORKSPACE", "./workspace")
SUCCESS_MODELS_DIR = os.getenv("SUCCESS_MODELS_DIR", "./validated_models")
TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/hub"))

os.makedirs(SUCCESS_MODELS_DIR, exist_ok=True)
os.makedirs(GRAPH_NET_EXTRACT_WORKSPACE, exist_ok=True)

# Models to test
MODEL_LIST = [
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-3B",
    "Qwen/Qwen2.5-7B",
    "Qwen/Qwen2.5-14B",
    "Qwen/Qwen2.5-32B",
]

def clear_transformers_cache():
    """清理transformers缓存目录"""
    logger.info(f"Cleaning transformers cache: {TRANSFORMERS_CACHE}")
    
    if not os.path.exists(TRANSFORMERS_CACHE):
        logger.warning(f"Cache directory does not exist: {TRANSFORMERS_CACHE}")
        return
    
    # Delete everything inside the cache directory
    for item in os.listdir(TRANSFORMERS_CACHE):
        item_path = os.path.join(TRANSFORMERS_CACHE, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except Exception as e:
            logger.error(f"Failed to delete {item_path}. Reason: {e}")
    
    logger.info("Transformers cache cleared successfully")

def process_model(model_name):
    """处理单个模型的完整流程"""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_dir = os.path.join(GRAPH_NET_EXTRACT_WORKSPACE, model_name)
    
    try:
        logger.info(f"Starting processing for: {model_name}")
        
        # Step 1: run model extraction
        result = run_nlp_model_full_graph(model_name, device)
        
        if result == 1:
            logger.info(f"Extraction successful for {model_name}")
            
            # Step 2: run the validation command
            validation_cmd = [
                "python", "-m", "graph_net.torch.validate",
                "--model-path", model_dir
            ]
            
            logger.info(f"Validating model: {' '.join(validation_cmd)}")
            
            try:
                val_result = subprocess.run(
                    validation_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=600
                )
                
                if val_result.returncode == 0:
                    # Copy the validated graph under the model's short name (e.g. "Qwen2.5-0.5B").
                    short_name = model_name.split("/")[-1]
                    dest_path = os.path.join(SUCCESS_MODELS_DIR, short_name)
                    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
                    if os.path.exists(dest_path):
                        shutil.rmtree(dest_path)
                    
                    shutil.copytree(model_dir, dest_path)
                    
                    logger.info(f"Model validated and copied to {dest_path}")
                    return "SUCCESS", "Extraction and validation successful"
                else:
                    error_msg = f"Validation failed (code {val_result.returncode}):\n{val_result.stderr}"
                    logger.error(error_msg)
                    return "VALIDATION_FAILED", error_msg
                    
            except subprocess.TimeoutExpired:
                error_msg = "Validation command timed out (10 minutes)"
                logger.error(error_msg)
                return "VALIDATION_TIMEOUT", error_msg
                
            except Exception as e:
                error_msg = f"Validation command failed: {str(e)}\n{traceback.format_exc()}"
                logger.error(error_msg)
                return "VALIDATION_ERROR", error_msg
                
        else:
            error_msg = "Extraction returned failure code"
            logger.error(error_msg)
            return "EXTRACTION_FAILED", error_msg
            
    except Exception as e:
        error_msg = f"Critical processing error: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return "PROCESSING_ERROR", error_msg

def main():
    success_models = []
    failed_models = []
    
    logger.info("=" * 60)
    logger.info(f"Starting model processing for {len(MODEL_LIST)} models")
    logger.info("=" * 60)
    
    for i, model in enumerate(MODEL_LIST, 1):
        logger.info(f"\n{'=' * 50}")
        logger.info(f"Processing model {i}/{len(MODEL_LIST)}: {model}")
        logger.info(f"{'=' * 50}")
        
        start_time = time.time()
        
        status, message = process_model(model)
        duration = time.time() - start_time
        result = {
            "model": model,
            "status": status,
            "message": message,
            "duration": f"{duration:.2f} seconds"
        }
        
        if status == "SUCCESS":
            success_models.append(result)
            logger.info(f"✅ SUCCESS: {model} ({duration:.2f}s)")
        else:
            failed_models.append(result)
            logger.error(f"❌ FAILED: {model} - {status} ({duration:.2f}s)")
        
        logger.info("Cleaning up resources...")
        clear_transformers_cache()
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.info("CUDA cache cleared")
        
        logger.info(f"Cleanup completed for {model}")
    
    with open("success_models.txt", "w") as f:
        for result in success_models:
            f.write(f"{result['model']}\n")
    
    with open("failed_models.txt", "w") as f:
        for result in failed_models:
            f.write(f"{result['model']}: {result['status']}\n")
            f.write(f"Duration: {result['duration']}\n")
            f.write(f"Error: {result['message']}\n")
            f.write("-" * 80 + "\n")
    
    logger.info("\n" + "=" * 60)
    logger.info("Processing complete!")
    logger.info(f"Success: {len(success_models)} models")
    logger.info(f"Failed: {len(failed_models)} models")
    logger.info(f"Results saved to: model_processing.log")
    logger.info(f"Success models list: success_models.txt")
    logger.info(f"Failed models details: failed_models.txt")
    logger.info("=" * 60)

if __name__ == "__main__":
    main()
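
A note on the dynamic=False setting used in the extract call above: the assumption here (the extractor's documentation is not quoted in this PR) is that it makes the tracer capture a static-shape graph specialized to the given inputs rather than a shape-polymorphic one, analogous to the dynamic flag of torch.compile. A minimal sketch of that analogy:

import torch

# Analogy only: torch.compile's dynamic flag controls whether input shapes are
# treated as symbolic. With dynamic=False the captured graph is specialized to
# the concrete shapes seen at call time, which is what the extraction script
# relies on by feeding a single fixed-length prompt.
model = torch.nn.Linear(8, 8).eval()
compiled = torch.compile(model, dynamic=False)
with torch.no_grad():
    compiled(torch.randn(2, 8))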

Cause of the CI failure: the generated hashes are identical. The leading (head) layers of the Qwen series share the same structure. The first extraction captures the complete computational graph, but during the second extraction tracing hits torch._C._functorch.PyCapsule._vmap_increment_nesting, a function torch cannot trace, so graph capture stops there. The hash is then computed from the portion traced before the break, and since that prefix is identical across the Qwen series, every model ends up with the same hash.
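
To make the collision mechanism concrete, here is a rough sketch (the hashing below is a stand-in for illustration only, not the actual graph_net hashing code): because tracing aborts at the same untraceable call for every Qwen model, the traced prefixes are byte-for-byte identical, and identical inputs to any hash function necessarily produce identical digests.

import hashlib

# Hypothetical textual dumps of the graphs captured on the second extraction.
# Tracing stops at torch._C._functorch.PyCapsule._vmap_increment_nesting, so
# only the shared Qwen head structure is recorded, regardless of model size.
partial_graph_qwen25_0_5b = "embed_tokens -> rotary_emb -> <trace aborted>"
partial_graph_qwen25_7b = "embed_tokens -> rotary_emb -> <trace aborted>"

h1 = hashlib.sha256(partial_graph_qwen25_0_5b.encode()).hexdigest()
h2 = hashlib.sha256(partial_graph_qwen25_7b.encode()).hexdigest()
assert h1 == h2  # identical traced prefixes -> identical hashes -> CI flags duplicates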


paddle-bot bot commented Aug 5, 2025

Thanks for your contribution!

@paddle-bot bot added the contributor (External developers) label Aug 5, 2025
@fangfangssj closed this Aug 6, 2025
@fangfangssj reopened this Aug 6, 2025