Skip to content

Qwen3.5-0.8B GRPO with TRL (most recent version) is not working with vLLM 0.17.0 #5269

@deter3

Description

@deter3

When using pip install git+https://github.com/huggingface/trl.git@bd5307e9ecca6b0985381499250f589dab091605 for GRPO training with vLLM 0.17.0, the error says: "ValueError: There is no module or parameter named 'model' in Qwen3_5ForConditionalGeneration. The available parameters belonging to (Qwen3_5ForConditionalGeneration) are: {'visual.blocks.3.norm2.bias' .............". This happens for both vllm_mode="colocate" and vllm_mode="server".

Reproduction

import torch
import re
import os 
import ast
from rapidfuzz import fuzz
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig  # <--- NEW IMPORT
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    prepare_model_for_kbit_training # <--- NEW IMPORT
)
from trl import GRPOConfig, GRPOTrainer
from transformers import TrainerCallback
import gc
from transformers import TrainerCallback

class NuclearCacheClearCallback(TrainerCallback):
    """Aggressively reclaim memory after every optimizer step.

    Runs Python garbage collection and then empties the CUDA/ROCm caching
    allocator, returning cached blocks to the OS at the end of each step.
    """

    def on_step_end(self, args, state, control, **kwargs):
        # Collect orphaned Python objects first so any GPU tensors they hold
        # become freeable, then release the cached device memory.
        gc.collect()
        torch.cuda.empty_cache()

# --- 1. CONFIGURATION ---
MODEL_NAME = "Qwen/Qwen3.5-0.8B"  # HF model id; must match the model served by `trl vllm-serve`
# We set this slightly lower than the physical max (70k) to be safe
MAX_PROMPT_LENGTH = 20000   # prompts longer than this many tokens are filtered out of the dataset
MAX_COMPLETION_LENGTH = 5000  # cap on generated tokens per completion (GRPOConfig.max_completion_length)
LORA_RANK = 32  # LoRA rank; lora_alpha is derived as 2 * rank below
OUTPUT_DIR = "outputs_qwen_70k_4bit"  # Updated output dir name for clarity

# --- 2. REWARD FUNCTIONS ---
def get_completion_text(completion):
    """Normalize a completion to plain text.

    Handles a raw string, a chat-message list (takes the last message's
    "content"), or a list of strings (concatenated). Anything else is
    stringified via str().
    """
    if isinstance(completion, str):
        return completion
    if isinstance(completion, list) and completion:
        tail = completion[-1]
        if isinstance(tail, dict) and "content" in tail:
            return tail["content"]
        if isinstance(completion[0], str):
            return "".join(completion)
    return str(completion)

def extract_answer(text):
    """Return the contents of the first <answer>...</answer> tag, or None.

    The <think>...</think> block is stripped first, so an <answer> nested
    inside the reasoning section is ignored. Returns None when no answer tag
    remains or *text* is not a string.
    """
    try:
        cleaned_text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
        extracted_data = re.findall(r'<answer>(.*?)</answer>', cleaned_text, flags=re.DOTALL)
        return extracted_data[0].strip() if extracted_data else None
    except TypeError:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        return None

def calculate_robust_rl_reward(prompts, completions, answer, **kwargs):
    """Fuzzy-match reward comparing each completion's <answer> list to ground truth.

    Scoring per completion:
      -1.0  no <answer> tag could be extracted
      -2.0  extracted text is not a valid Python list/tuple, or has > 5 items
       1.0  ground truth empty and prediction empty (-1.0 if prediction non-empty)
      -0.5  no fuzzy matches against the ground-truth list
       1.0 + 0.8 * (matches / len(gt_list))  otherwise
    """
    responses = [get_completion_text(c) for c in completions]
    extracted_responses = [extract_answer(r) for r in responses]

    score_list = []
    for extracted, gt_list in zip(extracted_responses, answer):
        if not extracted:
            score_list.append(-1.0)
            continue
        try:
            generated_list = ast.literal_eval(extracted)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            score_list.append(-2.0)
            continue

        # BUGFIX: literal_eval can return any literal (int, float, dict, ...).
        # The original code then crashed on len(generated_list) with an uncaught
        # TypeError, killing the whole training run. Treat any non-sequence
        # answer as malformed instead.
        if not isinstance(generated_list, (list, tuple)):
            score_list.append(-2.0)
            continue
        if len(generated_list) > 5:
            score_list.append(-2.0)
            continue
        if not gt_list:
            score_list.append(1.0 if not generated_list else -1.0)
        else:
            matches = 0
            for gen in generated_list:
                # 1. Skip anything that isn't a string (like the Ellipsis object)
                if not isinstance(gen, str):
                    continue

                for gt in gt_list:
                    # 2. Safety check for ground truth as well
                    if not isinstance(gt, str):
                        continue

                    # Count at most one match per generated item (break on first hit).
                    if fuzz.token_set_ratio(gt.lower(), gen.lower()) >= 90:
                        matches += 1
                        break
            score_list.append(1.0 + (matches/len(gt_list)*0.8) if matches > 0 else -0.5)
    return score_list

def strict_format_reward_func(prompts, completions, **kwargs):
    """Grant a small bonus (0.125) when a completion contains both a <think>
    tag and an <answer> tag; 0.0 otherwise."""
    texts = (get_completion_text(c) for c in completions)
    return [0.125 if ("<think>" in t and "<answer>" in t) else 0.0 for t in texts]

def xmlcount_reward_func(prompts, completions, **kwargs):
    """Reward well-formed XML tag usage.

    Each of the four tags contributes +0.125 when it appears exactly once;
    otherwise it contributes -0.125 per occurrence (so a missing tag scores 0
    and duplicates are penalized).
    """
    tags = ("<think>", "</think>", "<answer>", "</answer>")
    scores = []
    for completion in completions:
        text = get_completion_text(completion)
        score = 0.0
        for tag in tags:
            occurrences = text.count(tag)
            if occurrences == 1:
                score += 0.125
            else:
                score -= 0.125 * occurrences
        scores.append(score)
    return scores

# --- 3. MODEL & TOKENIZER SETUP (4-BIT QLoRA) ---
print("Loading Model in 4-bit (NF4)...")

# Load Tokenizer first for the dataset logic
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Qwen tokenizers ship without a pad token
tokenizer.padding_side = "left"  # left padding required for decoder-only generation

# Define 4-bit Quantization Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bf16 for stability
    bnb_4bit_quant_type="nf4",              # Normalized Float 4 (standard for QLoRA)
    bnb_4bit_use_double_quant=True          # Double quantization saves a bit more memory
)
# Pin each process to its own GPU under `accelerate launch`.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
device_map = {"": local_rank}

print(f"Process Rank {local_rank} is using GPU {device_map} for Training.")

# NOTE(review): bnb_config is defined above but never passed to from_pretrained
# (no quantization_config= kwarg), so the model actually loads in plain bf16,
# not 4-bit — despite the "4-bit" prints. Confirm which is intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16, # Use pure BF16
    attn_implementation= "sdpa",#"sdpa", #"flash_attention_2",
    device_map=device_map
)

# Prepare model for k-bit training (Important: enables gradient checkpointing and freezes base model)
#model = prepare_model_for_kbit_training(model)

# LoRA Config
peft_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_RANK * 2,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
# Non-reentrant checkpointing is required when inputs may not require grad (PEFT).
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False 

print(f"Current Attention Implementation: {model.config._attn_implementation}")
# NOTE(review): print_trainable_parameters() prints internally and returns None,
# so this line ends with "Trainable Params: None".
print(f"Model Ready. Trainable Params: {model.print_trainable_parameters()}")

# --- 4. DATASET SETUP (ACCURATE TOKEN COUNTING) ---
def get_gsm8k_questions_accurate(split="train") -> Dataset:
    """Load the 'negative-data' dataset, render chat prompts, and keep only
    examples whose prompt fits in MAX_PROMPT_LENGTH tokens (up to 1300 rows).

    Token lengths are measured with the real tokenizer (slow but exact)
    rather than a character-count heuristic.
    """
    print("Loading and filtering dataset with accurate token counts...")

    raw = load_dataset('negative-data', split='train')

    def build_prompt(example):
        # Render the chat template with the thinking flag enabled; uses the
        # module-level tokenizer loaded above.
        chat = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": example['input']},
        ]
        rendered = tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True,  # requires a tokenizer version that supports this flag
        )
        return {'prompt': rendered, 'answer': example["output"]}

    data = raw.map(build_prompt)

    def within_token_budget(example):
        # Exact token count — the slow but accurate part.
        token_ids = tokenizer.encode(example['prompt'], add_special_tokens=False)
        return len(token_ids) <= MAX_PROMPT_LENGTH

    filtered_data = data.filter(within_token_budget)

    # Deterministic shuffle, then take up to 1300 surviving examples.
    shuffled = filtered_data.shuffle(seed=42)
    keep = min(len(shuffled), 1300)
    final_data = shuffled.select(range(keep))

    print(f"Dataset ready. Original: {len(data)}, Filtered (<= {MAX_PROMPT_LENGTH}): {len(filtered_data)}, Final: {len(final_data)}")
    return final_data

dataset = get_gsm8k_questions_accurate()

# --- 5. TRAINING CONFIG ---
# NOTE(review): with vllm_server_host/port set and vllm_mode left at its
# default, this runs in "server" mode against the separately-launched
# `trl vllm-serve` process; the colocate settings below are commented out.
training_args = GRPOConfig(
    temperature=1,
    # top_p=0.8, 
    # repetition_penalty=1.2,
    # top_k=20,
    output_dir=OUTPUT_DIR,
    learning_rate=2e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    
    # Allow slightly more than the filter to avoid edge case crashes
    max_completion_length=MAX_COMPLETION_LENGTH,
    
    num_generations=4,             # Keep low for 70k context stability
    bf16=True,
    fp16=False,
    optim="paged_adamw_8bit",      # paged 8-bit AdamW to reduce optimizer memory
    max_grad_norm=0.1,
    logging_steps=1,
    report_to="wandb",
    save_strategy="steps",
    save_steps=20,
    logging_first_step=True, 
    # vLLM Configuration (Multi-GPU Colocate)
    use_vllm=True,
    #vllm_mode="colocate",
    #vllm_device="cuda:0",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    vllm_server_host='0.0.0.0',
    vllm_server_port= 8000,
    vllm_server_timeout= 1200.0,
    #deepspeed="ds_config.json",
    #vllm_gpu_memory_utilization=0.50,
    #torch_empty_cache_steps = 1,
)

# --- 6. TRAINER ---
# Reward functions are summed by GRPOTrainer; order here is cosmetic.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[xmlcount_reward_func, strict_format_reward_func, calculate_robust_rl_reward],
    args=training_args,
    train_dataset=dataset,
    callbacks=[NuclearCacheClearCallback()] 
)

print("Starting Multi-GPU GRPO Training...")
trainer.train()

For the vLLM server: CUDA_VISIBLE_DEVICES=0 nohup trl vllm-serve --model Qwen/Qwen3.5-0.8B --tensor_parallel_size 1 --max-model-len 30000 > vllm123.log 2>&1 &
For the TRL training run: CUDA_VISIBLE_DEVICES=1 nohup accelerate launch test1.py > output.log 2>&1 &

vllm server outputs:

INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     127.0.0.1:40184 - "GET /health/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:40190 - "GET /get_world_size/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:40198 - "POST /init_communicator/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:40198 - "POST /update_named_param/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:40198 - "POST /update_named_param/ HTTP/1.1" 200 OK
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229] Invocation of collective_rpc method failed
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229] Traceback (most recent call last):
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/v1/engine/core.py", line 1219, in _invoke_utility_method
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     result = get_result()
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]              ^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/v1/engine/core.py", line 1200, in <lambda>
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     get_result = lambda: (method := getattr(self, method_name)) and method(
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]                                                                     ^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/v1/engine/core.py", line 733, in collective_rpc
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     return self.model_executor.collective_rpc(method, timeout, args, kwargs)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/v1/executor/uniproc_executor.py", line 76, in collective_rpc
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/v1/serial_utils.py", line 459, in run_method
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     return func(*args, **kwargs)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/trl/scripts/vllm_serve.py", line 147, in update_named_param
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     self.model_runner.model.load_weights(weights=[(name, weight)])
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/qwen3_5.py", line 752, in load_weights
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/model_loader/reload/torchao_decorator.py", line 50, in patched_model_load_weights
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     return original_load_weights(self, weights, *args, **kwargs)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 340, in load_weights
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     autoloaded_weights = set(self._load_module("", self.module, weights))
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]   File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 324, in _load_module
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229]     raise ValueError(msg)
(EngineCore_DP0 pid=963) ERROR 03-10 17:16:32 [core.py:1229] ValueError: There is no module or parameter named 'model' in Qwen3_5ForConditionalGeneration. The available parameters belonging to  (Qwen3_5ForConditionalGeneration) are: {'visual.blocks.3.norm2.bias', 'language_model.model.layers.5.post_attention_layernorm.weight', 'visual.blocks.5.norm1.weight', 'language_model.model.layers.1.linear_attn.out_proj.weight', 'visual.blocks.0.attn.proj.weight', 'language_model.model.layers.16.linear_attn.in_proj_ba.weight', 'visual.blocks.3.norm1.weight', 'language_model.model.layers.7.self_attn.q_norm.weight', 'language_model.model.layers.2.linear_attn.dt_bias', 'visual.blocks.11.attn.proj.weight', 'visual.merger.norm.weight', 'visual.blocks.6.mlp.linear_fc1.bias', 'language_model.model.layers.22.linear_attn.conv1d.weight', 'language_model.model.layers.16.linear_attn.A_log', 'language_model.model.layers.2.mlp.down_proj.weight', 'language_model.model.layers.20.mlp.down_proj.weight', 'language_model.model.layers.17.linear_attn.in_proj_ba.weight', 'visual.blocks.8.mlp.linear_fc1.weight', 'language_model.model.layers.13.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.22.linear_attn.out_proj.weight', 'language_model.model.layers.0.post_attention_layernorm.weight', 'visual.blocks.1.norm1.weight', 'visual.blocks.6.norm1.bias', 'language_model.model.layers.13.linear_attn.norm.weight', 'visual.blocks.6.mlp.linear_fc1.weight', 'language_model.model.layers.9.post_attention_layernorm.weight', 'visual.blocks.6.attn.proj.weight', 'visual.blocks.8.attn.proj.bias', 'visual.blocks.9.norm2.bias', 'language_model.model.layers.2.linear_attn.out_proj.weight', 'language_model.model.layers.7.mlp.gate_up_proj.weight', 'language_model.model.layers.22.input_layernorm.weight', 'language_model.model.layers.22.linear_attn.dt_bias', 'language_model.model.layers.14.mlp.gate_up_proj.weight', 'language_model.model.layers.11.self_attn.k_norm.weight', 'visual.blocks.9.attn.qkv.bias', 
'visual.blocks.4.attn.proj.weight', 'visual.blocks.1.norm1.bias', 'visual.blocks.6.attn.proj.bias', 'visual.blocks.0.attn.proj.bias', 'language_model.model.layers.3.self_attn.k_norm.weight', 'language_model.model.layers.19.self_attn.q_norm.weight', 'language_model.model.layers.7.post_attention_layernorm.weight', 'visual.blocks.8.attn.qkv.weight', 'language_model.model.layers.6.input_layernorm.weight', 'language_model.model.layers.3.input_layernorm.weight', 'visual.blocks.2.norm2.weight', 'language_model.model.layers.0.linear_attn.out_proj.weight', 'language_model.model.layers.21.linear_attn.in_proj_qkvz.weight', 'visual.blocks.7.attn.proj.weight', 'visual.blocks.5.mlp.linear_fc1.bias', 'visual.blocks.5.attn.qkv.weight', 'visual.blocks.4.mlp.linear_fc2.weight', 'visual.blocks.10.attn.proj.bias', 'language_model.model.layers.2.post_attention_layernorm.weight', 'language_model.model.layers.20.linear_attn.dt_bias', 'visual.blocks.5.attn.qkv.bias', 'visual.blocks.2.norm1.weight', 'language_model.model.layers.11.mlp.gate_up_proj.weight', 'language_model.model.layers.17.input_layernorm.weight', 'visual.blocks.4.norm2.weight', 'visual.blocks.7.mlp.linear_fc2.weight', 'language_model.model.layers.12.linear_attn.in_proj_ba.weight', 'language_model.model.layers.5.linear_attn.dt_bias', 'visual.blocks.3.norm1.bias', 'language_model.model.layers.2.linear_attn.norm.weight', 'language_model.model.layers.22.linear_attn.in_proj_qkvz.weight', 'visual.blocks.1.attn.proj.weight', 'visual.blocks.2.attn.qkv.weight', 'visual.blocks.7.attn.qkv.weight', 'visual.merger.linear_fc1.weight', 'language_model.model.layers.4.post_attention_layernorm.weight', 'visual.blocks.10.norm1.bias', 'language_model.model.layers.5.mlp.gate_up_proj.weight', 'language_model.model.layers.13.input_layernorm.weight', 'visual.blocks.0.mlp.linear_fc2.bias', 'visual.blocks.3.norm2.weight', 'visual.blocks.1.norm2.bias', 'language_model.model.layers.12.linear_attn.in_proj_qkvz.weight', 
'language_model.model.layers.16.mlp.down_proj.weight', 'language_model.model.layers.14.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.15.self_attn.qkv_proj.weight', 'visual.merger.linear_fc2.bias', 'visual.blocks.8.mlp.linear_fc2.bias', 'language_model.model.layers.11.self_attn.qkv_proj.weight', 'visual.blocks.2.mlp.linear_fc2.weight', 'visual.blocks.11.mlp.linear_fc2.bias', 'visual.blocks.8.norm2.weight', 'language_model.model.layers.19.mlp.gate_up_proj.weight', 'visual.blocks.9.mlp.linear_fc1.bias', 'language_model.model.layers.16.mlp.gate_up_proj.weight', 'language_model.model.layers.17.post_attention_layernorm.weight', 'visual.blocks.4.attn.proj.bias', 'visual.blocks.2.attn.proj.weight', 'language_model.model.layers.13.linear_attn.dt_bias', 'language_model.model.layers.7.self_attn.qkv_proj.weight', 'language_model.model.layers.6.linear_attn.norm.weight', 'language_model.model.layers.17.linear_attn.in_proj_qkvz.weight', 'visual.blocks.1.mlp.linear_fc1.bias', 'language_model.model.layers.19.self_attn.k_norm.weight', 'visual.blocks.1.mlp.linear_fc2.weight', 'language_model.model.layers.17.mlp.gate_up_proj.weight', 'language_model.model.layers.9.linear_attn.A_log', 'visual.blocks.4.attn.qkv.bias', 'visual.blocks.3.attn.proj.bias', 'language_model.model.layers.4.linear_attn.norm.weight', 'language_model.model.layers.9.linear_attn.in_proj_ba.weight', 'language_model.model.layers.2.mlp.gate_up_proj.weight', 'language_model.model.layers.17.linear_attn.out_proj.weight', 'language_model.model.layers.23.self_attn.qkv_proj.weight', 'language_model.model.layers.9.linear_attn.dt_bias', 'visual.patch_embed.proj.bias', 'language_model.model.layers.4.linear_attn.in_proj_ba.weight', 'language_model.model.layers.12.mlp.gate_up_proj.weight', 'language_model.model.layers.0.linear_attn.A_log', 'language_model.model.layers.4.linear_attn.conv1d.weight', 'visual.blocks.9.mlp.linear_fc1.weight', 'visual.blocks.10.norm1.weight', 
'language_model.model.layers.14.linear_attn.conv1d.weight', 'language_model.model.layers.13.linear_attn.in_proj_ba.weight', 'visual.blocks.4.norm1.weight', 'language_model.model.layers.14.linear_attn.out_proj.weight', 'language_model.model.layers.16.linear_attn.norm.weight', 'language_model.model.layers.4.input_layernorm.weight', 'language_model.model.layers.0.linear_attn.in_proj_qkvz.weight', 'visual.blocks.4.norm2.bias', 'language_model.model.layers.12.linear_attn.out_proj.weight', 'language_model.model.layers.10.mlp.down_proj.weight', 'language_model.model.layers.4.linear_attn.out_proj.weight', 'language_model.model.layers.14.post_attention_layernorm.weight', 'language_model.model.layers.18.linear_attn.in_proj_qkvz.weight', 'visual.blocks.9.mlp.linear_fc2.weight', 'visual.blocks.7.mlp.linear_fc1.bias', 'language_model.model.layers.11.self_attn.q_norm.weight', 'visual.blocks.0.norm1.weight', 'visual.blocks.1.mlp.linear_fc2.bias', 'language_model.model.layers.4.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.12.linear_attn.dt_bias', 'language_model.model.layers.18.mlp.gate_up_proj.weight', 'language_model.model.layers.6.linear_attn.dt_bias', 'visual.blocks.10.attn.qkv.weight', 'language_model.model.layers.14.input_layernorm.weight', 'language_model.model.layers.13.post_attention_layernorm.weight', 'language_model.model.layers.23.self_attn.o_proj.weight', 'language_model.model.layers.22.linear_attn.in_proj_ba.weight', 'language_model.model.layers.2.linear_attn.in_proj_qkvz.weight', 'visual.blocks.8.norm1.weight', 'language_model.model.layers.8.input_layernorm.weight', 'language_model.model.layers.12.input_layernorm.weight', 'visual.blocks.8.attn.proj.weight', 'visual.blocks.0.norm1.bias', 'language_model.model.layers.20.linear_attn.in_proj_ba.weight', 'language_model.model.layers.20.input_layernorm.weight', 'language_model.model.layers.7.input_layernorm.weight', 'visual.blocks.8.norm2.bias', 
'language_model.model.layers.8.linear_attn.in_proj_ba.weight', 'visual.blocks.0.attn.qkv.weight', 'visual.blocks.5.attn.proj.weight', 'language_model.model.layers.8.mlp.gate_up_proj.weight', 'language_model.model.layers.15.mlp.down_proj.weight', 'language_model.model.layers.16.linear_attn.dt_bias', 'language_model.model.layers.20.post_attention_layernorm.weight', 'language_model.model.layers.16.linear_attn.conv1d.weight', 'language_model.model.layers.1.linear_attn.A_log', 'visual.blocks.4.mlp.linear_fc1.bias', 'language_model.model.layers.8.linear_attn.conv1d.weight', 'language_model.model.layers.8.post_attention_layernorm.weight', 'visual.blocks.1.attn.qkv.bias', 'visual.blocks.6.norm2.weight', 'language_model.model.layers.21.linear_attn.dt_bias', 'language_model.model.layers.2.input_layernorm.weight', 'language_model.model.layers.9.mlp.gate_up_proj.weight', 'language_model.model.layers.12.linear_attn.norm.weight', 'visual.blocks.3.attn.proj.weight', 'language_model.model.layers.16.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.7.mlp.down_proj.weight', 'language_model.model.layers.22.mlp.gate_up_proj.weight', 'visual.blocks.8.attn.qkv.bias', 'language_model.model.layers.11.post_attention_layernorm.weight', 'language_model.model.layers.6.post_attention_layernorm.weight', 'visual.blocks.1.attn.proj.bias', 'language_model.model.layers.23.post_attention_layernorm.weight', 'language_model.model.layers.4.mlp.down_proj.weight', 'language_model.model.layers.6.linear_attn.in_proj_ba.weight', 'visual.blocks.2.mlp.linear_fc1.bias', 'language_model.model.layers.10.linear_attn.A_log', 'language_model.model.layers.21.mlp.gate_up_proj.weight', 'language_model.model.layers.18.linear_attn.A_log', 'visual.blocks.7.mlp.linear_fc2.bias', 'language_model.model.layers.19.self_attn.qkv_proj.weight', 'visual.blocks.3.attn.qkv.weight', 'visual.blocks.5.norm1.bias', 'language_model.model.layers.6.linear_attn.out_proj.weight', 
'language_model.model.layers.12.linear_attn.A_log', 'language_model.model.layers.2.linear_attn.conv1d.weight', 'language_model.model.layers.3.mlp.gate_up_proj.weight', 'visual.blocks.0.norm2.weight', 'language_model.model.embed_tokens.weight', 'language_model.model.layers.21.linear_attn.conv1d.weight', 'language_model.model.layers.12.post_attention_layernorm.weight', 'visual.pos_embed.weight', 'visual.blocks.11.mlp.linear_fc2.weight', 'language_model.model.layers.1.mlp.gate_up_proj.weight', 'language_model.model.layers.18.mlp.down_proj.weight', 'language_model.model.layers.20.linear_attn.out_proj.weight', 'language_model.model.layers.19.input_layernorm.weight', 'language_model.model.layers.9.input_layernorm.weight', 'language_model.model.layers.13.mlp.down_proj.weight', 'visual.blocks.9.norm1.bias', 'language_model.model.layers.23.self_attn.k_norm.weight', 'visual.blocks.2.norm1.bias', 'visual.blocks.7.norm2.weight', 'visual.blocks.8.mlp.linear_fc2.weight', 'language_model.model.layers.17.linear_attn.A_log', 'language_model.model.layers.18.input_layernorm.weight', 'language_model.model.layers.16.linear_attn.out_proj.weight', 'language_model.model.layers.21.linear_attn.in_proj_ba.weight', 'visual.blocks.5.mlp.linear_fc1.weight', 'visual.blocks.2.attn.proj.bias', 'language_model.model.layers.10.linear_attn.conv1d.weight', 'visual.blocks.5.norm2.weight', 'visual.blocks.3.mlp.linear_fc2.weight', 'visual.blocks.11.norm1.weight', 'language_model.model.layers.11.input_layernorm.weight', 'language_model.model.layers.21.linear_attn.out_proj.weight', 'language_model.model.layers.0.mlp.down_proj.weight', 'language_model.model.layers.1.post_attention_layernorm.weight', 'language_model.model.layers.9.mlp.down_proj.weight', 'visual.blocks.7.attn.proj.bias', 'visual.blocks.5.norm2.bias', 'language_model.model.layers.15.self_attn.o_proj.weight', 'visual.blocks.2.attn.qkv.bias', 'language_model.model.layers.18.post_attention_layernorm.weight', 
'visual.blocks.11.mlp.linear_fc1.bias', 'language_model.model.layers.5.input_layernorm.weight', 'language_model.model.layers.5.linear_attn.norm.weight', 'language_model.model.layers.1.linear_attn.in_proj_ba.weight', 'language_model.model.layers.9.linear_attn.out_proj.weight', 'language_model.model.layers.18.linear_attn.conv1d.weight', 'visual.blocks.4.norm1.bias', 'language_model.model.layers.1.mlp.down_proj.weight', 'visual.blocks.0.attn.qkv.bias', 'visual.blocks.1.mlp.linear_fc1.weight', 'language_model.model.layers.5.linear_attn.out_proj.weight', 'visual.blocks.11.norm2.bias', 'visual.blocks.0.mlp.linear_fc1.weight', 'visual.blocks.10.attn.proj.weight', 'visual.blocks.10.mlp.linear_fc2.weight', 'language_model.model.layers.19.post_attention_layernorm.weight', 'language_model.model.layers.17.linear_attn.norm.weight', 'language_model.model.layers.13.mlp.gate_up_proj.weight', 'visual.blocks.4.mlp.linear_fc2.bias', 'visual.blocks.0.norm2.bias', 'visual.blocks.10.norm2.weight', 'language_model.model.layers.0.linear_attn.dt_bias', 'language_model.model.layers.22.post_attention_layernorm.weight', 'language_model.model.layers.22.linear_attn.A_log', 'language_model.model.layers.10.linear_attn.out_proj.weight', 'language_model.model.layers.0.linear_attn.norm.weight', 'language_model.model.layers.0.mlp.gate_up_proj.weight', 'language_model.model.layers.5.linear_attn.conv1d.weight', 'language_model.model.layers.8.linear_attn.dt_bias', 'language_model.model.layers.22.mlp.down_proj.weight', 'visual.blocks.4.mlp.linear_fc1.weight', 'language_model.model.layers.10.linear_attn.norm.weight', 'language_model.model.layers.6.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.8.linear_attn.out_proj.weight', 'language_model.model.layers.8.linear_attn.A_log', 'visual.blocks.6.mlp.linear_fc2.weight', 'visual.blocks.1.norm2.weight', 'language_model.model.layers.19.self_attn.o_proj.weight', 'language_model.model.layers.21.input_layernorm.weight', 
'language_model.model.layers.6.linear_attn.A_log', 'visual.blocks.5.attn.proj.bias', 'visual.blocks.7.norm1.bias', 'language_model.model.layers.15.input_layernorm.weight', 'visual.blocks.9.attn.proj.bias', 'visual.blocks.10.mlp.linear_fc1.weight', 'language_model.model.layers.10.post_attention_layernorm.weight', 'language_model.model.layers.0.linear_attn.in_proj_ba.weight', 'language_model.model.layers.20.linear_attn.in_proj_qkvz.weight', 'visual.blocks.2.mlp.linear_fc1.weight', 'visual.blocks.1.attn.qkv.weight', 'language_model.model.layers.15.self_attn.k_norm.weight', 'language_model.model.layers.18.linear_attn.dt_bias', 'visual.blocks.6.norm2.bias', 'language_model.model.layers.1.linear_attn.dt_bias', 'language_model.model.layers.9.linear_attn.conv1d.weight', 'language_model.model.layers.17.mlp.down_proj.weight', 'language_model.model.layers.1.linear_attn.conv1d.weight', 'language_model.model.layers.22.linear_attn.norm.weight', 'language_model.model.layers.20.mlp.gate_up_proj.weight', 'visual.blocks.9.attn.proj.weight', 'language_model.model.layers.10.linear_attn.in_proj_ba.weight', 'visual.blocks.6.norm1.weight', 'language_model.model.layers.4.linear_attn.A_log', 'language_model.model.layers.2.linear_attn.in_proj_ba.weight', 'language_model.model.layers.2.linear_attn.A_log', 'visual.blocks.9.norm2.weight', 'language_model.model.layers.0.input_layernorm.weight', 'language_model.model.layers.10.linear_attn.in_proj_qkvz.weight', 'visual.blocks.2.norm2.bias', 'language_model.model.layers.1.input_layernorm.weight', 'visual.blocks.9.mlp.linear_fc2.bias', 'visual.blocks.11.norm1.bias', 'language_model.model.layers.13.linear_attn.out_proj.weight', 'language_model.model.layers.17.linear_attn.dt_bias', 'language_model.model.layers.11.self_attn.o_proj.weight', 'language_model.model.norm.weight', 'language_model.model.layers.9.linear_attn.in_proj_qkvz.weight', 'visual.blocks.9.norm1.weight', 'language_model.model.layers.6.linear_attn.conv1d.weight', 
'visual.blocks.5.mlp.linear_fc2.bias', 'language_model.model.layers.3.self_attn.o_proj.weight', 'language_model.model.layers.20.linear_attn.conv1d.weight', 'visual.blocks.7.mlp.linear_fc1.weight', 'visual.blocks.4.attn.qkv.weight', 'language_model.model.layers.3.mlp.down_proj.weight', 'visual.blocks.7.norm2.bias', 'visual.blocks.9.attn.qkv.weight', 'language_model.model.layers.7.self_attn.o_proj.weight', 'visual.blocks.3.attn.qkv.bias', 'language_model.model.layers.5.mlp.down_proj.weight', 'language_model.model.layers.15.self_attn.q_norm.weight', 'language_model.model.layers.3.self_attn.q_norm.weight', 'language_model.model.layers.20.linear_attn.norm.weight', 'language_model.model.layers.0.linear_attn.conv1d.weight', 'language_model.model.layers.8.mlp.down_proj.weight', 'visual.blocks.0.mlp.linear_fc2.weight', 'language_model.model.layers.1.linear_attn.norm.weight', 'language_model.model.layers.18.linear_attn.norm.weight', 'language_model.model.layers.15.post_attention_layernorm.weight', 'language_model.model.layers.8.linear_attn.norm.weight', 'language_model.model.layers.21.linear_attn.A_log', 'language_model.model.layers.13.linear_attn.conv1d.weight', 'visual.patch_embed.proj.weight', 'visual.blocks.3.mlp.linear_fc1.bias', 'language_model.model.layers.9.linear_attn.norm.weight', 'language_model.model.layers.3.post_attention_layernorm.weight', 'language_model.model.layers.11.mlp.down_proj.weight', 'language_model.model.layers.14.mlp.down_proj.weight', 'language_model.model.layers.23.mlp.down_proj.weight', 'language_model.model.layers.1.linear_attn.in_proj_qkvz.weight', 'visual.blocks.8.norm1.bias', 'visual.blocks.10.mlp.linear_fc2.bias', 'language_model.model.layers.16.post_attention_layernorm.weight', 'language_model.model.layers.6.mlp.down_proj.weight', 'visual.blocks.7.norm1.weight', 'visual.blocks.6.mlp.linear_fc2.bias', 'language_model.model.layers.14.linear_attn.A_log', 'language_model.model.layers.23.input_layernorm.weight', 
'visual.blocks.3.mlp.linear_fc2.bias', 'visual.blocks.3.mlp.linear_fc1.weight', 'language_model.model.layers.21.linear_attn.norm.weight', 'language_model.model.layers.18.linear_attn.out_proj.weight', 'language_model.model.layers.16.input_layernorm.weight', 'visual.blocks.6.attn.qkv.weight', 'language_model.model.layers.4.mlp.gate_up_proj.weight', 'language_model.model.layers.15.mlp.gate_up_proj.weight', 'language_model.model.layers.12.linear_attn.conv1d.weight', 'visual.merger.linear_fc1.bias', 'language_model.model.layers.19.mlp.down_proj.weight', 'visual.blocks.8.mlp.linear_fc1.bias', 'language_model.model.layers.18.linear_attn.in_proj_ba.weight', 'visual.blocks.7.attn.qkv.bias', 'language_model.model.layers.6.mlp.gate_up_proj.weight', 'visual.blocks.10.mlp.linear_fc1.bias', 'language_model.model.layers.8.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.7.self_attn.k_norm.weight', 'language_model.model.layers.5.linear_attn.A_log', 'visual.blocks.2.mlp.linear_fc2.bias', 'language_model.model.layers.10.linear_attn.dt_bias', 'visual.blocks.0.mlp.linear_fc1.bias', 'language_model.model.layers.12.mlp.down_proj.weight', 'language_model.model.layers.3.self_attn.qkv_proj.weight', 'language_model.model.layers.14.linear_attn.in_proj_ba.weight', 'language_model.model.layers.5.linear_attn.in_proj_qkvz.weight', 'language_model.model.layers.4.linear_attn.dt_bias', 'visual.blocks.5.mlp.linear_fc2.weight', 'visual.blocks.11.attn.proj.bias', 'language_model.model.layers.21.mlp.down_proj.weight', 'visual.blocks.6.attn.qkv.bias', 'language_model.model.layers.10.mlp.gate_up_proj.weight', 'language_model.model.layers.14.linear_attn.norm.weight', 'visual.blocks.11.norm2.weight', 'language_model.model.layers.5.linear_attn.in_proj_ba.weight', 'language_model.model.layers.23.mlp.gate_up_proj.weight', 'visual.blocks.10.norm2.bias', 'visual.blocks.11.attn.qkv.bias', 'visual.merger.norm.bias', 'visual.blocks.11.attn.qkv.weight', 'visual.blocks.10.attn.qkv.bias', 
'language_model.model.layers.14.linear_attn.dt_bias', 'language_model.model.layers.23.self_attn.q_norm.weight', 'language_model.model.layers.20.linear_attn.A_log', 'language_model.model.layers.17.linear_attn.conv1d.weight', 'visual.merger.linear_fc2.weight', 'visual.blocks.11.mlp.linear_fc1.weight', 'language_model.model.layers.10.input_layernorm.weight', 'language_model.model.layers.21.post_attention_layernorm.weight', 'language_model.model.layers.13.linear_attn.A_log'}

System Info

trl : pip install git+https://github.com/huggingface/trl.git@bd5307e9ecca6b0985381499250f589dab091605

Package Version


accelerate 1.13.0
aiohappyeyeballs 2.6.1
aiohttp 3.13.3
aiosignal 1.4.0
annotated-doc 0.0.4
annotated-types 0.7.0
anthropic 0.84.0
anyio 4.12.1
apache-tvm-ffi 0.1.9
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
arrow 1.3.0
astor 0.8.1
asttokens 2.4.1
async-lru 2.0.4
attrs 24.2.0
babel 2.16.0
beautifulsoup4 4.12.3
bitsandbytes 0.49.2
blake3 1.0.8
bleach 6.1.0
blinker 1.4
cachetools 7.0.5
causal_conv1d 1.6.1
cbor2 5.8.0
certifi 2024.8.30
cffi 1.17.1
charset-normalizer 3.3.2
click 8.3.1
cloudpickle 3.1.2
comm 0.2.2
compressed-tensors 0.13.0
cryptography 3.4.8
cuda-bindings 12.9.4
cuda-pathfinder 1.4.1
cuda-python 12.9.4
cupy-cuda12x 14.0.1
datasets 4.7.0
dbus-python 1.2.18
debugpy 1.8.5
decorator 5.1.1
defusedxml 0.7.1
depyf 0.20.0
dill 0.4.0
diskcache 5.6.3
distro 1.7.0
dnspython 2.8.0
docstring_parser 0.17.0
einops 0.8.2
email-validator 2.3.0
entrypoints 0.4
executing 2.1.0
fastapi 0.135.1
fastapi-cli 0.0.24
fastapi-cloud-cli 0.14.1
fastar 0.8.0
fastjsonschema 2.20.0
filelock 3.25.1
fla-core 0.4.1
flash_attn 2.8.3
flash-linear-attention 0.4.1
flashinfer-python 0.6.4
fqdn 1.5.1
frozenlist 1.8.0
fsspec 2024.2.0
gguf 0.18.0
gitdb 4.0.12
GitPython 3.1.46
googleapis-common-protos 1.73.0
grpcio 1.78.0
grpcio-reflection 1.78.0
h11 0.14.0
hf-xet 1.3.2
httpcore 1.0.5
httplib2 0.20.2
httptools 0.7.1
httpx 0.27.2
httpx-sse 0.4.3
huggingface_hub 1.6.0
idna 3.10
ijson 3.5.0
importlib_metadata 8.7.1
interegular 0.3.3
ipykernel 6.29.5
ipython 8.27.0
ipython-genutils 0.2.0
ipywidgets 8.1.5
isoduration 20.11.0
jedi 0.19.1
jeepney 0.7.1
Jinja2 3.1.6
jiter 0.13.0
jmespath 1.1.0
json5 0.9.25
jsonpointer 3.0.0
jsonschema 4.23.0
jsonschema-specifications 2023.12.1
jupyter-archive 3.4.0
jupyter_client 7.4.9
jupyter_contrib_core 0.4.2
jupyter_contrib_nbextensions 0.7.0
jupyter_core 5.7.2
jupyter-events 0.10.0
jupyter-highlight-selected-word 0.2.0
jupyter-lsp 2.2.5
jupyter_nbextensions_configurator 0.6.4
jupyter_server 2.14.2
jupyter_server_terminals 0.5.3
jupyterlab 4.2.5
jupyterlab_pygments 0.3.0
jupyterlab_server 2.27.3
jupyterlab_widgets 3.0.13
kaldi-native-fbank 1.22.3
keyring 23.5.0
lark 1.2.2
launchpadlib 1.10.16
lazr.restfulclient 0.14.4
lazr.uri 1.0.6
llguidance 1.3.0
llvmlite 0.44.0
lm-format-enforcer 0.11.3
loguru 0.7.3
lxml 5.3.0
markdown-it-py 4.0.0
MarkupSafe 2.1.5
matplotlib-inline 0.1.7
mcp 1.26.0
mdurl 0.1.2
mistral_common 1.9.1
mistune 3.0.2
model-hosting-container-standards 0.1.13
more-itertools 8.10.0
mpmath 1.3.0
msgpack 1.1.2
msgspec 0.20.0
multidict 6.7.1
multiprocess 0.70.18
nbclassic 1.1.0
nbclient 0.10.0
nbconvert 7.16.4
nbformat 5.10.4
nest-asyncio 1.6.0
networkx 3.2.1
ninja 1.13.0
notebook 6.5.5
notebook_shim 0.2.4
numba 0.61.2
numpy 2.2.6
nvidia-cublas-cu12 12.8.4.1
nvidia-cuda-cupti-cu12 12.8.90
nvidia-cuda-nvrtc-cu12 12.8.93
nvidia-cuda-runtime-cu12 12.8.90
nvidia-cudnn-cu12 9.10.2.21
nvidia-cudnn-frontend 1.19.0
nvidia-cufft-cu12 11.3.3.83
nvidia-cufile-cu12 1.13.1.3
nvidia-curand-cu12 10.3.9.90
nvidia-cusolver-cu12 11.7.3.90
nvidia-cusparse-cu12 12.5.8.93
nvidia-cusparselt-cu12 0.7.1
nvidia-cutlass-dsl 4.4.1
nvidia-cutlass-dsl-libs-base 4.4.1
nvidia-ml-py 13.590.48
nvidia-nccl-cu12 2.27.5
nvidia-nvjitlink-cu12 12.8.93
nvidia-nvshmem-cu12 3.4.5
nvidia-nvtx-cu12 12.8.90
oauthlib 3.2.0
openai 2.24.0
openai-harmony 0.0.8
opencv-python-headless 4.13.0.92
opentelemetry-api 1.40.0
opentelemetry-exporter-otlp 1.40.0
opentelemetry-exporter-otlp-proto-common 1.40.0
opentelemetry-exporter-otlp-proto-grpc 1.40.0
opentelemetry-exporter-otlp-proto-http 1.40.0
opentelemetry-proto 1.40.0
opentelemetry-sdk 1.40.0
opentelemetry-semantic-conventions 0.61b0
opentelemetry-semantic-conventions-ai 0.4.15
outlines_core 0.2.11
overrides 7.7.0
packaging 26.0
pandas 3.0.1
pandocfilters 1.5.1
parso 0.8.4
partial-json-parser 0.2.1.1.post7
peft 0.18.1
pexpect 4.9.0
pillow 12.1.1
pip 24.2
platformdirs 4.3.6
prometheus_client 0.21.0
prometheus-fastapi-instrumentator 7.1.0
prompt_toolkit 3.0.47
propcache 0.4.1
protobuf 6.33.5
psutil 6.0.0
ptyprocess 0.7.0
pure_eval 0.2.3
py-cpuinfo 9.0.0
pyarrow 23.0.1
pybase64 1.4.3
pycountry 26.2.16
pycparser 2.22
pydantic 2.12.5
pydantic_core 2.41.5
pydantic-extra-types 2.11.0
pydantic-settings 2.13.1
Pygments 2.18.0
PyGObject 3.42.1
PyJWT 2.11.0
pyparsing 2.4.7
python-apt 2.4.0+ubuntu4
python-dateutil 2.9.0.post0
python-dotenv 1.2.2
python-json-logger 2.0.7
python-multipart 0.0.22
PyYAML 6.0.2
pyzmq 27.1.0
quack-kernels 0.3.2
RapidFuzz 3.14.3
ray 2.54.0
referencing 0.35.1
regex 2026.2.28
requests 2.32.3
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rich 14.3.3
rich-toolkit 0.19.7
rignore 0.7.6
rpds-py 0.20.0
safetensors 0.7.0
SecretStorage 3.3.1
Send2Trash 1.8.3
sentencepiece 0.2.1
sentry-sdk 2.54.0
setproctitle 1.3.7
setuptools 75.1.0
shellingham 1.5.4
six 1.16.0
smmap 5.0.3
sniffio 1.3.1
soupsieve 2.6
sse-starlette 3.3.2
stack-data 0.6.3
starlette 0.52.1
supervisor 4.3.0
sympy 1.14.0
tabulate 0.10.0
terminado 0.18.1
tiktoken 0.12.0
tinycss2 1.3.0
tokenizers 0.22.2
torch 2.10.0
torch_c_dlpack_ext 0.1.5
torchaudio 2.10.0
torchvision 0.25.0
tornado 6.4.1
tqdm 4.67.3
traitlets 5.14.3
transformers 5.3.0
triton 3.6.0
trl 1.0.0.dev0
typer 0.24.1
types-python-dateutil 2.9.0.20240906
typing_extensions 4.15.0
typing-inspection 0.4.2
uri-template 1.3.0
urllib3 2.2.3
uvicorn 0.41.0
uvloop 0.22.1
vllm 0.17.0
wadllib 1.3.6
wandb 0.25.0
watchfiles 1.1.1
wcwidth 0.2.13
webcolors 24.8.0
webencodings 0.5.1
websocket-client 1.8.0
websockets 16.0
wheel 0.44.0
widgetsnbextension 4.0.13
xgrammar 0.1.29
xxhash 3.6.0
yarl 1.23.0
zipp 3.23.0

Checklist

  • I have checked that my issue isn't already filed (see open issues)
  • I have included my system information
  • Any code provided is minimal, complete, and reproducible (more on MREs)
  • Any code provided is properly formatted in code blocks (no screenshots; more on code blocks)
  • Any traceback provided is complete

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions