1515import inspect
1616import random
1717import textwrap
18+ import warnings
1819from collections import defaultdict
1920from collections .abc import Callable
2021from contextlib import contextmanager , nullcontext
@@ -775,7 +776,19 @@ def process_row(
775776 ) -> dict [str , list [int ]]:
776777 """
777778 Same as `tokenize_row` but for vision models. Please refer to `tokenize_row` for more information.
779+
780+ Note: Unlike `tokenize_row`, this method does not truncate prompts even if `max_prompt_length` is set. For
781+ vision models, prompts contain image tokens that must exactly match the image features (pixel_values).
782+ Truncating these tokens would cause a mismatch, leading to errors during the forward pass, like "Image features
783+ and image tokens do not match". Users should filter their datasets to ensure prompts are an appropriate length
784+ before training.
778785 """
786+ if max_prompt_length is not None :
787+ warnings .warn (
788+ "max_prompt_length is not supported for vision models and will be ignored. "
789+ "Truncating prompts would cause image token/feature mismatch errors." ,
790+ stacklevel = 2 ,
791+ )
779792 processor , tokenizer = processing_class , processing_class .tokenizer # the processing class is a processor
780793 processed_features = processor (images = features ["images" ], text = features ["prompt" ], add_special_tokens = False )
781794
@@ -794,9 +807,11 @@ def process_row(
794807 chosen_input_ids = chosen_input_ids + [tokenizer .eos_token_id ]
795808 rejected_input_ids = rejected_input_ids + [tokenizer .eos_token_id ]
796809
797- # Truncate prompt and completion sequences
798- if max_prompt_length is not None :
799- prompt_input_ids = prompt_input_ids [- max_prompt_length :]
810+ # Truncate completion sequences only.
811+ # Note: We do not truncate prompt_input_ids for vision models because the prompts contain image tokens
812+ # that must exactly match the image features (pixel_values). Truncating would cause errors like
813+ # "Image features and image tokens do not match: tokens: X, features: Y". Users should filter overlong
814+ # prompts from their dataset before training; this is the recommended alternative to the deprecated max_prompt_length.
800815 if max_completion_length is not None :
801816 chosen_input_ids = chosen_input_ids [:max_completion_length ]
802817 rejected_input_ids = rejected_input_ids [:max_completion_length ]
0 commit comments