
Commit 7a1aeec

Fixes in check_model_inputs, GPTBigCodeModel and ImageGPTModel (#40811)
* misc fixes
* fix
* Update src/transformers/models/imagegpt/modeling_imagegpt.py
* Apply suggestion from @IlyasMoutawwakil
* pickup use_cache from args input as well
* fix
1 parent 297a41a commit 7a1aeec

File tree

3 files changed: +34 additions, -27 deletions

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py

Lines changed: 1 addition & 8 deletions
@@ -472,14 +472,7 @@ def forward(
             raise ValueError("batch_size has to be defined and > 0")
 
         if use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
-        if use_cache and isinstance(past_key_values, tuple):
-            logger.warning_once(
-                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
-                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
-                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
-            )
-            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+            past_key_values = DynamicCache(config=self.config)
 
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
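Since GPTBigCodeModel is decoder-only, the default cache it builds no longer needs the encoder-decoder wrapper. Below is a minimal sketch of the effect, assuming a transformers version that includes this commit; the tiny random config is only there so the snippet runs without downloading a checkpoint.

import torch
from transformers import GPTBigCodeConfig, GPTBigCodeForCausalLM
from transformers.cache_utils import DynamicCache

# Tiny randomly initialized model so the sketch runs offline.
config = GPTBigCodeConfig(vocab_size=64, n_positions=32, n_embd=32, n_layer=2, n_head=2)
model = GPTBigCodeForCausalLM(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
with torch.no_grad():
    out = model(input_ids, use_cache=True)

# With this change, a plain DynamicCache is created when no cache is passed in.
print(type(out.past_key_values))  # expected: DynamicCache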

src/transformers/models/imagegpt/modeling_imagegpt.py

Lines changed: 10 additions & 14 deletions
@@ -517,24 +517,20 @@ def forward(
             )
             use_cache = False
 
-        if use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
-        if use_cache and isinstance(past_key_values, tuple):
-            logger.warning_once(
-                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
-                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
-                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
-            )
-            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
-
-        past_length = past_key_values.get_seq_length() if past_key_values is not None else past_key_values
-
         if token_type_ids is not None:
             token_type_ids = token_type_ids.view(-1, input_shape[-1])
 
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + input_shape[-1], device=device
+            )
+
         if position_ids is None:
-            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0)
+            position_ids = cache_position.unsqueeze(0)
 
         # ImageGPTAttention mask.
         if attention_mask is not None:
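The replacement derives position_ids from cache_position instead of from past_length. A toy sketch of the index arithmetic follows; the numbers are made up for illustration and are not taken from the model.

import torch

past_seen_tokens = 6  # e.g. past_key_values.get_seq_length()
new_tokens = 4        # input_shape[-1] for the current forward pass

# Positions of the new tokens, continuing after the cached ones.
cache_position = torch.arange(past_seen_tokens, past_seen_tokens + new_tokens)
position_ids = cache_position.unsqueeze(0)

print(cache_position)  # tensor([6, 7, 8, 9])
print(position_ids)    # tensor([[6, 7, 8, 9]])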

src/transformers/utils/generic.py

Lines changed: 23 additions & 5 deletions
@@ -797,17 +797,34 @@ def check_model_inputs(tie_last_hidden_states=True):
     def wrapped_fn(func):
         @wraps(func)
         def wrapper(self, *args, **kwargs):
-            use_cache = (
-                kwargs["use_cache"] if kwargs.get("use_cache") is not None else getattr(self.config, "use_cache", None)
-            )
+            use_cache_arg_index = None
+            if "use_cache" in func.__code__.co_varnames:
+                use_cache_arg_index = func.__code__.co_varnames.index("use_cache") - 1  # -1 for self
+
+            if (
+                use_cache_arg_index is not None
+                and len(args) > use_cache_arg_index
+                and args[use_cache_arg_index] is not None
+            ):
+                use_cache = args[use_cache_arg_index]
+            elif kwargs.get("use_cache") is not None:
+                use_cache = kwargs["use_cache"]
+            else:
+                use_cache = getattr(self.config, "use_cache", None)
+
             if use_cache is not None:
                 if getattr(self, "gradient_checkpointing", False) and self.training and use_cache:
                     logger.warning_once(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
                     )
                     use_cache = False
 
-                kwargs["use_cache"] = use_cache
+                if use_cache_arg_index is not None and len(args) > use_cache_arg_index:
+                    args = list(args)
+                    args[use_cache_arg_index] = use_cache
+                    args = tuple(args)
+                else:
+                    kwargs["use_cache"] = use_cache
 
             return_dict = kwargs.pop("return_dict", None)
             if return_dict is None:

@@ -818,7 +835,8 @@ def wrapper(self, *args, **kwargs):
                 for k, v in all_args["kwargs"].items():
                     all_args[k] = v
 
-            capture_flags = _CAN_RECORD_REGISTRY.get(str(self.__class__), {})  # there is a weak ref for executorch
+            # _can_record_outputs is None by default
+            capture_flags = _CAN_RECORD_REGISTRY.get(str(self.__class__)) or {}  # there is a weak ref for executorch
             recordable_keys = {
                 f"output_{k}": all_args.get(
                     f"output_{k}",
