@@ -64,6 +64,25 @@ def causal_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int)
    return kv_idx <= q_idx


+def sliding_window_overlay(sliding_window: int) -> Callable:
+    """
+    This is an overlay depicting a sliding window pattern. Add it on top of a causal mask for a proper sliding
+    window mask.
+    """
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return kv_idx > q_idx - sliding_window
+
+    return inner_mask
+
+
+def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
+    """
+    This returns the mask function used to create a sliding window mask.
+    """
+    return and_masks(sliding_window_overlay(sliding_window), causal_mask_function)
+
+
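To make the composition concrete, here is a minimal sketch in plain Python of what the combined predicate does (batch/head indices dropped for brevity); it assumes, as the `and_masks` call above suggests, that `and_masks` simply intersects the two mask functions:

def _causal(q_idx, kv_idx):
    return kv_idx <= q_idx

def _overlay(sliding_window):
    return lambda q_idx, kv_idx: kv_idx > q_idx - sliding_window

def _sliding_window_causal(sliding_window):
    window_fn = _overlay(sliding_window)
    # Intersection of the two predicates, mirroring and_masks(overlay, causal)
    return lambda q_idx, kv_idx: _causal(q_idx, kv_idx) and window_fn(q_idx, kv_idx)

mask_fn = _sliding_window_causal(3)
# Query position 4 may attend to key positions 2, 3 and 4 only (causal + window of 3)
print([kv for kv in range(6) if mask_fn(4, kv)])  # [2, 3, 4]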
def _vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
    """
    Used to vmap our mask_functions over the q_idx and kv_idx dimensions of the inputs. Optionally, vmap over
@@ -280,12 +299,65 @@ def eager_mask(
    return mask


+def flash_attention_mask(
+    batch_size: int,
+    cache_position: ms.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: Optional[ms.Tensor] = None,
+    **kwargs,
+):
311+ """
312+ Create the attention mask necesary to use FA2. Since FA2 is un-padded by definition, here we simply return
313+ `None` if the mask is fully causal, or we return the 2D mask which will then be used to extract the seq_lens.
314+ We just slice it in case of sliding window.
315+
316+ Args:
317+ batch_size (`int`):
318+ The batch size of the input sequence.
319+ cache_position (`ms.Tensor`):
320+ A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
321+ kv_length (`int`):
322+ The size that the key and value states will have during the attention computation.
323+ kv_offset (`int`, optional):
324+ An optional offset to indicate at which first position the key and values states will refer to.
325+ mask_function (`Callable`):
326+ The mask factory function describing the mask pattern.
327+ attention_mask (`ms.Tensor`, optional):
328+ The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
329+ """
+    if attention_mask is not None:
+        # Here we need to slice from the right if using sliding or chunked (for full attention, this is equivalent to doing nothing)
+        attention_mask = attention_mask[:, -kv_length:]
+        # We only return an actual mask if there is at least 1 padding token, otherwise we return `None` and use `is_causal` in FA2
+        # (note that the attention_mask is a boolean dtype here)
+        if attention_mask.all():
+            attention_mask = None
+
+    return attention_mask
+
+
+def flex_attention_mask(
+    batch_size: int,
+    cache_position: ms.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: Optional[ms.Tensor] = None,
+    **kwargs,
+):
+    raise NotImplementedError("`flex_attention` is not supported yet.")
+
+
class AttentionMaskInterface(GeneralInterface):
    # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if
    # a new instance is created (in order to locally override a given function)
    _global_mapping = {
+        "sdpa": sdpa_mask,
        "eager": eager_mask,
-        "flash_attention_2": eager_mask,
+        "flash_attention_2": flash_attention_mask,
+        "flex_attention": flex_attention_mask,
    }


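As a hedged usage sketch (not part of the diff): the registry above is looked up by `config._attn_implementation`, and the FA2 entry only produces a real mask when padding is present. `ALL_MASK_ATTENTION_FUNCTIONS` is assumed to be the `AttentionMaskInterface` instance used later in this file, and the tensors are purely illustrative:

import mindspore as ms

mask_builder = ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"]  # -> flash_attention_mask

padded = ms.Tensor([[True, True, True, False]])    # last token is padding
unpadded = ms.Tensor([[True, True, True, True]])   # no padding at all

# With padding: the right-sliced 2D boolean mask comes back so FA2 can derive seq_lens from it
print(mask_builder(batch_size=1, cache_position=ms.ops.arange(4), kv_length=4, attention_mask=padded))

# Without padding: `None` comes back and FA2 relies on its own `is_causal` handling instead
print(mask_builder(batch_size=1, cache_position=ms.ops.arange(4), kv_length=4, attention_mask=unpadded))  # None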
@@ -308,13 +380,13 @@ def _preprocess_mask_arguments(
    Args:
        config (`PretrainedConfig`):
            The model config.
-        input_embeds (`torch.Tensor`):
+        input_embeds (`ms.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
-        attention_mask (`torch.Tensor`, optional):
+        attention_mask (`ms.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
-        cache_position (`torch.Tensor`):
+        cache_position (`ms.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
@@ -325,7 +397,7 @@ def _preprocess_mask_arguments(
    Returns:
        early_exit (`bool`):
            Whether we should early exit mask creation, and return the mask as-is.
-        attention_mask (`torch.Tensor` or `BlockMask` or `None`):
+        attention_mask (`ms.Tensor` or `BlockMask` or `None`):
            The attention mask to either return immediately, or to use in downstream mask creation.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
@@ -375,13 +447,13 @@ def create_causal_mask(
    Args:
        config (`PretrainedConfig`):
            The model config.
-        input_embeds (`torch.Tensor`):
+        input_embeds (`ms.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
-        attention_mask (`torch.Tensor`, optional):
+        attention_mask (`ms.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
-        cache_position (`torch.Tensor`):
+        cache_position (`ms.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
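For orientation, a hedged sketch of how a modeling file might call this builder; `model_config`, `inputs_embeds`, `attention_mask_2d`, `cache_position` and `past_key_values` are illustrative placeholders rather than names defined in this diff:

# Builds the mask once per forward pass, using whichever backend the config selects
causal_mask = create_causal_mask(
    config=model_config,
    input_embeds=inputs_embeds,
    attention_mask=attention_mask_2d,
    cache_position=cache_position,
    past_key_values=past_key_values,
)
# Depending on config._attn_implementation this is a 4D mask (sdpa/eager), a 2D mask or
# None (flash_attention_2); the flex_attention entry currently raises NotImplementedError.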
@@ -435,6 +507,86 @@ def create_causal_mask(
    return causal_mask


+def create_sliding_window_causal_mask(
+    config: PretrainedConfig,
+    input_embeds: ms.Tensor,
+    attention_mask: Optional[ms.Tensor],
+    cache_position: ms.Tensor,
+    past_key_values: Optional[Cache],
+    or_mask_function: Optional[Callable] = None,
+    and_mask_function: Optional[Callable] = None,
+) -> Optional[Union[ms.Tensor, BlockMask]]:
519+ """
520+ Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
521+ of attention pattern was mostly democratized by Mistral. If `past_key_values` has an HybridCache structure, this
522+ function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
523+ `modeling_xxx.py` files).
524+
525+ Args:
526+ config (`PretrainedConfig`):
527+ The model config.
528+ input_embeds (`ms.Tensor`):
529+ The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
530+ batch size, query length and dtype.
531+ attention_mask (`ms.Tensor`, optional):
532+ The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
533+ It can also be an already prepared 4D mask, in which case it is returned as-is.
534+ cache_position (`ms.Tensor`):
535+ A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
536+ past_key_values (`Cache`, optional):
537+ The past key values, if we use a cache.
538+ or_mask_function (`Callable`, optional):
539+ An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
540+ useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
541+ and_mask_function (`Callable`, optional):
542+ An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
543+ useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
544+ """
+    # If we have a HybridCache structure, here we want to create the mask for the sliding layers
+    if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
+        layer_idx = past_key_values.is_sliding.index(True)
+    else:
+        layer_idx = 0
+
+    early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
+        config, input_embeds, attention_mask, cache_position, past_key_values, layer_idx
+    )
+    if early_exit:
+        return attention_mask
+
+    sliding_window = getattr(config, "sliding_window", None)
+    if sliding_window is None:
+        raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")
+
+    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
+    mask_factory_function = sliding_window_causal_mask_function(sliding_window)
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+
+    # Do not allow skip if we are compiling (this is to match BC)
+    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
+    allow_is_causal_skip = not past_key_values.is_compileable if past_key_values is not None else True
+
+    # Allow slight deviations from sliding causal mask
+    if or_mask_function is not None or and_mask_function is not None:
+        raise NotImplementedError("`or_mask_function` or `and_mask_function` arguments are not supported yet.")
+
+    # We now create the mask
+    causal_mask = mask_interface(
+        batch_size=batch_size,
+        cache_position=cache_position,
+        kv_length=kv_length,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
+        local_size=sliding_window,  # Additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+    )
+    return causal_mask
+
+
LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING = {
    "full_attention": create_causal_mask,
+    "sliding_attention": create_sliding_window_causal_mask,
}