PaddlePaddle
diff --git a/‎llm/run_finetune.py‎
Lines changed: 1 addition & 1 deletion b/‎llm/run_finetune.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎llm/utils/data.py‎
Lines changed: 5 additions & 11 deletions b/‎llm/utils/data.py‎
Lines changed: 5 additions & 11 deletions
diff --git a/‎paddlenlp/trainer/trainer.py‎
Lines changed: 12 additions & 1 deletion b/‎paddlenlp/trainer/trainer.py‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎paddlenlp/trainer/training_args.py‎
Lines changed: 12 additions & 0 deletions b/‎paddlenlp/trainer/training_args.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎paddlenlp/transformers/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎paddlenlp/transformers/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎paddlenlp/transformers/auto/modeling.py‎
Lines changed: 1 addition & 0 deletions b/‎paddlenlp/transformers/auto/modeling.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddlenlp/transformers/auto/tokenizer.py‎
Lines changed: 1 addition & 0 deletions b/‎paddlenlp/transformers/auto/tokenizer.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddlenlp/transformers/jamba/__init__.py‎
Lines changed: 13 additions & 0 deletions b/‎paddlenlp/transformers/jamba/__init__.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎paddlenlp/transformers/jamba/configuration.py‎
Lines changed: 223 additions & 0 deletions b/‎paddlenlp/transformers/jamba/configuration.py‎
Lines changed: 223 additions & 0 deletions
@@ -342,7 +342,7 @@ def neft_post_hook(module, input, output):
 
     if data_args.zero_padding:
         if (
-            model.base_model_prefix not in ["llama", "bloom", "chatglm", "chatglm_v2", "qwen", "mistral"]
+            model.base_model_prefix not in ["llama", "bloom", "chatglm", "chatglm_v2", "qwen", "mistral", "jamba"]
             and training_args.pipeline_parallel_degree < 1
         ):
             raise NotImplementedError(
 
@@ -56,11 +56,12 @@ def get_convert_example(model):
         "qwen2_moe",
         "gpt",
         "yuan",
+        "jamba",
     ]:
         return convert_example_common
     else:
         raise ValueError(
-            f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma, qwen2, qwen2_moe, yuan",
+            f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma, qwen2, qwen2_moe, yuan, jamba",
         )
 
 
@@ -198,9 +199,7 @@ def convert_example_common(example, tokenizer, data_args, is_test=True, zero_pad
             features["position_ids"] = list(range(seq_length))
         if zero_padding:
             if flash_mask:
-                features["attn_mask_startend_row_indices"] = (
-                    [seq_length] * seq_length
-                )
+                features["attn_mask_startend_row_indices"] = [seq_length] * seq_length
             else:
                 features["attention_mask"] = np.tri(seq_length, seq_length, dtype=bool)
 
@@ -236,13 +235,10 @@ def convert_rounds_example_common(example, tokenizer, data_args, is_test=True, z
     features = {"input_ids": input_ids, "labels": labels}
     if zero_padding:
         if flash_mask:
-            features["attn_mask_startend_row_indices"] = (
-                [seq_length] * seq_length
-            )
+            features["attn_mask_startend_row_indices"] = [seq_length] * seq_length
         else:
             features["attention_mask"] = np.tri(seq_length, seq_length, dtype=bool)
 
-
     if "position_ids" in rounds_inputs:
         rounds_inputs["position_ids"] = rounds_inputs["position_ids"][:-1]
 
@@ -252,9 +248,7 @@ def convert_rounds_example_common(example, tokenizer, data_args, is_test=True, z
 
 def convert_example_chatglm(example, tokenizer, data_args, is_test=True, zero_padding=False, flash_mask=False):
     if flash_mask:
-        raise ValueError(
-            "chatglm does not support flash mask for now!"
-        )
+        raise ValueError("chatglm does not support flash mask for now!")
     if tokenizer.chat_template is not None:
         # chatglm only support single-round finetune
         example = convert_multi_rounds_to_single_round(example, tokenizer)
 
@@ -1827,7 +1827,18 @@ def _wrap_model(self, model, training=True):
         # Multi-gpu training
         if self.args.world_size > 1 and (not self.args.use_hybrid_parallel):
             # MoE uses DDP to broadcast parameters.
-            model = paddle.DataParallel(model)
+            ddp_kwargs = {}
+            if self.args.ddp_find_unused_parameters is not None:
+                ddp_kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters
+            elif isinstance(model, PretrainedModel):
+                # find_unused_parameters breaks checkpointing as per
+                # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
+                ddp_kwargs["find_unused_parameters"] = not any(
+                    hasattr(m, "enable_recompute") and m.enable_recompute for m in model.sublayers(include_self=True)
+                )
+            else:
+                ddp_kwargs["find_unused_parameters"] = True
+            model = paddle.DataParallel(model, **ddp_kwargs)
             # Distributed training (should be after fp16 initialization)
 
             if self.args.amp_master_grad:
 
@@ -343,6 +343,9 @@ class TrainingArguments:
             The list of integrations to report the results and logs to.
             Supported platforms are `"visualdl"`/`"wandb"`/`"tensorboard"`.
             `"none"` for no integrations.
+        ddp_find_unused_parameters (`bool`, *optional*):
+            When using distributed training, the value of the flag `find_unused_parameters` passed to
+            `paddle.DataParallel`. Will default to `False` if recompute is used, `True` otherwise.
         wandb_api_key (`str`, *optional*):
             Weights & Biases (WandB) API key(s) for authentication with the WandB service.
         resume_from_checkpoint (`str`, *optional*):
@@ -762,6 +765,15 @@ class TrainingArguments:
     report_to: Optional[List[str]] = field(
         default=None, metadata={"help": "The list of integrations to report the results and logs to."}
     )
+    ddp_find_unused_parameters: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": (
+                "When using distributed training, the value of the flag `find_unused_parameters` passed to "
+                "`DataParallel`."
+            )
+        },
+    )
     wandb_api_key: Optional[str] = field(
         default=None,
         metadata={"help": "Weights & Biases (WandB) API key(s) for authentication with the WandB service."},
 
@@ -303,3 +303,6 @@
 from .mamba.configuration import *
 from .mamba.modeling import *
 from .mamba.tokenizer import *
+from .jamba.modeling import *
+from .jamba.configuration import *
+from .jamba.tokenizer import *
@@ -124,6 +124,7 @@
         ("Gemma", "gemma"),
         ("Yuan", "yuan"),
         ("Mamba", "mamba"),
+        ("Jamba", "jamba"),
     ]
 )
 
 
@@ -100,6 +100,7 @@
         ("GemmaTokenizer", "gemma"),
         ("YuanTokenizer", "yuan"),
         ("MambaTokenizer", "mamba"),
+        ("JambaTokenizer", "jamba"),
     ]
 )
 
 
@@ -0,0 +1,13 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,223 @@
+# coding=utf-8
+# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Jamba model configuration"""
+import math
+
+from ..configuration_utils import PretrainedConfig
+
+__all__ = [
+    "JambaConfig",
+]
+
+
+class JambaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
+    Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Jamba-v0.1 model.
+
+    [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the Jamba model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`JambaModel`].
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has an output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by mean-pooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None`.
+        max_position_embeddings (`int`, *optional*, defaults to 262144):
+            This value doesn't have any real effect. The maximum sequence length that this model is intended to be
+            used with. It can be used with longer sequences, but performance may degrade.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to route each token to; can also be interpreted as the `top-k` routing
+            parameter.
+        num_experts (`int`, *optional*, defaults to 16):
+            Number of experts per Sparse MLP layer.
+        expert_layer_period (`int`, *optional*, defaults to 2):
+            Once in this many layers, we will have an expert layer
+        expert_layer_offset (`int`, *optional*, defaults to 1):
+            The first layer index that contains an expert mlp layer
+        attn_layer_period (`int`, *optional*, defaults to 8):
+            Once in this many layers, we will have a vanilla attention layer
+        attn_layer_offset (`int`, *optional*, defaults to 4):
+            The first layer index that contains a vanilla attention layer
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
+            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
+            `True` and kernels are not available
+        mamba_d_state (`int`, *optional*, defaults to 16):
+            The dimension of the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
+            Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+
+    """
+
+    model_type = "jamba"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=65536,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        num_logits_to_keep=1,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=262144,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_experts=16,
+        expert_layer_period=2,
+        expert_layer_offset=1,
+        attn_layer_period=8,
+        attn_layer_offset=4,
+        use_mamba_kernels=True,
+        mamba_d_state=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_dt_rank="auto",
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        **kwargs,
+    ):
+        kwargs["return_dict"] = kwargs.pop("return_dict", True)
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.expert_layer_period = expert_layer_period
+        self.expert_layer_offset = expert_layer_offset
+        self.attn_layer_period = attn_layer_period
+        self.attn_layer_offset = attn_layer_offset
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+    @property
+    def layers_block_type(self):
+        return [
+            "attention" if i % self.attn_layer_period == self.attn_layer_offset else "mamba"
+            for i in range(self.num_hidden_layers)
+        ]
+
+    @property
+    def layers_num_experts(self):
+        return [
+            self.num_experts if i % self.expert_layer_period == self.expert_layer_offset else 1
+            for i in range(self.num_hidden_layers)
+        ]
Original file line number	Diff line number	Diff line change
`@@ -124,6 +124,7 @@`
`124`	`124`	`("Gemma", "gemma"),`
`125`	`125`	`("Yuan", "yuan"),`
`126`	`126`	`("Mamba", "mamba"),`
	`127`	`+ ("Jamba", "jamba"),`
`127`	`128`	`]`
`128`	`129`	`)`
`129`	`130`
Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,7 @@`
`100`	`100`	`("GemmaTokenizer", "gemma"),`
`101`	`101`	`("YuanTokenizer", "yuan"),`
`102`	`102`	`("MambaTokenizer", "mamba"),`
	`103`	`+ ("JambaTokenizer", "jamba"),`
`103`	`104`	`]`
`104`	`105`	`)`
`105`	`106`