# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

# from ..configuration_utils import PretrainedConfig, layer_type_validation
from ..configuration_utils import PretrainedConfig

# from ...modeling_rope_utils import rope_config_validation


class GptOssConfig(PretrainedConfig):
    r"""
    Configuration class for the GPT-OSS model. Instantiating a configuration
    with the default arguments yields a configuration in the style of the
    GPT-OSS architecture.
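
    Example (a minimal sketch; the class is constructed directly here because
    the public import path depends on the surrounding package):

    ```python
    >>> config = GptOssConfig()
    >>> config.model_type
    'gpt_oss'
    >>> config.num_experts
    128
    ```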
| 25 | + """ |

    model_type = "gpt_oss"

    def __init__(
        self,
        num_hidden_layers: int = 24,
        num_local_experts: int = 128,
        vocab_size: int = 201088,
        hidden_size: int = 2880,
        intermediate_size: int = 2880,
        head_dim: int = 64,
        num_attention_heads: int = 64,
        num_key_value_heads: int = 8,
        sliding_window: int = 128,
        rope_theta: float = 150000.0,
        tie_word_embeddings: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        max_position_embeddings: int = 131072,
        rms_norm_eps: float = 1e-5,
        rope_scaling: Optional[dict] = None,
        attention_dropout: float = 0.0,
        num_experts_per_tok: int = 4,
        router_aux_loss_coef: float = 0.9,
        output_router_logits: bool = False,
        use_cache: bool = True,
        layer_types: Optional[list] = None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_experts = num_local_experts
        self.sliding_window = sliding_window
        self.num_experts_per_tok = num_experts_per_tok
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

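        # With num_key_value_heads < num_attention_heads the attention layers use
        # grouped-query attention (GQA): under the defaults, the 64 query heads
        # share 8 key/value heads, i.e. 8 query heads per KV head.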
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        if rope_scaling is None:
            # Default YaRN rope scaling parameters (kept out of the signature to
            # avoid a shared mutable default argument)
            rope_scaling = {"rope_type": "yarn", "factor": 32.0, "beta_fast": 32.0, "beta_slow": 1.0, "truncate": False}
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
        self.layer_types = layer_types
        if self.layer_types is None:
            # Alternate sliding-window and full attention, starting with sliding
            self.layer_types = [
                "sliding_attention" if i % 2 == 0 else "full_attention" for i in range(self.num_hidden_layers)
            ]
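        # e.g. with num_hidden_layers=4 this default yields:
        # ["sliding_attention", "full_attention", "sliding_attention", "full_attention"]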
        # layer_type_validation(self.layer_types)

        # Validate the correctness of rotary position embedding parameters
        # BC: if there is a 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
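        # e.g. a legacy {"type": "yarn", ...} entry is mirrored into "rope_type"
        # so downstream code can rely on "rope_type" alone.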
        # rope_config_validation(self)

        self.attention_bias = True
        self.max_position_embeddings = max_position_embeddings
        self.router_aux_loss_coef = router_aux_loss_coef
        self.output_router_logits = output_router_logits
        self.use_cache = use_cache
        self.use_bias = False

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["GptOssConfig"]
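
# Minimal usage sketch (illustrative only; relies solely on names defined in
# this file, and assumes the relative import above resolves in its package):
#
#   config = GptOssConfig(num_hidden_layers=4)
#   config.layer_types
#   # -> ["sliding_attention", "full_attention", "sliding_attention", "full_attention"]
#   config.rope_scaling["rope_type"]
#   # -> "yarn"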