
Commit 4ea8a66

support GPT-OSS

1 parent 62b5dd4, commit 4ea8a66

File tree

8 files changed: +683, -21 lines


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/D_CRT_SECURE_NO_WARNINGS>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/wd4996>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/wd4722>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/MP>")
 endif ()
 
 if (NOT MSVC)
@@ -79,6 +80,7 @@ set(core_files src/backend.cpp
                models/falcon.cpp
                models/gemma.cpp
                models/gigachat.cpp
+               models/gpt.cpp
                models/granite.cpp
                models/groq.cpp
                models/grok.cpp

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-08-11: GPT-OSS
 * 2025-08-05: Pangu-Embedded
 * 2025-07-29: Jiutian
 * 2025-07-10: SmolLM-3

convert.py

Lines changed: 164 additions & 1 deletion
@@ -209,6 +209,8 @@ class ModelType(Enum):
 
     JiuTian = 0x2900
 
+    GPTOSS = 0x2A00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -7242,7 +7244,6 @@ def get_block(prefix: str):
 
         return weights + dac_weights
 
-
 class JiuTianConverter(BaseConverter):
     MODEL_TYPE = ModelType.JiuTian
 
@@ -7262,6 +7263,166 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return QWen2Converter.get_weight_names(config)
 
+class GptOssConverter(BaseConverter):
+    MODEL_TYPE = ModelType.GPTOSS
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        def convert_moe_packed_tensors(
+            blocks,
+            scales,
+            *,
+            dtype: torch.dtype = torch.bfloat16,
+            rows_per_chunk: int = 32768 * 1024,
+        ) -> torch.Tensor:
+            FP4_VALUES = [ +0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0, -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0, ]
+            scales = scales.to(torch.int32) - 127
+            assert blocks.shape[:-1] == scales.shape, f"{blocks.shape=} does not match {scales.shape=}"
+
+            lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)
+
+            *prefix_shape, G, B = blocks.shape
+            rows_total = math.prod(prefix_shape) * G
+
+            blocks = blocks.reshape(rows_total, B)
+            scales = scales.reshape(rows_total, 1)
+
+            out = torch.empty(rows_total, B * 2, dtype=dtype, device=blocks.device)
+
+            for r0 in range(0, rows_total, rows_per_chunk):
+                r1 = min(r0 + rows_per_chunk, rows_total)
+
+                blk = blocks[r0:r1]
+                exp = scales[r0:r1]
+
+                # nibble indices -> int64
+                idx_lo = (blk & 0x0F).to(torch.long)
+                idx_hi = (blk >> 4).to(torch.long)
+
+                sub = out[r0:r1]
+                sub[:, 0::2] = lut[idx_lo]
+                sub[:, 1::2] = lut[idx_hi]
+
+                torch.ldexp(sub, exp, out=sub)
+                del idx_lo, idx_hi, blk, exp
+
+            out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)
+            # to match for now existing implementation
+            return out.to(torch.float8_e5m2)
+
+        r = {}
+
+        for name in state_dict:
+            t = state_dict[name]
+            if name.endswith('mlp.experts.gate_up_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('gate_up_proj_blocks', 'gate_up_proj_scales')])
+                for j in range(config.num_local_experts):
+                    gate_up = unpacked[j]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.gate_proj.weight')
+                    r[new_name] = gate_up[0::2, ...]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.up_proj.weight')
+                    r[new_name] = gate_up[1::2, ...]
+
+            elif name.endswith('mlp.experts.gate_up_proj_bias'):
+                for j in range(config.num_local_experts):
+                    gate_up = t[j]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.gate_proj.bias')
+                    r[new_name] = gate_up[0::2]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.up_proj.bias')
+                    r[new_name] = gate_up[1::2]
+            elif name.endswith('mlp.experts.down_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('down_proj_blocks', 'down_proj_scales')])
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_blocks', f'experts.{j}.down_proj.weight')
+                    r[new_name] = unpacked[j]
+            elif name.endswith('mlp.experts.down_proj_bias'):
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_bias', f'experts.{j}.down_proj.bias')
+                    r[new_name] = t[j]
+            elif name.endswith('mlp.experts.gate_up_proj_scales') or name.endswith('mlp.experts.down_proj_scales'):
+                pass
+            else:
+                r[name] = t
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        MAX_LAYERS = 128
+        assert not config.tie_word_embeddings
+        assert len(config.layer_types) <= MAX_LAYERS
+        assert config.num_hidden_layers <= MAX_LAYERS
+        assert config.rope_scaling['rope_type'] == 'yarn'
+
+        dump_llama_like_config(f, config, ggml_type)
+
+        layer_types = [0] * MAX_LAYERS
+        for i in range(len(config.layer_types)):
+            layer_types[i] = 1 if config.layer_types[i] == 'sliding_attention' else 0
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+            config.experts_per_token,
+            config.num_experts_per_tok,
+            config.num_local_experts,
+            config.sliding_window,
+        ] + layer_types
+        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
+
+        config_values = [
+            config.router_aux_loss_coef,
+            config.swiglu_limit,
+            config.rope_theta,
+            config.rope_scaling['original_max_position_embeddings'],
+            config.rope_scaling['beta_fast'],
+            config.rope_scaling['beta_slow'],
+            config.rope_scaling['factor'],
+        ]
+        f.write(struct.pack("<" + "f" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+            ]
+
+            for j in range(config.num_local_experts):
+                weight_names += [
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.bias",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.mlp.router.weight",
+                f"model.layers.{i}.mlp.router.bias",
+
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+                f"model.layers.{i}.self_attn.sinks",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight"
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -7857,6 +8018,8 @@ def main():
         PanguEmbeddedConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'JiutianForCausalLM':
         JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'GptOssForCausalLM':
+        GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
        QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
        QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
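For reference, the MXFP4 unpacking in convert_moe_packed_tensors above decodes two 4-bit values per byte through a 16-entry lookup table and then applies the per-block exponent (stored with a bias of 127) via ldexp. A minimal standalone sketch of the same decode on a toy block; the helper name and sample values are illustrative, not part of this commit:

import torch

# 16 representable FP4 values, indexed by the 4-bit nibble (same table as FP4_VALUES above).
FP4_VALUES = [+0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0,
              -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]

def dequant_block(packed: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Decode one packed block: `packed` holds uint8 bytes (two FP4 nibbles each),
    `scale` is the biased exponent byte stored alongside the block."""
    lut = torch.tensor(FP4_VALUES, dtype=torch.float32)
    lo = lut[(packed & 0x0F).to(torch.long)]   # low nibble -> even output positions
    hi = lut[(packed >> 4).to(torch.long)]     # high nibble -> odd output positions
    out = torch.empty(packed.numel() * 2, dtype=torch.float32)
    out[0::2] = lo
    out[1::2] = hi
    exp = scale.to(torch.int32) - 127          # remove the bias, as in the converter
    return torch.ldexp(out, exp)               # value * 2**exp

packed = torch.tensor([0x21, 0x9F], dtype=torch.uint8)   # nibbles (1, 2) and (15, 9)
scale = torch.tensor([128], dtype=torch.uint8)           # exponent = 128 - 127 = 1
print(dequant_block(packed, scale))                      # tensor([ 1., 2., -12., -1.])

The converter then splits the unpacked gate_up_proj tensor per expert, taking even rows as gate_proj and odd rows as up_proj, which is why state_dict_pp above indexes with [0::2] and [1::2].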

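The dump_config method above appends a fixed GPT-OSS section after the generic llama-like header: six int32 fields, 128 int32 layer-type flags (1 for sliding_attention layers), then seven float32 fields, all little-endian. A sketch of a reader for just that section, with field order taken from the code above; positioning the file handle after the llama-like part is assumed to have happened already:

import struct

MAX_LAYERS = 128  # matches the constant in GptOssConverter.dump_config

def read_gptoss_section(f):
    """Read the GPT-OSS-specific config section; `f` must already be positioned
    right after the data written by dump_llama_like_config."""
    ints = struct.unpack("<" + "i" * (6 + MAX_LAYERS), f.read(4 * (6 + MAX_LAYERS)))
    floats = struct.unpack("<7f", f.read(4 * 7))
    return {
        "num_key_value_heads": ints[0],
        "head_dim": ints[1],
        "experts_per_token": ints[2],
        "num_experts_per_tok": ints[3],
        "num_local_experts": ints[4],
        "sliding_window": ints[5],
        "layer_types": list(ints[6:]),            # 1 = sliding_attention, 0 = full attention
        "router_aux_loss_coef": floats[0],
        "swiglu_limit": floats[1],
        "rope_theta": floats[2],
        # stored as float32 by the converter, even though it is conceptually an int
        "original_max_position_embeddings": floats[3],
        "beta_fast": floats[4],
        "beta_slow": floats[5],
        "rope_scaling_factor": floats[6],
    }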
docs/models.md

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@
 
     Note: Only download `tokenizer.model` and DO NOT download `tokenizer.json` when converting.
 
+* GPT (`GptOssForCausalLM`)
+    * [x] OSS: [20B](https://huggingface.co/openai/gpt-oss-20b/tree/cbf31f62664d4b1360b3a78427f7b3c3ed8f0fa8), [120B](https://huggingface.co/openai/gpt-oss-120b/tree/bc75b44b8a2a116a0e4c6659bcd1b7969885f423)
+
+    Note: Q4_1/Q4_0 quantization won't work. Use Q8 instead.
+
 * Granite (`GraniteForCausalLM`, `GraniteMoeForCausalLM`)
     * [x] v3.0: [Instruct-1B-A400M](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-instruct), [Instruct-3B-A800M](https://huggingface.co/ibm-granite/granite-3.0-3b-a800m-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
     * [x] v3.1: [Instruct-1B-A400M](https://huggingface.co/ibm-granite/granite-3.1-1b-a400m-instruct), [Instruct-3B-A800M](https://huggingface.co/ibm-granite/granite-3.1-3b-a800m-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
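Per the note added to docs/models.md above, GPT-OSS checkpoints should be converted with Q8 rather than Q4_0/Q4_1. A hypothetical invocation of the converter; the flag names and type string here are assumptions, so check `python convert.py --help` for the actual options:

python convert.py -i path/to/gpt-oss-20b -t q8_0 -o gpt-oss-20b.bin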
