
Commit 4ea8a66

support GPT-OSS

1 parent 62b5dd4, commit 4ea8a66

File tree

8 files changed: +683, -21 lines


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@ if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/D_CRT_SECURE_NO_WARNINGS>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/wd4996>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/wd4722>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/MP>")
 endif ()
 
 if (NOT MSVC)
@@ -79,6 +80,7 @@ set(core_files src/backend.cpp
                models/falcon.cpp
                models/gemma.cpp
                models/gigachat.cpp
+               models/gpt.cpp
                models/granite.cpp
                models/groq.cpp
                models/grok.cpp

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-08-11: GPT-OSS
 * 2025-08-05: Pangu-Embedded
 * 2025-07-29: Jiutian
 * 2025-07-10: SmolLM-3

convert.py

Lines changed: 164 additions & 1 deletion
@@ -209,6 +209,8 @@ class ModelType(Enum):
 
     JiuTian = 0x2900
 
+    GPTOSS = 0x2A00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -7242,7 +7244,6 @@ def get_block(prefix: str):
 
         return weights + dac_weights
 
-
 class JiuTianConverter(BaseConverter):
     MODEL_TYPE = ModelType.JiuTian
 
@@ -7262,6 +7263,166 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return QWen2Converter.get_weight_names(config)
 
+class GptOssConverter(BaseConverter):
+    MODEL_TYPE = ModelType.GPTOSS
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        def convert_moe_packed_tensors(
+            blocks,
+            scales,
+            *,
+            dtype: torch.dtype = torch.bfloat16,
+            rows_per_chunk: int = 32768 * 1024,
+        ) -> torch.Tensor:
+            FP4_VALUES = [ +0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0, -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0, ]
+            scales = scales.to(torch.int32) - 127
+            assert blocks.shape[:-1] == scales.shape, f"{blocks.shape=} does not match {scales.shape=}"
+
+            lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)
+
+            *prefix_shape, G, B = blocks.shape
+            rows_total = math.prod(prefix_shape) * G
+
+            blocks = blocks.reshape(rows_total, B)
+            scales = scales.reshape(rows_total, 1)
+
+            out = torch.empty(rows_total, B * 2, dtype=dtype, device=blocks.device)
+
+            for r0 in range(0, rows_total, rows_per_chunk):
+                r1 = min(r0 + rows_per_chunk, rows_total)
+
+                blk = blocks[r0:r1]
+                exp = scales[r0:r1]
+
+                # nibble indices -> int64
+                idx_lo = (blk & 0x0F).to(torch.long)
+                idx_hi = (blk >> 4).to(torch.long)
+
+                sub = out[r0:r1]
+                sub[:, 0::2] = lut[idx_lo]
+                sub[:, 1::2] = lut[idx_hi]
+
+                torch.ldexp(sub, exp, out=sub)
+                del idx_lo, idx_hi, blk, exp
+
+            out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)
+            # to match for now existing implementation
+            return out.to(torch.float8_e5m2)
+
+        r = {}
+
+        for name in state_dict:
+            t = state_dict[name]
+            if name.endswith('mlp.experts.gate_up_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('gate_up_proj_blocks', 'gate_up_proj_scales')])
+                for j in range(config.num_local_experts):
+                    gate_up = unpacked[j]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.gate_proj.weight')
+                    r[new_name] = gate_up[0::2, ...]
+                    new_name = name.replace('experts.gate_up_proj_blocks', f'experts.{j}.up_proj.weight')
+                    r[new_name] = gate_up[1::2, ...]
+
+            elif name.endswith('mlp.experts.gate_up_proj_bias'):
+                for j in range(config.num_local_experts):
+                    gate_up = t[j]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.gate_proj.bias')
+                    r[new_name] = gate_up[0::2]
+                    new_name = name.replace('experts.gate_up_proj_bias', f'experts.{j}.up_proj.bias')
+                    r[new_name] = gate_up[1::2]
+            elif name.endswith('mlp.experts.down_proj_blocks'):
+                unpacked = convert_moe_packed_tensors(t, state_dict[name.replace('down_proj_blocks', 'down_proj_scales')])
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_blocks', f'experts.{j}.down_proj.weight')
+                    r[new_name] = unpacked[j]
+            elif name.endswith('mlp.experts.down_proj_bias'):
+                for j in range(config.num_local_experts):
+                    new_name = name.replace('experts.down_proj_bias', f'experts.{j}.down_proj.bias')
+                    r[new_name] = t[j]
+            elif name.endswith('mlp.experts.gate_up_proj_scales') or name.endswith('mlp.experts.down_proj_scales'):
+                pass
+            else:
+                r[name] = t
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        MAX_LAYERS = 128
+        assert not config.tie_word_embeddings
+        assert len(config.layer_types) <= MAX_LAYERS
+        assert config.num_hidden_layers <= MAX_LAYERS
+        assert config.rope_scaling['rope_type'] == 'yarn'
+
+        dump_llama_like_config(f, config, ggml_type)
+
+        layer_types = [0] * MAX_LAYERS
+        for i in range(len(config.layer_types)):
+            layer_types[i] = 1 if config.layer_types[i] == 'sliding_attention' else 0
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+            config.experts_per_token,
+            config.num_experts_per_tok,
+            config.num_local_experts,
+            config.sliding_window,
+        ] + layer_types
+        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
+
+        config_values = [
+            config.router_aux_loss_coef,
+            config.swiglu_limit,
+            config.rope_theta,
+            config.rope_scaling['original_max_position_embeddings'],
+            config.rope_scaling['beta_fast'],
+            config.rope_scaling['beta_slow'],
+            config.rope_scaling['factor'],
+        ]
+        f.write(struct.pack("<" + "f" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+            ]
+
+            for j in range(config.num_local_experts):
+                weight_names += [
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.bias",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.bias",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.mlp.router.weight",
+                f"model.layers.{i}.mlp.router.bias",
+
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+                f"model.layers.{i}.self_attn.sinks",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight"
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -7857,6 +8018,8 @@ def main():
         PanguEmbeddedConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'JiutianForCausalLM':
         JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'GptOssForCausalLM':
+        GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
        QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
        QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
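For reference, the MXFP4 unpacking in convert_moe_packed_tensors above decodes two 4-bit values per byte through a 16-entry lookup table and then applies the per-block exponent (stored with a bias of 127) via ldexp. A minimal standalone sketch of the same decode on a toy block; the helper name and sample values are illustrative, not part of this commit:

import torch

# 16 representable FP4 values, indexed by the 4-bit nibble (same table as FP4_VALUES above).
FP4_VALUES = [+0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0,
              -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]

def dequant_block(packed: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Decode one packed block: `packed` holds uint8 bytes (two FP4 nibbles each),
    `scale` is the biased exponent byte stored alongside the block."""
    lut = torch.tensor(FP4_VALUES, dtype=torch.float32)
    lo = lut[(packed & 0x0F).to(torch.long)]   # low nibble -> even output positions
    hi = lut[(packed >> 4).to(torch.long)]     # high nibble -> odd output positions
    out = torch.empty(packed.numel() * 2, dtype=torch.float32)
    out[0::2] = lo
    out[1::2] = hi
    exp = scale.to(torch.int32) - 127          # remove the bias, as in the converter
    return torch.ldexp(out, exp)               # value * 2**exp

packed = torch.tensor([0x21, 0x9F], dtype=torch.uint8)   # nibbles (1, 2) and (15, 9)
scale = torch.tensor([128], dtype=torch.uint8)           # exponent = 128 - 127 = 1
print(dequant_block(packed, scale))                      # tensor([ 1., 2., -12., -1.])

The converter then splits the unpacked gate_up_proj tensor per expert, taking even rows as gate_proj and odd rows as up_proj, which is why state_dict_pp above indexes with [0::2] and [1::2].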

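The dump_config method above appends a fixed GPT-OSS section after the generic llama-like header: six int32 fields, 128 int32 layer-type flags (1 for sliding_attention layers), then seven float32 fields, all little-endian. A sketch of a reader for just that section, with field order taken from the code above; positioning the file handle after the llama-like part is assumed to have happened already:

import struct

MAX_LAYERS = 128  # matches the constant in GptOssConverter.dump_config

def read_gptoss_section(f):
    """Read the GPT-OSS-specific config section; `f` must already be positioned
    right after the data written by dump_llama_like_config."""
    ints = struct.unpack("<" + "i" * (6 + MAX_LAYERS), f.read(4 * (6 + MAX_LAYERS)))
    floats = struct.unpack("<7f", f.read(4 * 7))
    return {
        "num_key_value_heads": ints[0],
        "head_dim": ints[1],
        "experts_per_token": ints[2],
        "num_experts_per_tok": ints[3],
        "num_local_experts": ints[4],
        "sliding_window": ints[5],
        "layer_types": list(ints[6:]),            # 1 = sliding_attention, 0 = full attention
        "router_aux_loss_coef": floats[0],
        "swiglu_limit": floats[1],
        "rope_theta": floats[2],
        # stored as float32 by the converter, even though it is conceptually an int
        "original_max_position_embeddings": floats[3],
        "beta_fast": floats[4],
        "beta_slow": floats[5],
        "rope_scaling_factor": floats[6],
    }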
docs/models.md

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@
 
     Note: Only download `tokenizer.model` and DO NOT download `tokenizer.json` when converting.
 
+* GPT (`GptOssForCausalLM`)
+    * [x] OSS: [20B](https://huggingface.co/openai/gpt-oss-20b/tree/cbf31f62664d4b1360b3a78427f7b3c3ed8f0fa8), [120B](https://huggingface.co/openai/gpt-oss-120b/tree/bc75b44b8a2a116a0e4c6659bcd1b7969885f423)
+
+    Note: Q4_1/Q4_0 quantization won't work. Use Q8 instead.
+
 * Granite (`GraniteForCausalLM`, `GraniteMoeForCausalLM`)
     * [x] v3.0: [Instruct-1B-A400M](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-instruct), [Instruct-3B-A800M](https://huggingface.co/ibm-granite/granite-3.0-3b-a800m-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
     * [x] v3.1: [Instruct-1B-A400M](https://huggingface.co/ibm-granite/granite-3.1-1b-a400m-instruct), [Instruct-3B-A800M](https://huggingface.co/ibm-granite/granite-3.1-3b-a800m-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
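Per the note added to docs/models.md above, GPT-OSS checkpoints should be converted with Q8 rather than Q4_0/Q4_1. A hypothetical invocation of the converter; the flag names and type string here are assumptions, so check `python convert.py --help` for the actual options:

python convert.py -i path/to/gpt-oss-20b -t q8_0 -o gpt-oss-20b.bin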
