-
Couldn't load subscription status.
- Fork 13.4k
Added support for the ArcticForCausalLM. #7020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
71d8bd6
c95013d
c6f15a7
0cffda8
f3d1227
a892571
4ebb52c
9acc3ec
f4421f7
7a5df5f
85263f0
5b2be25
5553226
f93acb5
b53fd29
eb58c4b
a1a5508
3aa20e1
602c80d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -137,6 +137,7 @@ class MODEL_ARCH(IntEnum): | |
| COMMAND_R = auto() | ||
| DBRX = auto() | ||
| OLMO = auto() | ||
| ARCTIC = auto() | ||
|
|
||
|
|
||
| class MODEL_TENSOR(IntEnum): | ||
|
|
@@ -179,6 +180,7 @@ class MODEL_TENSOR(IntEnum): | |
| SSM_A = auto() | ||
| SSM_D = auto() | ||
| SSM_OUT = auto() | ||
| FFN_NORM_EXP = auto() | ||
|
||
|
|
||
|
|
||
| MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { | ||
|
|
@@ -214,6 +216,7 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_ARCH.COMMAND_R: "command-r", | ||
| MODEL_ARCH.DBRX: "dbrx", | ||
| MODEL_ARCH.OLMO: "olmo", | ||
| MODEL_ARCH.ARCTIC: "arctic", | ||
| } | ||
|
|
||
| TENSOR_NAMES: dict[MODEL_TENSOR, str] = { | ||
|
|
@@ -256,6 +259,7 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", | ||
| MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", | ||
| MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", | ||
| MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", | ||
| } | ||
|
|
||
| MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { | ||
|
|
@@ -724,6 +728,27 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.FFN_DOWN, | ||
| MODEL_TENSOR.FFN_UP, | ||
| ], | ||
| MODEL_ARCH.ARCTIC: [ | ||
| MODEL_TENSOR.TOKEN_EMBD, | ||
| MODEL_TENSOR.OUTPUT_NORM, | ||
| MODEL_TENSOR.OUTPUT, | ||
| MODEL_TENSOR.ROPE_FREQS, | ||
| MODEL_TENSOR.ATTN_NORM, | ||
| MODEL_TENSOR.ATTN_Q, | ||
| MODEL_TENSOR.ATTN_K, | ||
| MODEL_TENSOR.ATTN_V, | ||
| MODEL_TENSOR.ATTN_OUT, | ||
| MODEL_TENSOR.ATTN_ROT_EMBD, | ||
| MODEL_TENSOR.FFN_GATE_INP, | ||
| MODEL_TENSOR.FFN_NORM, | ||
| MODEL_TENSOR.FFN_GATE, | ||
| MODEL_TENSOR.FFN_DOWN, | ||
| MODEL_TENSOR.FFN_UP, | ||
| MODEL_TENSOR.FFN_GATE_EXP, | ||
| MODEL_TENSOR.FFN_DOWN_EXP, | ||
| MODEL_TENSOR.FFN_UP_EXP, | ||
| MODEL_TENSOR.FFN_NORM_EXP, | ||
| ], | ||
| # TODO | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -370,6 +370,64 @@ class TensorNameMap: | |
| "model.layers.{bid}.out_proj", | ||
| "backbone.layers.{bid}.mixer.out_proj", | ||
| ), | ||
|
|
||
| } | ||
|
|
||
| # architecture-specific block mappings | ||
| arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { | ||
| MODEL_ARCH.ARCTIC: { | ||
| MODEL_TENSOR.TOKEN_EMBD: ( | ||
| "model.embed_tokens", | ||
| ), | ||
| MODEL_TENSOR.OUTPUT_NORM: ( | ||
| "model.norm", | ||
| ), | ||
| MODEL_TENSOR.OUTPUT: ( | ||
| "lm_head", | ||
| ), | ||
| MODEL_TENSOR.ATTN_NORM: ( | ||
| "model.layers.{bid}.input_layernorm", | ||
| ), | ||
| MODEL_TENSOR.ATTN_Q: ( | ||
| "model.layers.{bid}.self_attn.q_proj", | ||
| ), | ||
| MODEL_TENSOR.ATTN_K: ( | ||
| "model.layers.{bid}.self_attn.k_proj", | ||
| ), | ||
| MODEL_TENSOR.ATTN_V: ( | ||
| "model.layers.{bid}.self_attn.v_proj", | ||
| ), | ||
| MODEL_TENSOR.ATTN_OUT: ( | ||
| "model.layers.{bid}.self_attn.o_proj", | ||
| ), | ||
| MODEL_TENSOR.FFN_GATE_INP: ( | ||
| "model.layers.{bid}.block_sparse_moe.gate", | ||
| ), | ||
| MODEL_TENSOR.FFN_NORM: ( | ||
| "model.layers.{bid}.residual_layernorm", | ||
| ), | ||
| MODEL_TENSOR.FFN_GATE: ( | ||
| "model.layers.{bid}.residual_mlp.w1", | ||
| ), | ||
| MODEL_TENSOR.FFN_DOWN: ( | ||
| "model.layers.{bid}.residual_mlp.w2", | ||
| ), | ||
| MODEL_TENSOR.FFN_UP: ( | ||
| "model.layers.{bid}.residual_mlp.w3", | ||
| ), | ||
| MODEL_TENSOR.FFN_GATE_EXP: ( | ||
| "layers.{bid}.feed_forward.experts.w1", | ||
| ), | ||
| MODEL_TENSOR.FFN_DOWN_EXP: ( | ||
| "layers.{bid}.feed_forward.experts.w2", | ||
| ), | ||
| MODEL_TENSOR.FFN_UP_EXP: ( | ||
| "layers.{bid}.feed_forward.experts.w3", | ||
| ), | ||
| MODEL_TENSOR.FFN_NORM_EXP: ( | ||
| "model.layers.{bid}.post_attention_layernorm", | ||
| ), | ||
| }, | ||
| } | ||
|
|
||
| mapping: dict[str, tuple[MODEL_TENSOR, str]] | ||
|
|
@@ -383,12 +441,16 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): | |
| self.mapping[tensor_name] = (tensor, tensor_name) | ||
| for key in keys: | ||
| self.mapping[key] = (tensor, tensor_name) | ||
| if arch in self.arch_block_mappings_cfg: | ||
| block_mappings = self.arch_block_mappings_cfg[arch] | ||
|
||
| else: | ||
| block_mappings = self.block_mappings_cfg | ||
| for bid in range(n_blocks): | ||
| for tensor, keys in self.block_mappings_cfg.items(): | ||
| for tensor, keys in block_mappings.items(): | ||
| if tensor not in MODEL_TENSORS[arch]: | ||
| continue | ||
| # TODO: make this configurable | ||
| n_experts = 60 | ||
| n_experts = 128 | ||
| for xid in range(n_experts): | ||
| tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) | ||
| self.mapping[tensor_name] = (tensor, tensor_name) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.