@@ -166,6 +166,7 @@ class SSM:
166166 TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
167167 GROUP_COUNT = "{arch}.ssm.group_count"
168168 DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
169+ HEAD_DIM = "{arch}.ssm.head_dim"
169170
170171 class WKV :
171172 HEAD_SIZE = "{arch}.wkv.head_size"
@@ -348,6 +349,7 @@ class MODEL_ARCH(IntEnum):
348349 BAILINGMOE = auto ()
349350 DOTS1 = auto ()
350351 ARCEE = auto ()
352+ FALCON_H1 = auto ()
351353
352354
353355class VISION_PROJECTOR_TYPE (IntEnum ):
@@ -408,6 +410,7 @@ class MODEL_TENSOR(IntEnum):
408410 SSM_D = auto ()
409411 SSM_NORM = auto ()
410412 SSM_OUT = auto ()
413+ SSM_MUP_VEC = auto ()
411414 TIME_MIX_W0 = auto ()
412415 TIME_MIX_W1 = auto ()
413416 TIME_MIX_W2 = auto ()
@@ -633,6 +636,7 @@ class MODEL_TENSOR(IntEnum):
633636 MODEL_ARCH .BAILINGMOE : "bailingmoe" ,
634637 MODEL_ARCH .DOTS1 : "dots1" ,
635638 MODEL_ARCH .ARCEE : "arcee" ,
639+ MODEL_ARCH .FALCON_H1 : "falcon-h1" ,
636640}
637641
638642VISION_PROJECTOR_TYPE_NAMES : dict [VISION_PROJECTOR_TYPE , str ] = {
@@ -670,7 +674,7 @@ class MODEL_TENSOR(IntEnum):
670674 MODEL_TENSOR .FFN_GATE_INP : "blk.{bid}.ffn_gate_inp" ,
671675 MODEL_TENSOR .FFN_GATE_INP_SHEXP : "blk.{bid}.ffn_gate_inp_shexp" ,
672676 MODEL_TENSOR .FFN_NORM : "blk.{bid}.ffn_norm" ,
673- MODEL_TENSOR .FFN_PRE_NORM : "blk.{bid}.ffn_norm " ,
677+ MODEL_TENSOR .FFN_PRE_NORM : "blk.{bid}.ffn_pre_norm" ,
674678 MODEL_TENSOR .FFN_POST_NORM : "blk.{bid}.post_ffw_norm" ,
675679 MODEL_TENSOR .FFN_GATE : "blk.{bid}.ffn_gate" ,
676680 MODEL_TENSOR .FFN_DOWN : "blk.{bid}.ffn_down" ,
@@ -693,6 +697,7 @@ class MODEL_TENSOR(IntEnum):
693697 MODEL_TENSOR .SSM_D : "blk.{bid}.ssm_d" ,
694698 MODEL_TENSOR .SSM_NORM : "blk.{bid}.ssm_norm" ,
695699 MODEL_TENSOR .SSM_OUT : "blk.{bid}.ssm_out" ,
700+ MODEL_TENSOR .SSM_MUP_VEC : "blk.{bid}.ssm_mup_vec" ,
696701 MODEL_TENSOR .TIME_MIX_W0 : "blk.{bid}.time_mix_w0" ,
697702 MODEL_TENSOR .TIME_MIX_W1 : "blk.{bid}.time_mix_w1" ,
698703 MODEL_TENSOR .TIME_MIX_W2 : "blk.{bid}.time_mix_w2" ,
@@ -2174,6 +2179,41 @@ class MODEL_TENSOR(IntEnum):
21742179 MODEL_ARCH .BAILINGMOE : [
21752180 MODEL_TENSOR .ROPE_FREQS ,
21762181 ],
2182+ MODEL_ARCH .FALCON_H1 : [
2183+ # Token embedding
2184+ MODEL_TENSOR .TOKEN_EMBD ,
2185+
2186+ # Input layernorm
2187+ MODEL_TENSOR .ATTN_NORM ,
2188+
2189+ # Attention components
2190+ MODEL_TENSOR .ATTN_Q , # Query projection
2191+ MODEL_TENSOR .ATTN_K , # Key projection
2192+ MODEL_TENSOR .ATTN_V , # Value projection
2193+ MODEL_TENSOR .ATTN_OUT , # Output projection
2194+
2195+ # SSM components (Mamba2 specific)
2196+ MODEL_TENSOR .SSM_MUP_VEC , # Mup vector
2197+ MODEL_TENSOR .SSM_IN , # Input projection for SSM
2198+ MODEL_TENSOR .SSM_CONV1D , # Convolution layer
2199+ MODEL_TENSOR .SSM_DT , # Delta time projection
2200+ MODEL_TENSOR .SSM_A , # A parameter (log form)
2201+ MODEL_TENSOR .SSM_D , # D parameter
2202+ MODEL_TENSOR .SSM_NORM , # Normalization in SSM
2203+ MODEL_TENSOR .SSM_OUT , # Output projection
2204+
2205+ # Pre-feedforward layernorm
2206+ MODEL_TENSOR .FFN_PRE_NORM ,
2207+
2208+ # Feed-forward network components
2209+ MODEL_TENSOR .FFN_GATE , # Gate projection (SwiGLU)
2210+ MODEL_TENSOR .FFN_DOWN , # Down projection
2211+ MODEL_TENSOR .FFN_UP , # Up projection
2212+
2213+ # Final (model-level) normalization and output head
2214+ MODEL_TENSOR .OUTPUT_NORM , # Final layer norm
2215+ MODEL_TENSOR .OUTPUT , # Output projection (lm_head)
2216+ ],
21772217}
21782218
21792219#
0 commit comments