@@ -7792,9 +7792,75 @@ struct llm_build_plamo2 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 }
             } else if (layer_type == "mamba") {
-                // Mamba layer processing - simplified implementation for now
-                // TODO: Implement full mamba layer logic
-                GGML_ASSERT(false && "Mamba layers not yet fully implemented for PLaMo2");
+                // Mamba layer processing
+                const int64_t d_conv  = hparams.ssm_d_conv;
+                const int64_t d_inner = hparams.ssm_d_inner;
+                const int64_t d_state = hparams.ssm_d_state;
+                const int64_t dt_rank = hparams.ssm_dt_rank;
+
+                // Apply linear transformation: n_embd -> 2*d_inner
+                ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, mixer_norm);
+                cb(xz, "ssm_in", il);
+
+                // Split into x and z
+                ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, n_tokens, xz->nb[1], 0);
+                ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, n_tokens, xz->nb[1], d_inner*ggml_element_size(xz));
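+                // Layout of xz along ne0: the first d_inner values per token are x, the next d_inner are z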
+
+                // The simplified PLaMo2 implementation does not cache conv/SSM state,
+                // so a basic convolution over the current tokens is used
+                // Reshape x for convolution: {d_inner, n_tokens} -> {n_tokens, d_inner}
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+
+                // Apply 1D convolution with padding of d_conv - 1
+                // Note: the PLaMo2 conv1d weight is stored as {d_inner, d_conv}; reshape it to {d_conv, d_inner} for ggml_conv_1d
+                ggml_tensor * conv_w = ggml_reshape_2d(ctx0, model.layers[il].ssm_conv1d, d_conv, d_inner);
+                x = ggml_conv_1d(ctx0, conv_w, x, 1, d_conv - 1, 1);
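+                // Note: the reference Mamba block uses a depthwise causal conv1d (one filter per channel);
+                // this single ggml_conv_1d call is part of the simplification and may not reproduce it exactly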
+
+                // Transpose back and apply SiLU
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                x = ggml_silu(ctx0, x);
+                cb(x, "ssm_conv", il);
+
+                // SSM sequence transformation
+                {
+                    // Project x to dt, B, C
+                    ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_bcdt, x);
+                    cb(x_db, "ssm_bcdt", il);
+
+                    // Split into dt, B, C
+                    ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
+                    ggml_tensor * B  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
+                    ggml_tensor * C  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
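+                    // Layout of x_db along ne0: [dt (dt_rank) | B (d_state) | C (d_state)]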
+
+                    // Project dt from dt_rank to d_inner
+                    dt = build_lora_mm(model.layers[il].ssm_dt, dt);
+                    cb(dt, "ssm_dt", il);
+
+                    // Simplified selective scan: the recurrent state is not carried across calls,
+                    // so this approximation may not capture all SSM dynamics
+
+                    // Create a dummy state tensor for ggml_ssm_scan
+                    ggml_tensor * dummy_s = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, d_state, d_inner, 1);
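+                    // Note: dummy_s is a fresh per-graph tensor, so no SSM state persists between calls;
+                    // a full implementation would load and store the per-sequence recurrent state here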
+
+                    // Use ggml_ssm_scan for proper SSM computation
+                    ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, dummy_s, x, dt, model.layers[il].ssm_a, B, C);
+
+                    // Extract the output (first part of y_ssm)
+                    ggml_tensor * y = ggml_view_2d(ctx0, y_ssm, d_inner, n_tokens, y_ssm->nb[1], 0);
+
+                    // Add D parameter contribution
+                    y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
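+                    // Per the Mamba formulation, y = SSM(A, B, C)(x) + D*x, i.e. ssm_d acts as a per-channel skip connection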
+                    x = y;
+                }
+
+                // Gated output
+                x = ggml_mul(ctx0, x, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+                cb(x, "ssm_gate", il);
+
+                // Output projection
+                cur = build_lora_mm(model.layers[il].ssm_out, x);
+                cb(cur, "ssm_out", il);
             } else {
                 // Default to attention-like processing for unknown layer types
                 cur = build_lora_mm(model.layers[il].wqkv, mixer_norm);