
Commit 583a3a3

remove unnecessary conts and merge reshapes
1 parent 21c17b5 commit 583a3a3
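
Two kinds of rewrite recur in the hunks below: ggml_cont calls are dropped outright where the consuming op accepts non-contiguous input (eliminating a copy), and ggml_cont + ggml_reshape_Nd pairs are fused into the single-node ggml_cont_2d/3d/4d helpers, which copy a (possibly non-contiguous) tensor straight into a contiguous buffer of the target shape. A minimal standalone sketch of the fusion, with illustrative sizes and no_alloc=true so only shapes are built (assumes a current ggml.h):

    #include "ggml.h"
    #include <cassert>

    int main() {
        ggml_init_params params = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
        ggml_context * ctx0 = ggml_init(params);

        const int64_t n_altup = 4, n_tokens = 8;
        ggml_tensor * coefs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_altup, n_tokens); // [n_altup, n_tokens]

        // before: materialize a contiguous copy, then reshape the copy (two nodes)
        ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, coefs)); // [n_tokens, n_altup]
        a = ggml_reshape_3d(ctx0, a, 1, n_tokens, n_altup);             // [1, n_tokens, n_altup]

        // after: copy straight into the target shape (one node)
        ggml_tensor * b = ggml_cont_3d(ctx0, ggml_transpose(ctx0, coefs), 1, n_tokens, n_altup);

        assert(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[2] == b->ne[2]);
        ggml_free(ctx0);
        return 0;
    }

Note that the fusion saves a graph node per site, not a copy: reshape is a zero-copy view, so both variants perform exactly one copy. The dropped standalone conts are where actual copies disappear.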

File tree

2 files changed: +37 -38 lines
src/llama-model.cpp
tools/mtmd/clip.cpp

src/llama-model.cpp

Lines changed: 7 additions & 9 deletions
@@ -10864,8 +10864,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
         ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
         all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
         cb(all_coefs, "all_coefs", il);
-        all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
-        all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+        all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+        all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

         innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
         ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -15769,7 +15769,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
        };
    }

-    cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+    cur = ggml_transpose(ctx0, cur);

    cur = build_norm(cur,
            model.tok_norm,
@@ -15789,7 +15789,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
        cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
        cur = ggml_add(ctx0, cur, layer.dw_b);

-        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        cur = ggml_transpose(ctx0, cur);

        cur = build_norm(cur,
                layer.norm,
@@ -15812,7 +15812,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {

    cur = inpL;

-    cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+    cur = ggml_transpose(ctx0, cur);

    cur = build_norm(cur,
            model.output_norm,
@@ -16913,15 +16913,13 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
        cb(zx, "mamba_in_proj", il);
        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
-        zx = ggml_cont(ctx0, zx);
-        zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+        zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
        cb(zx, "mamba_in_proj_out", il);

        // split into z and x
        // => {head_dim * n_heads, n_seq_tokens, n_seqs}
        ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
-        x = ggml_cont(ctx0, x);
-        x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
        // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
        cb(x, "mamba_x_split", il);
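The plamo2 hunk is the one spot where the fused input is a strided ggml_view_4d (the x half of the packed zx tensor) rather than a transpose or permute; ggml_cont_3d materializes the view and flattens the head dimension in the same node. A sketch of that shape flow under illustrative sizes (a plain ggml_reshape_3d would assert here, since the view is not contiguous):

    #include "ggml.h"

    int main() {
        ggml_init_params params = { 16u*1024*1024, nullptr, /*no_alloc*/ true };
        ggml_context * ctx0 = ggml_init(params);

        // illustrative sizes; the real values come from the plamo2 hparams
        const int64_t head_dim = 64, n_heads = 4, n_seq_tokens = 5, n_seqs = 1;
        ggml_tensor * zx = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
                                              head_dim * 2, n_heads, n_seq_tokens, n_seqs);

        // x is the second half of dim 0: keep the parent strides, offset past z
        ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs,
                                       zx->nb[1], zx->nb[2], zx->nb[3],
                                       head_dim * ggml_element_size(zx));

        // one node: copy the strided view out and flatten heads into dim 0
        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);

        ggml_free(ctx0);
        return 0;
    }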

tools/mtmd/clip.cpp

Lines changed: 30 additions & 29 deletions
@@ -508,13 +508,13 @@ struct clip_graph {
        const int patches_per_image = n_patches_x;
        const int kernel_size = hparams.proj_scale_factor;

-        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-        cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+        cur = ggml_transpose(ctx0, cur);
+        cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);

        // doing a pool2d to reduce the number of output tokens
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
-        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        cur = ggml_transpose(ctx0, cur);

        // apply norm before projection
        cur = ggml_rms_norm(ctx0, cur, eps);
@@ -537,13 +537,13 @@ struct clip_graph {
        GGML_ASSERT(scale_factor != 0);
        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            height / scale_factor,
            width / scale_factor,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_3d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            seq / (scale_factor * scale_factor),
            bsz);
@@ -570,13 +570,13 @@ struct clip_graph {

        // unshuffle h
        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

        // unshuffle w
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
-        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3));
+        cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

-        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+        cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);

        // projection
        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
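One detail in this hunk goes beyond dropping a cont: the final flatten switches from ggml_reshape_2d to ggml_cont_2d. With the intermediate copies gone, cur reaches the flatten as a permuted (non-contiguous) view, and reshape in ggml is a zero-copy view that requires a contiguous source, so the flatten is now where the single copy happens. Side by side, reusing the hunk's own names:

    // source already contiguous: reshape is a free view
    cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
    // source is a permuted view: cont_2d does the copy and the flatten together
    cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);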
@@ -715,15 +715,15 @@ struct clip_graph {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-        inp = ggml_reshape_4d(
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-        inp = ggml_reshape_3d(
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }
@@ -988,14 +988,14 @@ struct clip_graph {
        GGML_ASSERT(scale_factor > 0);
        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            height / scale_factor,
            width / scale_factor,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
-        cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            cur->ne[1] * cur->ne[2]);
    }
@@ -1081,14 +1081,14 @@ struct clip_graph {
            n_patches_y,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-        cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches_x / scale_factor,
            n_patches_y / scale_factor,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
-        cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches / scale_factor / scale_factor);
        cb(cur, "pixel_shuffle", -1);
@@ -1321,18 +1321,18 @@ struct clip_graph {
        ggml_tensor * block_1 = nullptr;
        {
            // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
-            mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
-            mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+            mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
+            mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
            // stride = 1, padding = 1, bias is nullptr
            block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

            // layer norm
            // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+            block_1 = ggml_permute(ctx0, block_1, 1, 2, 0, 3);
            // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
            block_1 = ggml_norm(ctx0, block_1, eps);
            block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+            block_1 = ggml_permute(ctx0, block_1, 2, 0, 1, 3);

            // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
            // hardswish
@@ -1376,11 +1376,11 @@ struct clip_graph {

            // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
            // layer norm
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+            block_1 = ggml_permute(ctx0, block_1, 1, 2, 0, 3);
            // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
            block_1 = ggml_norm(ctx0, block_1, eps);
            block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
-            block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+            block_1 = ggml_permute(ctx0, block_1, 2, 0, 1, 3);
            // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
            // hardswish
            ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
@@ -1427,9 +1427,9 @@ struct clip_graph {
        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
        // mlp_2 ne = [2048, 576, 1, 1]
        // // AVG Pool Layer 2*2, strides = 2
-        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
        // mlp_2 ne = [576, 2048, 1, 1]
-        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
        // mlp_2 ne [24, 24, 2048, 1]
        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
        // weight ne = [3, 3, 2048, 1]
@@ -1449,8 +1449,8 @@ struct clip_graph {
        // glm projector
        else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
-            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
-            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
+            embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
            embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
@@ -1750,7 +1750,9 @@ struct clip_graph {
            cur = ggml_cont(ctx0, cur);
            cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
            cur = ggml_transpose(ctx0, cur);
-            cur = ggml_cont(ctx0, cur);
+            if (!model.post_ln_w) {
+                cur = ggml_cont(ctx0, cur);
+            }
            inpL = cur;
        }
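This hunk changes behavior conditionally rather than fusing ops: the copy after the transpose survives only on the path with no post layer norm. A plausible reading (an assumption, not stated in the commit) is that when model.post_ln_w is set, the norm that consumes cur accepts the non-contiguous transpose and emits contiguous output anyway, making the extra copy node redundant:

    // assumption: the post-LN, when present, absorbs the layout fix-up itself
    if (!model.post_ln_w) {
        cur = ggml_cont(ctx0, cur); // no norm follows, so materialize here
    }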

@@ -2005,7 +2007,6 @@ struct clip_graph {
            ggml_row_size(cur->type, n_dim),
            ggml_row_size(cur->type, n_dim*n_head),
            n_dim/2 * ggml_element_size(cur));
-        second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
        second = ggml_rope_ext(
            ctx0,
            second,
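The deleted line here documented its own reason for existing ("copy, because ggml_rope don't play well with non-contiguous tensors"). Removing it implies ggml_rope_ext can now consume the strided half-width view of cur built just above, so the copy node was pure overhead; if rope ever regresses on non-contiguous input, this is the line to restore.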

0 commit comments