From 379bf0891cf0464dbec3cea2026d647e763e05ac Mon Sep 17 00:00:00 2001 From: safranowith Date: Tue, 21 Oct 2025 11:22:33 +0300 Subject: [PATCH 1/7] rebase from master --- docs/ops.md | 8 +- docs/ops/Vulkan.csv | 8 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 91 +++++++++- ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp | 17 ++ .../src/ggml-vulkan/vulkan-shaders/floor.comp | 17 ++ .../src/ggml-vulkan/vulkan-shaders/round.comp | 17 ++ .../src/ggml-vulkan/vulkan-shaders/trunc.comp | 17 ++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 8 + tests/test-backend-ops.cpp | 171 ++++++++++++++++++ 9 files changed, 349 insertions(+), 5 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/floor.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/round.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp diff --git a/docs/ops.md b/docs/ops.md index 3738a4807..c9fd22863 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -22,7 +22,7 @@ Legend: | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | -| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | 🟡 | ❌ | | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ | @@ -42,7 +42,7 @@ Legend: | ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | | EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | | FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | -| FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | 🟡 | ❌ | | GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | | GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | | GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | @@ -84,7 +84,7 @@ Legend: | ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | | ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | 🟡 | ❌ | | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -111,6 +111,6 @@ Legend: | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | -| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | 🟡 | ❌ | | UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | | XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/ops/Vulkan.csv b/docs/ops/Vulkan.csv index 298c2a6cc..f948b5a9e 100644 --- a/docs/ops/Vulkan.csv +++ b/docs/ops/Vulkan.csv @@ -5434,12 +5434,20 @@ "Vulkan0","LOG","type=f16,ne=[10,5,4,3]","support","0","no","Vulkan" "Vulkan0","SIN","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" "Vulkan0","COS","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" +"Vulkan0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" +"Vulkan0","CEIL","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" +"Vulkan0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" +"Vulkan0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan" "Vulkan0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","Vulkan" "Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan" "Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","0","no","Vulkan" 
"Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan" "Vulkan0","SIN","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" "Vulkan0","COS","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" +"Vulkan0","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" +"Vulkan0","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" +"Vulkan0","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" +"Vulkan0","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan" "Vulkan0","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan" "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan" "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan" diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8d1a85c96..2c42bec47 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -622,6 +622,10 @@ struct vk_device_struct { vk_pipeline pipeline_rms_norm_mul_partials_f32; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; + vk_pipeline pipeline_floor_f32; + vk_pipeline pipeline_ceil_f32; + vk_pipeline pipeline_round_f32; + vk_pipeline pipeline_trunc_f32; // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_exp[2]; @@ -3674,6 +3678,11 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_floor_f32, "floor_f32", floor_f32_len, floor_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ceil_f32, "ceil_f32", ceil_f32_len, ceil_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_round_f32, "round_f32", round_f32_len, round_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_trunc_f32, "trunc_f32", trunc_f32_len, trunc_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + #define CREATE_UNARY(name) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -8240,6 +8249,26 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_cos_f32; } return nullptr; + case GGML_OP_FLOOR: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_floor_f32; + } + return nullptr; + case GGML_OP_CEIL: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_ceil_f32; + } + return nullptr; + case GGML_OP_ROUND: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_round_f32; + } + return nullptr; + case GGML_OP_TRUNC: + if (src0->type == GGML_TYPE_F32 && 
dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_trunc_f32; + } + return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; @@ -8631,6 +8660,10 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: return true; default: return false; @@ -9004,6 +9037,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_UNARY: case GGML_OP_GLU: case GGML_OP_CONV_2D_DW: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: { uint32_t ne = ggml_nelements(dst); if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { @@ -9806,6 +9843,22 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun); } +static void ggml_vk_floor(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_FLOOR, vk_op_unary_push_constants_init(src0, dst), dryrun); +} + +static void ggml_vk_ceil(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CEIL, vk_op_unary_push_constants_init(src0, dst), dryrun); +} + +static void ggml_vk_round(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROUND, vk_op_unary_push_constants_init(src0, dst), dryrun); +} + +static void ggml_vk_trunc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TRUNC, vk_op_unary_push_constants_init(src0, dst), dryrun); +} + static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); @@ -11577,6 +11630,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -11644,6 +11701,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_SGD: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: { // These operations all go through ggml_vk_op_f32, so short-circuit and // do the only thing needed for the dryrun. 
@@ -11823,6 +11884,22 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_FLOOR: + ggml_vk_floor(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_CEIL: + ggml_vk_ceil(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_ROUND: + ggml_vk_round(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_TRUNC: + ggml_vk_trunc(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); @@ -13856,6 +13933,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ARGSORT: return op->ne[0] <= max_argsort_cols; @@ -14369,12 +14450,20 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_COS) { tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_FLOOR) { + tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CEIL) { + tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_ROUND) { + tensor_clone = ggml_round(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_TRUNC) { + tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_CLAMP) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); } else if (tensor->op == GGML_OP_PAD) { tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], - tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); + tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); } else if (tensor->op == GGML_OP_REPEAT) { tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); } else if (tensor->op == GGML_OP_REPEAT_BACK) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp new file mode 100644 index 000000000..9652c6c16 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp @@ -0,0 +1,17 @@ +#version 450 + +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(ceil(val)); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp new file mode 100644 index 000000000..a5a3c7e61 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp @@ -0,0 +1,17 @@ +#version 450 + +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(floor(val)); +} diff --git 
a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp
new file mode 100644
index 000000000..40eb115ea
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(round(val));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp
new file mode 100644
index 000000000..492fbad52
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(trunc(val));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index bd178875d..ae1a0e46b 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -781,6 +781,14 @@ void process_shaders() {
 
     string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
+    string_to_spv("floor_f32", "floor.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("ceil_f32", "ceil.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("round_f32", "round.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
     string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 967a53c63..5f0f15670 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3811,6 +3811,177 @@ struct test_cos : public test_case {
     }
 };
 
+// GGML_OP_FLOOR
+struct test_floor : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_floor(ggml_type type = GGML_TYPE_F32,
+               std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_floor(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+// GGML_OP_CEIL
+struct test_ceil : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_ceil(ggml_type type = GGML_TYPE_F32,
+              std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_ceil(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_ROUND
+struct test_round : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_round(ggml_type type = GGML_TYPE_F32,
+               std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_round(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_TRUNC
+struct test_trunc : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_trunc(ggml_type type = GGML_TYPE_F32,
+               std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_trunc(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
 // GGML_OP_CLAMP
 struct test_clamp : public test_case {
     const ggml_type type;

From 82ec3e59759f3d60770201b3faf860fdba5ea353 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Mon, 20 Oct 2025 14:12:26 +0300
Subject: [PATCH 2/7] add tests

---
 tests/test-backend-ops.cpp | 128 ++-----------------------------------
 1 file changed, 4 insertions(+), 124 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5f0f15670..7bfd47d4a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4017,130 +4017,6 @@ struct test_clamp : public test_case {
     }
 };
 
-// GGML_OP_FLOOR
-struct test_floor : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_floor(ggml_type type = GGML_TYPE_F32,
-               std::array<int64_t, 4> ne = {10, 2, 2, 2})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(a);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_floor(ctx, a);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, -10.0f, 10.0f);
-        }
-    }
-};
-
-// GGML_OP_CEIL
-struct test_ceil : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_ceil(ggml_type type = GGML_TYPE_F32,
-              std::array<int64_t, 4> ne = {10, 2, 2, 2})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(a);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_ceil(ctx, a);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, -10.0f, 10.0f);
-        }
-    }
-};
-
-// GGML_OP_ROUND
-struct test_round : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_round(ggml_type type = GGML_TYPE_F32,
-               std::array<int64_t, 4> ne = {10, 2, 2, 2})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(a);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_round(ctx, a);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, -10.0f, 10.0f);
-        }
-    }
-};
-
-// GGML_OP_TRUNC
-struct test_trunc : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_trunc(ggml_type type = GGML_TYPE_F32,
-               std::array<int64_t, 4> ne = {10, 2, 2, 2})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(a);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_trunc(ctx, a);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, -10.0f, 10.0f);
-        }
-    }
-};
-
 // GGML_OP_DIAG_MASK_INF
 struct test_diag_mask_inf : public test_case {
     const ggml_type type;
@@ -7198,6 +7074,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_ceil (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_round (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_trunc (type, {7, 1, 5, 3}));
+<<<<<<< HEAD
+=======
+
+>>>>>>> 9fef9eefb (add tests)
     }
 
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));

From 4af5519a7dfafb1624c94b6e24fa16afda666318 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Tue, 21 Oct 2025 12:08:39 +0300
Subject: [PATCH 3/7] resolved conflict

---
 tests/test-backend-ops.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 7bfd47d4a..9eb3c73e6 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7074,10 +7074,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_ceil (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_round (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_trunc (type, {7, 1, 5, 3}));
-<<<<<<< HEAD
-=======
-
->>>>>>> 9fef9eefb (add tests)
     }
 
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));

From 37a0adc147c31f4238b3a3dee0db450d057f9983 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Tue, 21 Oct 2025 14:14:26 +0300
Subject: [PATCH 4/7] fixed editorconfig checks

---
 .../vulkan-shaders/vulkan-shaders-gen.cpp |  2 +-
 vendor/miniaudio/miniaudio.h              | 22 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index ae1a0e46b..c98576dff 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -788,7 +788,7 @@ void process_shaders() {
     string_to_spv("round_f32", "round.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    
+
     string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
     string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
diff --git a/vendor/miniaudio/miniaudio.h b/vendor/miniaudio/miniaudio.h
index 2f5b9c4ea..08cd624f6 100644
--- a/vendor/miniaudio/miniaudio.h
+++ b/vendor/miniaudio/miniaudio.h
@@ -11582,7 +11582,7 @@ IMPLEMENTATION
     #include  /* select() (used for ma_sleep()). */
     #include  /* For nanosleep() */
-    #include
+    #include
 #endif
 
 /* For fstat(), etc. */
@@ -17608,7 +17608,7 @@ static ma_result ma_thread_create__posix(ma_thread* pThread, ma_thread_priority
         (void)stackSize;    /* Suppress unused parameter warning.
*/ } #endif - + if (scheduler != -1) { int priorityMin = sched_get_priority_min(scheduler); @@ -23047,7 +23047,7 @@ static ma_result ma_context_get_MMDevice__wasapi(ma_context* pContext, ma_device CoInitializeResult = ma_CoInitializeEx(pContext, NULL, MA_COINIT_VALUE); { hr = ma_CoCreateInstance(pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator); - } + } if (CoInitializeResult == S_OK || CoInitializeResult == S_FALSE) { ma_CoUninitialize(pContext); } if (FAILED(hr)) { /* <-- This is checking the call above to ma_CoCreateInstance(). */ @@ -29673,7 +29673,7 @@ static ma_result ma_device_start__alsa(ma_device* pDevice) } if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) { - /* + /* When data is written to the device we wait for the device to get ready to receive data with poll(). In my testing I have observed that poll() can sometimes block forever unless the device is started explicitly with snd_pcm_start() or some data is written with snd_pcm_writei(). @@ -35980,7 +35980,7 @@ static ma_result ma_device_init_internal__coreaudio(ma_context* pContext, ma_dev #endif } - + status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, formatScope, formatElement, &bestFormat, sizeof(bestFormat)); if (status != noErr) { ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit); @@ -39294,7 +39294,7 @@ static void ma_stream_error_callback__aaudio(ma_AAudioStream* pStream, void* pUs (void)error; ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] ERROR CALLBACK: error=%d, AAudioStream_getState()=%d\n", error, ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream)); - + /* When we get an error, we'll assume that the stream is in an erroneous state and needs to be restarted. From the documentation, we cannot do this from the error callback. Therefore we are going to use an event thread for the AAudio backend to do this @@ -39306,13 +39306,13 @@ static void ma_stream_error_callback__aaudio(ma_AAudioStream* pStream, void* pUs else { job = ma_job_init(MA_JOB_TYPE_DEVICE_AAUDIO_REROUTE); job.data.device.aaudio.reroute.pDevice = pDevice; - + if (pStream == pDevice->aaudio.pStreamCapture) { job.data.device.aaudio.reroute.deviceType = ma_device_type_capture; } else { job.data.device.aaudio.reroute.deviceType = ma_device_type_playback; } - + result = ma_device_job_thread_post(&pDevice->pContext->aaudio.jobThread, &job); if (result != MA_SUCCESS) { ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] Device Disconnected. Failed to post job for rerouting.\n"); @@ -39909,7 +39909,7 @@ static ma_result ma_device_reinit__aaudio(ma_device* pDevice, ma_device_type dev /* We got disconnected! Retry a few times, until we find a connected device! */ iAttempt = 0; - while (iAttempt++ < maxAttempts) { + while (iAttempt++ < maxAttempts) { /* Device tearing down? No need to reroute! */ if (ma_atomic_bool32_get(&pDevice->aaudio.isTearingDown)) { result = MA_SUCCESS; /* Caller should continue as normal. 
*/ @@ -40007,7 +40007,7 @@ static ma_result ma_device_reinit__aaudio(ma_device* pDevice, ma_device_type dev break; } } - + return result; } @@ -61679,7 +61679,7 @@ static ma_result ma_default_vfs_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_inf ma_result result; ma_int64 cursor; ma_int64 sizeInBytes; - + result = ma_default_vfs_tell(pVFS, file, &cursor); if (result != MA_SUCCESS) { return result; From bd59d6945470d0d1cd825b4c5636e929b1e654b5 Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 2 Nov 2025 12:35:35 +0200 Subject: [PATCH 5/7] Fix: Use GGML_UNARY_OP_* instead of GGML_OP_* for FLOOR/CEIL/ROUND/TRUNC in Vulkan backend --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 120 ++++++++++++--------------- tests/test-backend-ops.cpp | 48 ----------- 2 files changed, 51 insertions(+), 117 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2c42bec47..0760cf338 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -8249,26 +8249,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_cos_f32; } return nullptr; - case GGML_OP_FLOOR: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_floor_f32; - } - return nullptr; - case GGML_OP_CEIL: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_ceil_f32; - } - return nullptr; - case GGML_OP_ROUND: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_round_f32; - } - return nullptr; - case GGML_OP_TRUNC: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_trunc_f32; - } - return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; @@ -8366,6 +8346,26 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_hardsigmoid[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_HARDSWISH: return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_FLOOR: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_floor_f32; + } + break; + case GGML_UNARY_OP_CEIL: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_ceil_f32; + } + break; + case GGML_UNARY_OP_ROUND: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_round_f32; + } + break; + case GGML_UNARY_OP_TRUNC: + if (dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_trunc_f32; + } + break; default: break; } @@ -8660,10 +8660,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: - case GGML_OP_FLOOR: - case GGML_OP_CEIL: - case GGML_OP_ROUND: - case GGML_OP_TRUNC: return true; default: return false; @@ -9037,10 +9033,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_UNARY: case GGML_OP_GLU: case GGML_OP_CONV_2D_DW: - case GGML_OP_FLOOR: - case GGML_OP_CEIL: - case GGML_OP_ROUND: - case GGML_OP_TRUNC: { uint32_t ne = ggml_nelements(dst); if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { @@ -9844,19 +9836,19 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const } static void ggml_vk_floor(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool 
dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_FLOOR, vk_op_unary_push_constants_init(src0, dst), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_ceil(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CEIL, vk_op_unary_push_constants_init(src0, dst), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_round(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROUND, vk_op_unary_push_constants_init(src0, dst), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_trunc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TRUNC, vk_op_unary_push_constants_init(src0, dst), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst), dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -11541,6 +11533,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_FLOOR: + case GGML_UNARY_OP_CEIL: + case GGML_UNARY_OP_ROUND: + case GGML_UNARY_OP_TRUNC: break; default: return false; @@ -11630,10 +11626,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: - case GGML_OP_FLOOR: - case GGML_OP_CEIL: - case GGML_OP_ROUND: - case GGML_OP_TRUNC: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -11701,10 +11693,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_SGD: - case GGML_OP_FLOOR: - case GGML_OP_CEIL: - case GGML_OP_ROUND: - case GGML_OP_TRUNC: { // These operations all go through ggml_vk_op_f32, so short-circuit and // do the only thing needed for the dryrun. 
@@ -11884,22 +11872,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); - break; - case GGML_OP_FLOOR: - ggml_vk_floor(ctx, compute_ctx, src0, node, dryrun); - - break; - case GGML_OP_CEIL: - ggml_vk_ceil(ctx, compute_ctx, src0, node, dryrun); - - break; - case GGML_OP_ROUND: - ggml_vk_round(ctx, compute_ctx, src0, node, dryrun); - - break; - case GGML_OP_TRUNC: - ggml_vk_trunc(ctx, compute_ctx, src0, node, dryrun); - break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); @@ -11965,6 +11937,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_FLOOR: + case GGML_UNARY_OP_CEIL: + case GGML_UNARY_OP_ROUND: + case GGML_UNARY_OP_TRUNC: ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); break; default: @@ -13933,10 +13909,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: - case GGML_OP_FLOOR: - case GGML_OP_CEIL: - case GGML_OP_ROUND: - case GGML_OP_TRUNC: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ARGSORT: return op->ne[0] <= max_argsort_cols; @@ -14450,20 +14422,30 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_COS) { tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_FLOOR) { - tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_CEIL) { - tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_ROUND) { - tensor_clone = ggml_round(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_TRUNC) { - tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_UNARY) { + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_FLOOR: + tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_CEIL: + tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_ROUND: + tensor_clone = ggml_round(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_TRUNC: + tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); + break; + default: + std::cerr << "Unsupported unary op: " << ggml_unary_op_name(ggml_get_unary_op(tensor)) << std::endl; + GGML_ABORT("fatal error"); + } } else if (tensor->op == GGML_OP_CLAMP) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); } else if (tensor->op == GGML_OP_PAD) { tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], - tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); + tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); } else if (tensor->op == GGML_OP_REPEAT) { tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); } else if (tensor->op == GGML_OP_REPEAT_BACK) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9eb3c73e6..9164d9e42 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3840,18 +3840,6 @@ struct test_floor : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval 
[-2*pi, 2*pi]. } } - - double max_maa_err() override { - return 1e-3; - } - - float grad_eps() override { - return 0.2f; - } - - bool grad_precise() override { - return true; - } }; // GGML_OP_CEIL struct test_ceil : public test_case { @@ -3882,18 +3870,6 @@ struct test_ceil : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } - - double max_maa_err() override { - return 1e-3; - } - - float grad_eps() override { - return 0.2f; - } - - bool grad_precise() override { - return true; - } }; // GGML_OP_ROUND @@ -3925,18 +3901,6 @@ struct test_round : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } - - double max_maa_err() override { - return 1e-3; - } - - float grad_eps() override { - return 0.2f; - } - - bool grad_precise() override { - return true; - } }; // GGML_OP_TRUNC @@ -3968,18 +3932,6 @@ struct test_trunc : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } - - double max_maa_err() override { - return 1e-3; - } - - float grad_eps() override { - return 0.2f; - } - - bool grad_precise() override { - return true; - } }; // GGML_OP_CLAMP From bd2f2c52abfddaa8e2d51e90bc0f616cc3f0f7e4 Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 2 Nov 2025 12:50:57 +0200 Subject: [PATCH 6/7] Revert unrelated miniaudio whitespace changes --- vendor/miniaudio/miniaudio.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vendor/miniaudio/miniaudio.h b/vendor/miniaudio/miniaudio.h index 08cd624f6..2f5b9c4ea 100644 --- a/vendor/miniaudio/miniaudio.h +++ b/vendor/miniaudio/miniaudio.h @@ -11582,7 +11582,7 @@ IMPLEMENTATION #include /* select() (used for ma_sleep()). */ #include /* For nanosleep() */ - #include + #include #endif /* For fstat(), etc. */ @@ -17608,7 +17608,7 @@ static ma_result ma_thread_create__posix(ma_thread* pThread, ma_thread_priority (void)stackSize; /* Suppress unused parameter warning. */ } #endif - + if (scheduler != -1) { int priorityMin = sched_get_priority_min(scheduler); @@ -23047,7 +23047,7 @@ static ma_result ma_context_get_MMDevice__wasapi(ma_context* pContext, ma_device CoInitializeResult = ma_CoInitializeEx(pContext, NULL, MA_COINIT_VALUE); { hr = ma_CoCreateInstance(pContext, &MA_CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &MA_IID_IMMDeviceEnumerator, (void**)&pDeviceEnumerator); - } + } if (CoInitializeResult == S_OK || CoInitializeResult == S_FALSE) { ma_CoUninitialize(pContext); } if (FAILED(hr)) { /* <-- This is checking the call above to ma_CoCreateInstance(). */ @@ -29673,7 +29673,7 @@ static ma_result ma_device_start__alsa(ma_device* pDevice) } if (pDevice->type == ma_device_type_playback || pDevice->type == ma_device_type_duplex) { - /* + /* When data is written to the device we wait for the device to get ready to receive data with poll(). In my testing I have observed that poll() can sometimes block forever unless the device is started explicitly with snd_pcm_start() or some data is written with snd_pcm_writei(). 
@@ -35980,7 +35980,7 @@ static ma_result ma_device_init_internal__coreaudio(ma_context* pContext, ma_dev #endif } - + status = ((ma_AudioUnitSetProperty_proc)pContext->coreaudio.AudioUnitSetProperty)(pData->audioUnit, kAudioUnitProperty_StreamFormat, formatScope, formatElement, &bestFormat, sizeof(bestFormat)); if (status != noErr) { ((ma_AudioComponentInstanceDispose_proc)pContext->coreaudio.AudioComponentInstanceDispose)(pData->audioUnit); @@ -39294,7 +39294,7 @@ static void ma_stream_error_callback__aaudio(ma_AAudioStream* pStream, void* pUs (void)error; ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] ERROR CALLBACK: error=%d, AAudioStream_getState()=%d\n", error, ((MA_PFN_AAudioStream_getState)pDevice->pContext->aaudio.AAudioStream_getState)(pStream)); - + /* When we get an error, we'll assume that the stream is in an erroneous state and needs to be restarted. From the documentation, we cannot do this from the error callback. Therefore we are going to use an event thread for the AAudio backend to do this @@ -39306,13 +39306,13 @@ static void ma_stream_error_callback__aaudio(ma_AAudioStream* pStream, void* pUs else { job = ma_job_init(MA_JOB_TYPE_DEVICE_AAUDIO_REROUTE); job.data.device.aaudio.reroute.pDevice = pDevice; - + if (pStream == pDevice->aaudio.pStreamCapture) { job.data.device.aaudio.reroute.deviceType = ma_device_type_capture; } else { job.data.device.aaudio.reroute.deviceType = ma_device_type_playback; } - + result = ma_device_job_thread_post(&pDevice->pContext->aaudio.jobThread, &job); if (result != MA_SUCCESS) { ma_log_postf(ma_device_get_log(pDevice), MA_LOG_LEVEL_INFO, "[AAudio] Device Disconnected. Failed to post job for rerouting.\n"); @@ -39909,7 +39909,7 @@ static ma_result ma_device_reinit__aaudio(ma_device* pDevice, ma_device_type dev /* We got disconnected! Retry a few times, until we find a connected device! */ iAttempt = 0; - while (iAttempt++ < maxAttempts) { + while (iAttempt++ < maxAttempts) { /* Device tearing down? No need to reroute! */ if (ma_atomic_bool32_get(&pDevice->aaudio.isTearingDown)) { result = MA_SUCCESS; /* Caller should continue as normal. 
*/
@@ -40007,7 +40007,7 @@ static ma_result ma_device_reinit__aaudio(ma_device* pDevice, ma_device_type dev
             break;
         }
     }
-    
+
     return result;
 }
 
@@ -61679,7 +61679,7 @@ static ma_result ma_default_vfs_info(ma_vfs* pVFS, ma_vfs_file file, ma_file_inf
     ma_result result;
     ma_int64 cursor;
     ma_int64 sizeInBytes;
-    
+
     result = ma_default_vfs_tell(pVFS, file, &cursor);
     if (result != MA_SUCCESS) {
         return result;

From cb039abc1cf34634e4f524a85b69e0a3989a6e57 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Sun, 9 Nov 2025 12:05:28 +0200
Subject: [PATCH 7/7] rebased onto master and resolved the conflict

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 175 ++++++++++++----
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   2 +-
 tests/test-backend-ops.cpp                    |  46 +++++
 3 files changed, 186 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 0760cf338..4d6101c87 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8249,6 +8249,26 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_cos_f32;
         }
         return nullptr;
+    case GGML_OP_FLOOR:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_floor_f32;
+        }
+        return nullptr;
+    case GGML_OP_CEIL:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_ceil_f32;
+        }
+        return nullptr;
+    case GGML_OP_ROUND:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_round_f32;
+        }
+        return nullptr;
+    case GGML_OP_TRUNC:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_trunc_f32;
+        }
+        return nullptr;
     case GGML_OP_CLAMP:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_clamp_f32;
@@ -8660,6 +8680,10 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_SUM:
     case GGML_OP_SUM_ROWS:
     case GGML_OP_MEAN:
+    case GGML_OP_FLOOR:
+    case GGML_OP_CEIL:
+    case GGML_OP_ROUND:
+    case GGML_OP_TRUNC:
         return true;
     default:
         return false;
@@ -9033,6 +9057,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_UNARY:
     case GGML_OP_GLU:
     case GGML_OP_CONV_2D_DW:
+    case GGML_OP_FLOOR:
+    case GGML_OP_CEIL:
+    case GGML_OP_ROUND:
+    case GGML_OP_TRUNC:
         {
             uint32_t ne = ggml_nelements(dst);
             if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
@@ -9849,6 +9877,7 @@ static void ggml_vk_round(ggml_backend_vk_context * ctx, vk_context& subctx, con
 
 static void ggml_vk_trunc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst), dryrun);
+
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -11626,6 +11655,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_FLASH_ATTN_EXT:
     case GGML_OP_OPT_STEP_ADAMW:
     case GGML_OP_OPT_STEP_SGD:
+    case GGML_OP_FLOOR:
+    case GGML_OP_CEIL:
+    case GGML_OP_ROUND:
+    case GGML_OP_TRUNC:
         break;
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
@@ -11693,6 +11726,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * 
cgr case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_SGD: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: { // These operations all go through ggml_vk_op_f32, so short-circuit and // do the only thing needed for the dryrun. @@ -11707,7 +11744,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; } } - if (!dryrun) { // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers // to synchronize them. This handles most "normal" synchronization when computing the graph, and when @@ -11872,6 +11908,22 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_COS: ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_FLOOR: + ggml_vk_floor(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_CEIL: + ggml_vk_ceil(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_ROUND: + ggml_vk_round(ctx, compute_ctx, src0, node, dryrun); + + break; + case GGML_OP_TRUNC: + ggml_vk_trunc(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); @@ -13909,6 +13961,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: + case GGML_OP_FLOOR: + case GGML_OP_CEIL: + case GGML_OP_ROUND: + case GGML_OP_TRUNC: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ARGSORT: return op->ne[0] <= max_argsort_cols; @@ -14339,43 +14395,90 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * default: continue; } } - if (srci == nullptr) { - continue; + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(srci, srci_name[i]); } - ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci); - size_t srci_size = ggml_nbytes(srci); - - src_clone[i] = srci_clone; - src_size[i] = ggml_nbytes(srci); - src_buffer[i] = malloc(srci_size); - - srci_clone->data = src_buffer[i]; - if (ggml_backend_buffer_is_host(srci->buffer)) { - memcpy(srci_clone->data, srci->data, srci_size); - memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (ggml_backend_buffer_is_vk(srci->buffer)) { - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; - if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { - for (int i3 = 0; i3 < srci->ne[3]; i3++) { - for (int i2 = 0; i2 < srci->ne[2]; i2++) { - const int idx = i3*srci->ne[2] + i2; - ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); - } - } + } - srci_clone->nb[0] = srci->nb[0]; - srci_clone->nb[1] = srci->nb[1]; - for (int i = 2; i < GGML_MAX_DIMS; i++) { - srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; - } - } else { - if (offset + srci_size >= buffer_gpu->size) { - srci_size = buffer_gpu->size - offset; - } - ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); - memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], 
params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); + } + } else if (tensor->op == GGML_OP_MUL_MAT) { + tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL_MAT_ID) { + tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SUB) { + tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL) { + if (fused_rms_norm_mul) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params); + tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]); + } else { + tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + } + } else if (tensor->op == GGML_OP_DIV) { + tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_CONCAT) { + tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_UPSCALE) { + tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); + } else if (tensor->op == GGML_OP_SCALE) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_SQR) { + tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SQRT) { + tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SIN) { + tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_COS) { + tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_FLOOR) { + tensor_clone = ggml_floor(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CEIL) { + tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_ROUND) { + tensor_clone = ggml_round(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_TRUNC) { + tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CLAMP) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_PAD) { + tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], + tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); + } else if (tensor->op == GGML_OP_REPEAT) { + tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_REPEAT_BACK) { + tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_ADD) { + tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ACC) { + tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); + } else if (tensor->op == GGML_OP_NORM) { + tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_GROUP_NORM) { + const float * float_params = (const float *)tensor->op_params; + tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]); + } else if (tensor->op == 
GGML_OP_RMS_NORM) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_RMS_NORM_BACK) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); + } else if (tensor->op == GGML_OP_SILU_BACK) { + tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_L2_NORM) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); + } else if (tensor->op == GGML_OP_SOFT_MAX) { + if (src1 != nullptr) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); } } else { GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c98576dff..61b1fc72f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -788,7 +788,7 @@ void process_shaders() { string_to_spv("round_f32", "round.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); - + string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9164d9e42..435a8ac31 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3840,6 +3840,17 @@ struct test_floor : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_CEIL struct test_ceil : public test_case { @@ -3870,6 +3881,18 @@ struct test_ceil : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_ROUND @@ -3901,6 +3924,18 @@ struct test_round : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_TRUNC @@ -3932,6 +3967,17 @@ struct test_trunc : public test_case { init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_CLAMP