@@ -457,6 +457,7 @@ struct vk_device_struct {
457457    vk_pipeline pipeline_rwkv_wkv6_f32;
458458    vk_pipeline pipeline_rwkv_wkv7_f32;
459459    vk_pipeline pipeline_opt_step_adamw_f32;
460+     vk_pipeline pipeline_conv2d_f32;
460461    vk_pipeline pipeline_conv2d_dw_whcn_f32;
461462    vk_pipeline pipeline_conv2d_dw_cwhn_f32;
462463
@@ -816,6 +817,38 @@ struct vk_op_rwkv_wkv7_push_constants {
816817    uint32_t H;
817818};
818819
// Push-constant layout for the direct conv2d compute shader.
// Must match the corresponding declaration in the GLSL shader exactly.
// Tensor layouts (ggml order, innermost first):
//   src0 - kernel: [KW, KH, Cin, Cout]
//   src1 - input:  [W,  H,  Cin, N]
//   dst  - result: [OW, OH, Cout, N]
struct vk_op_conv2d_push_constants {
    uint32_t Cout;  // output channels
    uint32_t Cin;   // input channels
    uint32_t N;     // batch size

    uint32_t KW;    // kernel width
    uint32_t KH;    // kernel height
    uint32_t W;     // input width
    uint32_t H;     // input height
    uint32_t OW;    // output width
    uint32_t OH;    // output height

    uint32_t s0;    // stride x
    uint32_t s1;    // stride y
    uint32_t p0;    // padding x
    uint32_t p1;    // padding y
    uint32_t d0;    // dilation x
    uint32_t d1;    // dilation y

    // Kernel (src0) strides in elements, i.e. byte strides divided by nb00.
    uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;

    // Input (src1) strides in elements.
    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;

    // Output (dst) strides in elements.
    uint32_t nb1;
    uint32_t nb2;
    uint32_t nb3;
};
851+ 
819852struct vk_op_conv2d_dw_push_constants {
820853    uint32_t ne;
821854    uint32_t batches;
@@ -916,16 +949,33 @@ class vk_memory_logger {
916949class vk_perf_logger {
917950public:
918951    void print_timings() {
952+         if(timings.empty()){
953+             return;
954+         }        
919955        std::cerr << "----------------\nVulkan Timings:" << std::endl;
920956        for (const auto& t : timings) {
921957            uint64_t total = 0;
922958            for (const auto& time : t.second) {
923959                total += time;
924960            }
925-             std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
961+             std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
962+ 
963+             // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
964+             auto it = flops.find(t.first);
965+             if(it != flops.end() && (it->second).size() == t.second.size()){
966+                 uint64_t total_nflops = 0;
967+                 for(const auto& elem : it->second){
968+                     total_nflops += elem;
969+                 }
970+                 std::cout << " (" << (double(total_nflops)/(1000.0*1000.0*1000.0)) / (double(total)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
971+             }
972+ 
973+ 
974+             std::cerr << std::endl;
926975        }
927976
928977        timings.clear();
978+         flops.clear();        
929979    }
930980
931981    void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -944,12 +994,33 @@ class vk_perf_logger {
944994                name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
945995            }
946996            timings[name].push_back(time);
997+             flops[name].push_back( m*n*(k+(k-1)) );
947998            return;
948999        }
        // For conv2d, record a flop count alongside the timing so print_timings()
        // can report GFLOPS/s for the op.
        if(node->op == GGML_OP_CONV_2D){
            std::string name = ggml_op_name(node->op);
            ggml_tensor * knl = node->src[0];  // kernel: [KW, KH, Cin, Cout]
            uint64_t OW = node->ne[0];
            uint64_t OH = node->ne[1];
            uint64_t N = node->ne[3];
            uint64_t Cout = node->ne[2];
            uint64_t KW = knl->ne[0];
            uint64_t KH = knl->ne[1];
            uint64_t Cin = knl->ne[2];
            // Conv2d as implicit GEMM: KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
            uint64_t size_M = Cout;
            uint64_t size_K = Cin*KW*KH;
            uint64_t size_N = N*OW*OH;
            // Each output element costs size_K multiplies plus size_K-1 additions.
            uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
            flops[name].push_back(n_flops);
            timings[name].push_back(time);
            return;
        }
9491019        timings[ggml_op_name(node->op)].push_back(time);
9501020    }
9511021private:
9521022    std::map<std::string, std::vector<uint64_t>> timings;
1023+     std::map<std::string, std::vector<uint64_t>> flops;
9531024};
9541025
9551026struct ggml_backend_vk_context {
@@ -2806,6 +2877,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
28062877
28072878    ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
28082879
2880+     ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
2881+ 
28092882    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
28102883    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
28112884
@@ -6578,6 +6651,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
65786651            return ctx->device->pipeline_leaky_relu_f32;
65796652        }
65806653        return nullptr;
    case GGML_OP_CONV_2D:
        // Direct conv2d shader supports f32 only, and requires all three tensors
        // to be contiguous (the push constants carry simple element strides).
        if (src0->type == GGML_TYPE_F32 &&
                src1->type == GGML_TYPE_F32 &&
                dst->type == GGML_TYPE_F32 &&
                ggml_is_contiguous(src0) &&
                ggml_is_contiguous(src1) &&
                ggml_is_contiguous(dst)) {
            return ctx->device->pipeline_conv2d_f32;
        }
        return nullptr;
65816664    case GGML_OP_CONV_2D_DW:
65826665        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
65836666            if (ggml_is_contiguous(src1)) {
@@ -6899,6 +6982,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
68996982            const uint32_t OW = dst->ne[0];
69006983            elements = { N * OC * OH * OW, 1, 1};
69016984        } break;
    case GGML_OP_CONV_2D:
        {
            // src0 - kernel:   [KW, KH, Cin, Cout]
            // src1 - input:    [W, H, Cin, N]
            // dst - result:    [OW, OH, Cout, N]

            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
            };
            int64_t W = src1->ne[0];
            int64_t H = src1->ne[1];
            int64_t KW = src0->ne[0];
            int64_t KH = src0->ne[1];
            int64_t Cout = src0->ne[3];
            int64_t N = src1->ne[3];
            // op_params layout: {s0, s1, p0, p1, d0, d1}
            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
            int64_t NPQ = N*OW*OH;

            // The result is a Cout x NPQ matrix; dispatch the full problem size
            // here — the pipeline's workgroup size {BS_K, BS_NPQ, 1} tiles it into
            // (Cout/BS_K, NPQ/BS_NPQ, 1) workgroups.
            elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
        } break;
69027009    case GGML_OP_ADD:
69037010    case GGML_OP_SUB:
69047011    case GGML_OP_DIV:
@@ -7753,6 +7860,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
77537860    }, dryrun);
77547861}
77557862
7863+ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
7864+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
7865+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
7866+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
7867+ 
7868+     GGML_TENSOR_BINARY_OP_LOCALS
7869+ 
7870+     GGML_ASSERT(nb00 == sizeof(float));
7871+     GGML_ASSERT(nb10 == sizeof(float));
7872+     GGML_ASSERT(nb0 == sizeof(float));
7873+ 
7874+     vk_op_conv2d_push_constants p{};
7875+     p.Cout = static_cast<uint32_t>(ne03);
7876+     p.Cin = static_cast<uint32_t>(ne02);
7877+     p.N = static_cast<uint32_t>(ne13);
7878+     
7879+     p.KW = static_cast<uint32_t>(ne00);
7880+     p.KH = static_cast<uint32_t>(ne01);
7881+     p.W = static_cast<uint32_t>(ne10);
7882+     p.H = static_cast<uint32_t>(ne11);
7883+     p.OW = static_cast<uint32_t>(ne0);
7884+     p.OH = static_cast<uint32_t>(ne1);
7885+     
7886+     p.s0 = static_cast<uint32_t>(dst->op_params[0]);
7887+     p.s1 = static_cast<uint32_t>(dst->op_params[1]);
7888+     p.p0 = static_cast<uint32_t>(dst->op_params[2]);
7889+     p.p1 = static_cast<uint32_t>(dst->op_params[3]);
7890+     p.d0 = static_cast<uint32_t>(dst->op_params[4]);
7891+     p.d1 = static_cast<uint32_t>(dst->op_params[5]);
7892+ 
7893+     p.nb01 = static_cast<uint32_t>(nb01/nb00);
7894+     p.nb02 = static_cast<uint32_t>(nb02/nb00);
7895+     p.nb03 = static_cast<uint32_t>(nb03/nb00);
7896+ 
7897+     p.nb11 = static_cast<uint32_t>(nb11/nb10);
7898+     p.nb12 = static_cast<uint32_t>(nb12/nb10);
7899+     p.nb13 = static_cast<uint32_t>(nb13/nb10);
7900+ 
7901+     p.nb1 = static_cast<uint32_t>(nb1 / nb0);
7902+     p.nb2 = static_cast<uint32_t>(nb2 / nb0);
7903+     p.nb3 = static_cast<uint32_t>(nb3 / nb0);
7904+ 
7905+     GGML_ASSERT(ne03 == ne2);
7906+     GGML_ASSERT(ne02 == ne12);
7907+ 
7908+     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
7909+     
7910+ }
7911+ 
77567912static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
77577913    vk_op_conv2d_dw_push_constants p{};
77587914    p.ne = ggml_nelements(dst);
@@ -8799,6 +8955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
87998955    case GGML_OP_TIMESTEP_EMBEDDING:
88008956    case GGML_OP_CONV_TRANSPOSE_1D:
88018957    case GGML_OP_POOL_2D:
8958+     case GGML_OP_CONV_2D:
88028959    case GGML_OP_CONV_2D_DW:
88038960    case GGML_OP_RWKV_WKV6:
88048961    case GGML_OP_RWKV_WKV7:
@@ -8864,6 +9021,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
88649021        case GGML_OP_TIMESTEP_EMBEDDING:
88659022        case GGML_OP_CONV_TRANSPOSE_1D:
88669023        case GGML_OP_POOL_2D:
9024+         case GGML_OP_CONV_2D:
88679025        case GGML_OP_CONV_2D_DW:
88689026        case GGML_OP_LEAKY_RELU:
88699027            {
@@ -9042,6 +9200,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
90429200    case GGML_OP_POOL_2D:
90439201        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
90449202
9203+         break;
9204+     case GGML_OP_CONV_2D:
9205+         ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
9206+ 
90459207        break;
90469208    case GGML_OP_CONV_2D_DW:
90479209        ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9168,6 +9330,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
91689330    case GGML_OP_TIMESTEP_EMBEDDING:
91699331    case GGML_OP_CONV_TRANSPOSE_1D:
91709332    case GGML_OP_POOL_2D:
9333+     case GGML_OP_CONV_2D:
91719334    case GGML_OP_CONV_2D_DW:
91729335    case GGML_OP_RWKV_WKV6:
91739336    case GGML_OP_RWKV_WKV7:
@@ -10242,6 +10405,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1024210405            return true;
1024310406        case GGML_OP_CONV_TRANSPOSE_1D:
1024410407            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
        case GGML_OP_CONV_2D:
            // Channel-contiguous format is not supported yet.
            // Mirrors the conditions checked in ggml_vk_op_get_pipeline: f32 in/out
            // and contiguous kernel, input, and result tensors.
            return (op->src[0]->type == GGML_TYPE_F32 &&
                op->src[1]->type == GGML_TYPE_F32 &&
                op->type == GGML_TYPE_F32 &&
                ggml_is_contiguous(op->src[0]) &&
                ggml_is_contiguous(op->src[1]) &&
                ggml_is_contiguous(op));
1024510416        default:
1024610417            return false;
1024710418    }
@@ -10765,6 +10936,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
1076510936        const int32_t p1 = tensor->op_params[6];
1076610937
1076710938        tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
    } else if (tensor->op == GGML_OP_CONV_2D) {
        // Rebuild the op on the CPU reference backend for result validation.
        // op_params layout: {s0, s1, p0, p1, d0, d1}.
        const int32_t s0 = tensor->op_params[0];
        const int32_t s1 = tensor->op_params[1];
        const int32_t p0 = tensor->op_params[2];
        const int32_t p1 = tensor->op_params[3];
        const int32_t d0 = tensor->op_params[4];
        const int32_t d1 = tensor->op_params[5];
        tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
1076810947    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
1076910948        const float * op_params = (const float *)tensor->op_params;
1077010949        tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
0 commit comments