@@ -1007,37 +1007,41 @@ class vk_memory_logger {
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 class vk_perf_logger {
-public:
+  public:
     void print_timings() {
-        if(timings.empty()){
+        if (timings.empty()) {
             return;
         }
         uint64_t total_all_op_times = 0;
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
-        for (const auto& t : timings) {
+        for (const auto & t : timings) {
             uint64_t total_op_times = 0;
-            for (const auto& time : t.second) {
+            for (const auto & time : t.second) {
                 total_op_times += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) << " us";
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0)
+                      << " us";
 
             // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
             auto it = flops.find(t.first);
-            if(it != flops.end() && (it->second).size() == t.second.size()){
+            if (it != flops.end() && (it->second).size() == t.second.size()) {
                 uint64_t total_op_flops = 0;
-                for(const auto& elem : it->second){
+                for (const auto & elem : it->second) {
                     total_op_flops += elem;
                 }
-                std::cerr << " (" << (double(total_op_flops)/(1000.0*1000.0*1000.0)) / (double(total_op_times)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
+                std::cerr << " ("
+                          << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) /
+                                 (double(total_op_times) / (1000.0 * 1000.0 * 1000.0))
+                          << " GFLOPS/s)";
             }
 
             total_all_op_times += total_op_times;
 
             std::cerr << std::endl;
         }
 
-        if(timings.size() > 0){
-            std::cerr << "Total time: " << total_all_op_times/ 1000.0 << " us." << std::endl;
+        if (timings.size() > 0) {
+            std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl;
         }
 
         timings.clear();
@@ -1050,42 +1054,43 @@ class vk_perf_logger {
             return;
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
-            const uint64_t m = node->src[0]->ne[1];
-            const uint64_t n = node->src[1]->ne[1];
-            const uint64_t k = node->src[1]->ne[0];
-            std::string name = ggml_op_name(node->op);
+            const uint64_t m    = node->src[0]->ne[1];
+            const uint64_t n    = node->src[1]->ne[1];
+            const uint64_t k    = node->src[1]->ne[0];
+            std::string    name = ggml_op_name(node->op);
             if (n == 1) {
                 name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
             } else {
                 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
             }
             timings[name].push_back(time);
-            flops[name].push_back( m*n*(k+(k- 1)) );
+            flops[name].push_back(m * n * (k + (k - 1)));
             return;
         }
-        if(node->op == GGML_OP_CONV_2D){
-            std::string name = ggml_op_name(node->op);
-            ggml_tensor * knl = node->src[0];
-            uint64_t OW = node->ne[0];
-            uint64_t OH = node->ne[1];
-            uint64_t N = node->ne[3];
-            uint64_t Cout = node->ne[2];
-            uint64_t KW = knl->ne[0];
-            uint64_t KH = knl->ne[1];
-            uint64_t Cin = knl->ne[2];
+        if (node->op == GGML_OP_CONV_2D) {
+            std::string   name = ggml_op_name(node->op);
+            ggml_tensor * knl  = node->src[0];
+            uint64_t      OW   = node->ne[0];
+            uint64_t      OH   = node->ne[1];
+            uint64_t      N    = node->ne[3];
+            uint64_t      Cout = node->ne[2];
+            uint64_t      KW   = knl->ne[0];
+            uint64_t      KH   = knl->ne[1];
+            uint64_t      Cin  = knl->ne[2];
             // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
-            uint64_t size_M = Cout;
-            uint64_t size_K = Cin*KW*KH;
-            uint64_t size_N = N*OW*OH;
-            uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
-            name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N);
+            uint64_t size_M  = Cout;
+            uint64_t size_K  = Cin * KW * KH;
+            uint64_t size_N  = N * OW * OH;
+            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
+                    ", N=N*OW*OH=" + std::to_string(size_N);
             flops[name].push_back(n_flops);
             timings[name].push_back(time);
             return;
         }
         timings[ggml_op_name(node->op)].push_back(time);
     }
-private:
+  private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
 };
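Note on the FLOP counts logged above: a dense M x K by K x N multiply needs K multiplies and K - 1 additions per output element, hence the `m * n * (k + (k - 1))` expression, and GGML_OP_CONV_2D is booked as the equivalent GEMM with M = Cout, K = Cin*KW*KH, N = N*OW*OH. A standalone sketch of that accounting (names and values are illustrative, not from the patch):

```cpp
#include <cstdint>
#include <cstdio>

// FLOPs of a dense M x K by K x N matrix multiply: each of the M*N outputs
// takes K multiplies and K - 1 additions, i.e. M * N * (2K - 1).
static uint64_t gemm_flops(uint64_t M, uint64_t N, uint64_t K) {
    return M * N * (K + (K - 1));
}

int main() {
    // Conv2D counted as the equivalent GEMM: M = Cout, K = Cin*KW*KH, N = N_batch*OW*OH.
    // All values below are made up for illustration.
    const uint64_t Cout = 64, Cin = 32, KW = 3, KH = 3;
    const uint64_t N_batch = 1, OW = 112, OH = 112;
    const uint64_t flops   = gemm_flops(Cout, N_batch * OW * OH, Cin * KW * KH);
    const double   time_ns = 250000.0;  // hypothetical measured time in nanoseconds
    // Same normalization as the logger: (flops / 1e9) / (nanoseconds / 1e9) = GFLOPS/s.
    printf("%.2f GFLOPS/s\n", (flops / 1e9) / (time_ns / 1e9));
    return 0;
}
```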
@@ -3035,28 +3040,39 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
     // conv2d
-    uint32_t conv2d_WG_SIZE = 256;
-    uint32_t conv2d_BS_K = 128;
-    uint32_t conv2d_BS_CRS = 16;
-    uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
-    if(device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL){ // Do not enable collectives on Intel, see PR 14316
+    uint32_t conv2d_WG_SIZE  = 256;
+    uint32_t conv2d_BS_K     = 128;
+    uint32_t conv2d_BS_CRS   = 16;
+    uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
+    if (device->subgroup_shuffle &&
+        device->vendor_id != VK_VENDOR_ID_INTEL) {  // Do not enable collectives on Intel, see PR 14316
         use_collectives = 1;
-        conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
+        conv2d_BS_CRS = std::min(
+            device->subgroup_size,
+            conv2d_BS_CRS);  // CRS block size should be capped at sugroup size for correctness when shuffle is used.
     }
     uint32_t conv2d_BS_NPQ = 128;
-    uint32_t conv2d_TS_K = 8;
-    uint32_t conv2d_shmem_req = (conv2d_BS_K*(conv2d_BS_CRS+1) + conv2d_BS_CRS*(conv2d_BS_NPQ+1))*sizeof(float);
-    if(device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req){
+    uint32_t conv2d_TS_K   = 8;
+    uint32_t conv2d_shmem_req =
+        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
+    if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
         conv2d_BS_CRS = 8;
-        if(use_collectives){
+        if (use_collectives) {
             conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
         }
     }
 
-    if(use_collectives){
-        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, true);
-    }else{
-        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, false);
+    if (use_collectives) {
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+    } else {
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+            false);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
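The shared-memory guard in this hunk sizes two staging tiles of BS_K x (BS_CRS + 1) and BS_CRS x (BS_NPQ + 1) floats, and drops BS_CRS to 8 when the device limit is exceeded (re-capped to the subgroup size if collectives are on). A standalone sketch of the same arithmetic, with a made-up device limit:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Default conv2d tile sizes from the hunk above.
    uint32_t BS_K   = 128;
    uint32_t BS_CRS = 16;
    uint32_t BS_NPQ = 128;

    // Hypothetical device limit; the real code reads
    // device->properties.limits.maxComputeSharedMemorySize.
    const uint32_t max_shared = 16384;  // 16 KiB

    // Two shared-memory tiles, each padded by one column.
    uint32_t shmem_req = (BS_K * (BS_CRS + 1) + BS_CRS * (BS_NPQ + 1)) * sizeof(float);
    printf("requested: %u bytes\n", shmem_req);  // (128*17 + 16*129) * 4 = 16960

    if (max_shared < shmem_req) {
        BS_CRS    = 8;  // same fallback the patch applies
        shmem_req = (BS_K * (BS_CRS + 1) + BS_CRS * (BS_NPQ + 1)) * sizeof(float);
        printf("fallback:  %u bytes\n", shmem_req);  // (128*9 + 8*129) * 4 = 8736
    }
    return 0;
}
```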
@@ -6908,12 +6924,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_CONV_2D:
-        if (src0->type == GGML_TYPE_F32 &&
-            src1->type == GGML_TYPE_F32 &&
-            dst->type == GGML_TYPE_F32 &&
-            ggml_is_contiguous(src0) &&
-            ggml_is_contiguous(src1) &&
-            ggml_is_contiguous(dst)) {
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
             return ctx->device->pipeline_conv2d_f32;
         }
         return nullptr;
@@ -7250,19 +7262,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
                 return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
             };
             // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
-            int64_t W = src1->ne[0];
-            int64_t H = src1->ne[1];
-            int64_t KW = src0->ne[0];
-            int64_t KH = src0->ne[1];
+            int64_t W    = src1->ne[0];
+            int64_t H    = src1->ne[1];
+            int64_t KW   = src0->ne[0];
+            int64_t KH   = src0->ne[1];
             int64_t Cout = src0->ne[3];
-            int64_t N = src1->ne[3];
-            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
-            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
-            int64_t NPQ = N*OW* OH;
+            int64_t N    = src1->ne[3];
+            int64_t OH   = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+            int64_t OW   = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+            int64_t NPQ  = N * OW * OH;
 
             // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
-            elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
-        } break;
+            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+        }
+        break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
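The CONV_2D dispatch above computes the output extent with the standard convolution formula, OW = (W + 2*p0 - d0*(KW - 1) - 1) / s0 + 1, and then tiles the Cout x (N*OW*OH) output GEMM. A standalone worked example with assumed parameters:

```cpp
#include <cstdint>
#include <cstdio>

// Same shape as the lambda in the hunk: standard convolution output-size formula.
static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int p, int s, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

int main() {
    // Hypothetical 3x3 convolution over a 224x224 input, stride 1, pad 1, dilation 1.
    const int64_t W = 224, H = 224, KW = 3, KH = 3, Cout = 64, N = 1;
    const int64_t OW  = calc_conv_output_size(W, KW, 1, 1, 1);  // 224
    const int64_t OH  = calc_conv_output_size(H, KH, 1, 1, 1);  // 224
    const int64_t NPQ = N * OW * OH;                            // 50176
    // The dispatch is tiled over the output GEMM: {Cout, NPQ, 1} elements.
    printf("elements = { %lld, %lld, 1 }\n", (long long) Cout, (long long) NPQ);
    return 0;
}
```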
@@ -8129,10 +8142,11 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     }, dryrun);
 }
 
-static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
+                            const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8142,13 +8156,13 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
 
     vk_op_conv2d_push_constants p{};
     p.Cout = static_cast<uint32_t>(ne03);
-    p.Cin = static_cast<uint32_t>(ne02);
-    p.N = static_cast<uint32_t>(ne13);
+    p.Cin  = static_cast<uint32_t>(ne02);
+    p.N    = static_cast<uint32_t>(ne13);
 
     p.KW = static_cast<uint32_t>(ne00);
     p.KH = static_cast<uint32_t>(ne01);
-    p.W = static_cast<uint32_t>(ne10);
-    p.H = static_cast<uint32_t>(ne11);
+    p.W  = static_cast<uint32_t>(ne10);
+    p.H  = static_cast<uint32_t>(ne11);
     p.OW = static_cast<uint32_t>(ne0);
     p.OH = static_cast<uint32_t>(ne1);
 
@@ -8159,13 +8173,13 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     p.d0 = static_cast<uint32_t>(dst->op_params[4]);
     p.d1 = static_cast<uint32_t>(dst->op_params[5]);
 
-    p.nb01 = static_cast<uint32_t>(nb01/ nb00);
-    p.nb02 = static_cast<uint32_t>(nb02/ nb00);
-    p.nb03 = static_cast<uint32_t>(nb03/ nb00);
+    p.nb01 = static_cast<uint32_t>(nb01 / nb00);
+    p.nb02 = static_cast<uint32_t>(nb02 / nb00);
+    p.nb03 = static_cast<uint32_t>(nb03 / nb00);
 
-    p.nb11 = static_cast<uint32_t>(nb11/ nb10);
-    p.nb12 = static_cast<uint32_t>(nb12/ nb10);
-    p.nb13 = static_cast<uint32_t>(nb13/ nb10);
+    p.nb11 = static_cast<uint32_t>(nb11 / nb10);
+    p.nb12 = static_cast<uint32_t>(nb12 / nb10);
+    p.nb13 = static_cast<uint32_t>(nb13 / nb10);
 
     p.nb1 = static_cast<uint32_t>(nb1 / nb0);
     p.nb2 = static_cast<uint32_t>(nb2 / nb0);
@@ -8175,7 +8189,6 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     GGML_ASSERT(ne02 == ne12);
 
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
-
 }
 
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
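One detail behind the nb01/nb00-style divisions in the push constants above: ggml tensor strides are stored in bytes, so dividing by the first-dimension stride turns them into element counts before the shader consumes them. A standalone illustration with assumed extents:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical contiguous f32 kernel tensor with ne = {KW, KH, Cin, Cout} = {3, 3, 32, 64}.
    // ggml stores strides nb[] in bytes: nb[0] = element size, nb[i] = nb[i-1] * ne[i-1] when contiguous.
    const uint64_t ne[4] = { 3, 3, 32, 64 };
    uint64_t       nb[4];
    nb[0] = sizeof(float);
    for (int i = 1; i < 4; ++i) {
        nb[i] = nb[i - 1] * ne[i - 1];
    }
    // Dividing by nb[0] (the nb00 in the patch) converts byte strides into element strides,
    // which is what p.nb01 / p.nb02 / p.nb03 carry to the shader.
    printf("element strides: %llu %llu %llu\n",
           (unsigned long long) (nb[1] / nb[0]),
           (unsigned long long) (nb[2] / nb[0]),
           (unsigned long long) (nb[3] / nb[0]));  // 3 9 288
    return 0;
}
```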
@@ -10231,11 +10244,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
-        }else if(cgraph->nodes[i]->op == GGML_OP_CONV_2D){
+        } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D) {
             // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode.
-            auto CRS_size = cgraph->nodes[i]->src[0]->ne[0]*cgraph->nodes[i]->src[0]->ne[1]*cgraph->nodes[i]->src[0]->ne[2];
-            auto NPQ_size = cgraph->nodes[i]->ne[0]*cgraph->nodes[i]->ne[1]*cgraph->nodes[i]->ne[3];
-            total_mat_mul_bytes += NPQ_size*CRS_size*ggml_type_size(cgraph->nodes[i]->type);
+            auto CRS_size =
+                cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[0]->ne[2];
+            auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3];
+            total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type);
         }
         i += ctx->num_additional_fused_ops;
         ctx->num_additional_fused_ops = 0;
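The CONV_2D branch above keeps the total_mat_mul_bytes heuristic comparable to the im2col to mul_mat path by charging CRS x NPQ elements times the element size. A standalone sketch of that bookkeeping with made-up extents:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Made-up extents: kernel {KW, KH, Cin} = {3, 3, 32}, output {OW, OH, N} = {112, 112, 1}.
    const uint64_t KW = 3, KH = 3, Cin = 32;
    const uint64_t OW = 112, OH = 112, N = 1;
    const uint64_t type_size = 4;  // stands in for ggml_type_size(node->type), f32 here

    const uint64_t CRS = KW * KH * Cin;  // rows of the im2col matrix the direct path avoids
    const uint64_t NPQ = OW * OH * N;    // columns of that matrix
    // Same byte count a mul_mat over an im2col buffer would process; it only feeds the
    // backend's total_mat_mul_bytes heuristic, nothing is allocated from it.
    printf("accounted bytes: %llu\n", (unsigned long long) (CRS * NPQ * type_size));  // 14450688
    return 0;
}
```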