Commit b4acacc

Code formatted with .clang-format.
1 parent 8ddaf2d commit b4acacc

File tree

3 files changed: 327 additions, 291 deletions

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 92 additions & 78 deletions
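The reformatting visible in the hunks below (column limit near 120, 4-space indent, spaces around binary operators and after control-flow keywords, mid-aligned pointers and references, aligned consecutive assignments, spaces inside braced lists) suggests roughly the following .clang-format options. This is a minimal hypothetical sketch inferred from the diff, not the configuration actually shipped in the repository, which may set these keys differently.

# Hypothetical .clang-format sketch consistent with the changes in this diff (assumed values, not the repo's file)
BasedOnStyle: LLVM
ColumnLimit: 120
IndentWidth: 4
PointerAlignment: Middle
SpaceBeforeParens: ControlStatements
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
Cpp11BracedListStyle: false
AccessModifierOffset: -2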
@@ -1007,37 +1007,41 @@ class vk_memory_logger {
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 class vk_perf_logger {
-public:
+  public:
     void print_timings() {
-        if(timings.empty()){
+        if (timings.empty()) {
             return;
         }
         uint64_t total_all_op_times = 0;
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
-        for (const auto& t : timings) {
+        for (const auto & t : timings) {
             uint64_t total_op_times = 0;
-            for (const auto& time : t.second) {
+            for (const auto & time : t.second) {
                 total_op_times += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) << " us";
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0)
+                      << " us";
 
             // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
             auto it = flops.find(t.first);
-            if(it != flops.end() && (it->second).size() == t.second.size()){
+            if (it != flops.end() && (it->second).size() == t.second.size()) {
                 uint64_t total_op_flops = 0;
-                for(const auto& elem : it->second){
+                for (const auto & elem : it->second) {
                     total_op_flops += elem;
                 }
-                std::cerr << " (" << (double(total_op_flops)/(1000.0*1000.0*1000.0)) / (double(total_op_times)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
+                std::cerr << " ("
+                          << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) /
+                                 (double(total_op_times) / (1000.0 * 1000.0 * 1000.0))
+                          << " GFLOPS/s)";
             }
 
             total_all_op_times += total_op_times;
 
             std::cerr << std::endl;
         }
 
-        if(timings.size() > 0){
-            std::cerr << "Total time: " << total_all_op_times/1000.0 << " us." << std::endl;
+        if (timings.size() > 0) {
+            std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl;
         }
 
         timings.clear();
@@ -1050,42 +1054,43 @@ class vk_perf_logger {
             return;
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
-            const uint64_t m = node->src[0]->ne[1];
-            const uint64_t n = node->src[1]->ne[1];
-            const uint64_t k = node->src[1]->ne[0];
-            std::string name = ggml_op_name(node->op);
+            const uint64_t m    = node->src[0]->ne[1];
+            const uint64_t n    = node->src[1]->ne[1];
+            const uint64_t k    = node->src[1]->ne[0];
+            std::string    name = ggml_op_name(node->op);
             if (n == 1) {
                 name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
             } else {
                 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
             }
             timings[name].push_back(time);
-            flops[name].push_back( m*n*(k+(k-1)) );
+            flops[name].push_back(m * n * (k + (k - 1)));
             return;
         }
-        if(node->op == GGML_OP_CONV_2D){
-            std::string name = ggml_op_name(node->op);
-            ggml_tensor * knl = node->src[0];
-            uint64_t OW = node->ne[0];
-            uint64_t OH = node->ne[1];
-            uint64_t N = node->ne[3];
-            uint64_t Cout = node->ne[2];
-            uint64_t KW = knl->ne[0];
-            uint64_t KH = knl->ne[1];
-            uint64_t Cin = knl->ne[2];
+        if (node->op == GGML_OP_CONV_2D) {
+            std::string   name = ggml_op_name(node->op);
+            ggml_tensor * knl  = node->src[0];
+            uint64_t      OW   = node->ne[0];
+            uint64_t      OH   = node->ne[1];
+            uint64_t      N    = node->ne[3];
+            uint64_t      Cout = node->ne[2];
+            uint64_t      KW   = knl->ne[0];
+            uint64_t      KH   = knl->ne[1];
+            uint64_t      Cin  = knl->ne[2];
             // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
-            uint64_t size_M = Cout;
-            uint64_t size_K = Cin*KW*KH;
-            uint64_t size_N = N*OW*OH;
-            uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
-            name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N);
+            uint64_t size_M  = Cout;
+            uint64_t size_K  = Cin * KW * KH;
+            uint64_t size_N  = N * OW * OH;
+            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
+                    ", N=N*OW*OH=" + std::to_string(size_N);
             flops[name].push_back(n_flops);
             timings[name].push_back(time);
             return;
         }
         timings[ggml_op_name(node->op)].push_back(time);
     }
-private:
+  private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
 };
@@ -3035,28 +3040,39 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
     // conv2d
-    uint32_t conv2d_WG_SIZE = 256;
-    uint32_t conv2d_BS_K = 128;
-    uint32_t conv2d_BS_CRS = 16;
-    uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
-    if(device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL){ // Do not enable collectives on Intel, see PR 14316
+    uint32_t conv2d_WG_SIZE  = 256;
+    uint32_t conv2d_BS_K     = 128;
+    uint32_t conv2d_BS_CRS   = 16;
+    uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
+    if (device->subgroup_shuffle &&
+        device->vendor_id != VK_VENDOR_ID_INTEL) {  // Do not enable collectives on Intel, see PR 14316
         use_collectives = 1;
-        conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
+        conv2d_BS_CRS = std::min(
+            device->subgroup_size,
+            conv2d_BS_CRS);  // CRS block size should be capped at sugroup size for correctness when shuffle is used.
     }
     uint32_t conv2d_BS_NPQ = 128;
-    uint32_t conv2d_TS_K = 8;
-    uint32_t conv2d_shmem_req = (conv2d_BS_K*(conv2d_BS_CRS+1) + conv2d_BS_CRS*(conv2d_BS_NPQ+1))*sizeof(float);
-    if(device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req){
+    uint32_t conv2d_TS_K   = 8;
+    uint32_t conv2d_shmem_req =
+        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
+    if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
         conv2d_BS_CRS = 8;
-        if(use_collectives){
+        if (use_collectives) {
             conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
         }
     }
 
-    if(use_collectives){
-        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, true);
-    }else{
-        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, false);
+    if (use_collectives) {
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+    } else {
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+            false);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -6908,12 +6924,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_CONV_2D:
-        if (src0->type == GGML_TYPE_F32 &&
-            src1->type == GGML_TYPE_F32 &&
-            dst->type == GGML_TYPE_F32 &&
-            ggml_is_contiguous(src0) &&
-            ggml_is_contiguous(src1) &&
-            ggml_is_contiguous(dst)) {
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
             return ctx->device->pipeline_conv2d_f32;
         }
         return nullptr;
@@ -7250,19 +7262,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
                 return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
             };
             // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
-            int64_t W = src1->ne[0];
-            int64_t H = src1->ne[1];
-            int64_t KW = src0->ne[0];
-            int64_t KH = src0->ne[1];
+            int64_t W    = src1->ne[0];
+            int64_t H    = src1->ne[1];
+            int64_t KW   = src0->ne[0];
+            int64_t KH   = src0->ne[1];
             int64_t Cout = src0->ne[3];
-            int64_t N = src1->ne[3];
-            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
-            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
-            int64_t NPQ = N*OW*OH;
+            int64_t N    = src1->ne[3];
+            int64_t OH   = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+            int64_t OW   = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+            int64_t NPQ  = N * OW * OH;
 
             // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
-            elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
-        } break;
+            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+        }
+        break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
@@ -8129,10 +8142,11 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     }, dryrun);
 }
 
-static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
+                            const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -8142,13 +8156,13 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
 
     vk_op_conv2d_push_constants p{};
     p.Cout = static_cast<uint32_t>(ne03);
-    p.Cin = static_cast<uint32_t>(ne02);
-    p.N = static_cast<uint32_t>(ne13);
+    p.Cin  = static_cast<uint32_t>(ne02);
+    p.N    = static_cast<uint32_t>(ne13);
 
     p.KW = static_cast<uint32_t>(ne00);
     p.KH = static_cast<uint32_t>(ne01);
-    p.W = static_cast<uint32_t>(ne10);
-    p.H = static_cast<uint32_t>(ne11);
+    p.W  = static_cast<uint32_t>(ne10);
+    p.H  = static_cast<uint32_t>(ne11);
     p.OW = static_cast<uint32_t>(ne0);
     p.OH = static_cast<uint32_t>(ne1);
 
@@ -8159,13 +8173,13 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     p.d0 = static_cast<uint32_t>(dst->op_params[4]);
     p.d1 = static_cast<uint32_t>(dst->op_params[5]);
 
-    p.nb01 = static_cast<uint32_t>(nb01/nb00);
-    p.nb02 = static_cast<uint32_t>(nb02/nb00);
-    p.nb03 = static_cast<uint32_t>(nb03/nb00);
+    p.nb01 = static_cast<uint32_t>(nb01 / nb00);
+    p.nb02 = static_cast<uint32_t>(nb02 / nb00);
+    p.nb03 = static_cast<uint32_t>(nb03 / nb00);
 
-    p.nb11 = static_cast<uint32_t>(nb11/nb10);
-    p.nb12 = static_cast<uint32_t>(nb12/nb10);
-    p.nb13 = static_cast<uint32_t>(nb13/nb10);
+    p.nb11 = static_cast<uint32_t>(nb11 / nb10);
+    p.nb12 = static_cast<uint32_t>(nb12 / nb10);
+    p.nb13 = static_cast<uint32_t>(nb13 / nb10);
 
     p.nb1 = static_cast<uint32_t>(nb1 / nb0);
     p.nb2 = static_cast<uint32_t>(nb2 / nb0);
@@ -8175,7 +8189,6 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     GGML_ASSERT(ne02 == ne12);
 
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
-
 }
 
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -10231,11 +10244,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
-        }else if(cgraph->nodes[i]->op == GGML_OP_CONV_2D){
+        } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D) {
             // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode.
-            auto CRS_size = cgraph->nodes[i]->src[0]->ne[0]*cgraph->nodes[i]->src[0]->ne[1]*cgraph->nodes[i]->src[0]->ne[2];
-            auto NPQ_size = cgraph->nodes[i]->ne[0]*cgraph->nodes[i]->ne[1]*cgraph->nodes[i]->ne[3];
-            total_mat_mul_bytes += NPQ_size*CRS_size*ggml_type_size(cgraph->nodes[i]->type);
+            auto CRS_size =
+                cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[0]->ne[2];
+            auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3];
+            total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type);
         }
         i += ctx->num_additional_fused_ops;
         ctx->num_additional_fused_ops = 0;