@@ -196,6 +196,7 @@ struct vk_device_struct {
196196 vk_pipeline pipeline_pad_f32;
197197 vk_pipeline pipeline_repeat_f32;
198198 vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
199+ vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
199200 vk_pipeline pipeline_norm_f32;
200201 vk_pipeline pipeline_group_norm_f32;
201202 vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
722723 std::lock_guard<std::mutex> guard (compile_count_mutex);
723724 assert (compile_count > 0 );
724725 compile_count--;
726+
727+ // "Progress bar" for shader compiles
728+ static uint32_t total_compile_count = 0;
729+ if ((total_compile_count++ % 10) == 0) {
730+ std::cerr << ".";
731+ }
725732 }
726733 compile_count_cond.notify_all ();
727734}
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
12001207static void ggml_vk_load_shaders (vk_device& device) {
12011208 VK_LOG_DEBUG (" ggml_vk_load_shaders(" << device->name << " )" );
12021209
1210+ std::cerr << "ggml_vulkan: Compiling shaders";
1211+
12031212 // mulmat
12041213 std::initializer_list<uint32_t > warptile_l = { 128 , 128 , 128 , 16 , device->subgroup_size * 2 , 64 , 2 , 4 , 4 , device->subgroup_size };
12051214 std::initializer_list<uint32_t > warptile_m = { 128 , 64 , 64 , 16 , device->subgroup_size , 32 , 2 , 4 , 2 , device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
17591768 ggml_vk_create_pipeline (device, device->pipeline_cpy_f32_f16 , " cpy_f32_f16" , cpy_f32_f16_len, cpy_f32_f16_data, " main" , 2 , sizeof (vk_op_unary_push_constants), {512 , 1 , 1 }, {}, 1 );
17601769 ggml_vk_create_pipeline (device, device->pipeline_cpy_f16_f16 , " cpy_f16_f16" , cpy_f16_f16_len, cpy_f16_f16_data, " main" , 2 , sizeof (vk_op_unary_push_constants), {512 , 1 , 1 }, {}, 1 );
17611770
1771+ ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1772+ ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1773+ ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
1774+
17621775 ggml_vk_create_pipeline (device, device->pipeline_add_f32 , " add_f32" , add_f32_len, add_f32_data, " main" , 3 , sizeof (vk_op_binary_push_constants), {512 , 1 , 1 }, {}, 1 );
17631776 ggml_vk_create_pipeline (device, device->pipeline_add_f16_f32_f16 , " add_f16_f32_f16" , add_f16_f32_f16_len, add_f16_f32_f16_data, " main" , 3 , sizeof (vk_op_binary_push_constants), {512 , 1 , 1 }, {}, 1 );
17641777
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
18171830 for (auto &c : compiles) {
18181831 c.wait ();
18191832 }
1833+ std::cerr << "Done!" << std::endl;
18201834}
18211835
18221836static vk_device ggml_vk_get_device (size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
30613075 tensor->nb [3 ] == tensor->nb [2 ]*tensor->ne [2 ];
30623076}
30633077
3064- static vk_pipeline ggml_vk_get_cpy_pipeline (ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
3065- if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
3066- return ctx->device ->pipeline_cpy_f32_f32 ;
3078+ static vk_pipeline ggml_vk_get_cpy_pipeline (ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
3079+
3080+ // Choose "contiguous copy" shader if src/dst are contiguous
3081+ bool contig = ggml_is_contiguous (src) && (!dst || ggml_is_contiguous (dst));
3082+
3083+ if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
3084+ if (contig) {
3085+ return ctx->device ->pipeline_contig_cpy_f32_f32 ;
3086+ } else {
3087+ return ctx->device ->pipeline_cpy_f32_f32 ;
3088+ }
30673089 }
3068- if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
3069- return ctx->device ->pipeline_cpy_f32_f16 ;
3090+ if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
3091+ if (contig) {
3092+ return ctx->device ->pipeline_contig_cpy_f32_f16 ;
3093+ } else {
3094+ return ctx->device ->pipeline_cpy_f32_f16 ;
3095+ }
30703096 }
3071- if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
3072- return ctx->device ->pipeline_cpy_f16_f16 ;
3097+ if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
3098+ if (contig) {
3099+ return ctx->device ->pipeline_contig_cpy_f16_f16 ;
3100+ } else {
3101+ return ctx->device ->pipeline_cpy_f16_f16 ;
3102+ }
30733103 }
30743104
3075- std::cerr << " Missing CPY op for types: " << ggml_type_name (from ) << " " << ggml_type_name (to) << std::endl;
3105+ std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
30763106 GGML_ABORT (" fatal error" );
30773107}
30783108
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
30823112 const int tensor_type_size = ggml_type_size (tensor->type );
30833113
30843114 const uint32_t ne = ggml_nelements (tensor);
3115+ std::array<uint32_t, 3> elements;
3116+
3117+ if (ne > 262144) {
3118+ elements = { 512, 512, CEIL_DIV(ne, 262144) };
3119+ } else if (ne > 512) {
3120+ elements = { 512, CEIL_DIV(ne, 512), 1 };
3121+ } else {
3122+ elements = { ne, 1, 1 };
3123+ }
30853124
30863125 const vk_op_unary_push_constants pc = {
30873126 (uint32_t )ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
30923131 0.0f, 0.0f,
30923131 };
30933132 ggml_vk_sync_buffers (subctx);
3094- ggml_vk_dispatch_pipeline (ctx, subctx, pipeline, { in, out }, sizeof (vk_op_unary_push_constants), &pc, { ne, 1 , 1 } );
3133+ ggml_vk_dispatch_pipeline (ctx, subctx, pipeline, { in, out }, sizeof (vk_op_unary_push_constants), &pc, elements );
30953134}
30963135
30973136static void ggml_vk_mul_mat_q_f16 (ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false ) {
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
31763215 vk_pipeline to_fp16_vk_1 = nullptr ;
31773216
31783217 if (x_non_contig) {
3179- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0-> type , GGML_TYPE_F16);
3218+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0, nullptr , GGML_TYPE_F16);
31803219 } else {
31813220 to_fp16_vk_0 = ggml_vk_get_to_fp16 (ctx, src0->type );
31823221 }
31833222 if (y_non_contig) {
3184- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1-> type , GGML_TYPE_F16);
3223+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1, nullptr , GGML_TYPE_F16);
31853224 } else {
31863225 to_fp16_vk_1 = ggml_vk_get_to_fp16 (ctx, src1->type );
31873226 }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
33613400 vk_pipeline to_fp16_vk_0 = nullptr ;
33623401 vk_pipeline to_fp16_vk_1 = nullptr ;
33633402 if (x_non_contig) {
3364- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0-> type , src0->type );
3403+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0, nullptr , src0->type );
33653404 }
33663405 if (y_non_contig) {
3367- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1-> type , src1->type );
3406+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1, nullptr , src1->type );
33683407 } else {
33693408 to_fp16_vk_1 = ggml_vk_get_to_fp16 (ctx, src1->type );
33703409 }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
37453784 vk_pipeline to_fp16_vk_1 = nullptr ;
37463785
37473786 if (x_non_contig) {
3748- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0-> type , GGML_TYPE_F16);
3787+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0, nullptr , GGML_TYPE_F16);
37493788 } else {
37503789 to_fp16_vk_0 = ggml_vk_get_to_fp16 (ctx, src0->type );
37513790 }
37523791 if (y_non_contig) {
3753- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1-> type , GGML_TYPE_F16);
3792+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1, nullptr , GGML_TYPE_F16);
37543793 } else {
37553794 to_fp16_vk_1 = ggml_vk_get_to_fp16 (ctx, src1->type );
37563795 }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
39383977 vk_pipeline to_fp16_vk_0 = nullptr ;
39393978 vk_pipeline to_fp16_vk_1 = nullptr ;
39403979 if (x_non_contig) {
3941- to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0-> type , src0->type );
3980+ to_fp16_vk_0 = ggml_vk_get_cpy_pipeline (ctx, src0, nullptr , src0->type );
39423981 }
39433982 if (y_non_contig) {
3944- to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1-> type , src1->type );
3983+ to_fp16_vk_1 = ggml_vk_get_cpy_pipeline (ctx, src1, nullptr , src1->type );
39453984 } else {
39463985 to_fp16_vk_1 = ggml_vk_get_to_fp16 (ctx, src1->type );
39473986 }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
41484187 case GGML_OP_CPY:
41494188 case GGML_OP_CONT:
41504189 case GGML_OP_DUP:
4151- return ggml_vk_get_cpy_pipeline (ctx, src0-> type , dst->type );
4190+ return ggml_vk_get_cpy_pipeline (ctx, src0, dst , dst->type );
41524191 case GGML_OP_NORM:
41534192 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
41544193 return ctx->device ->pipeline_norm_f32 ;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
42814320 case GGML_OP_DIV:
42824321 case GGML_OP_CONCAT:
42834322 case GGML_OP_UPSCALE:
4284- case GGML_OP_SCALE:
42854323 case GGML_OP_SQR:
42864324 case GGML_OP_SIN:
42874325 case GGML_OP_COS: