@@ -196,6 +196,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
     }
     compile_count_cond.notify_all();
 }
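One detail worth noting in the hunk above: the dot is emitted inside the scope of the compile_count_mutex lock_guard, so the function-local static total_compile_count is only ever updated under that lock and dots from concurrent compile threads cannot interleave.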
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
+    std::cerr << "ggml_vulkan: Compiling shaders";
+
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+    std::cerr << "Done!" << std::endl;
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
     }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
     }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
     }
 
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
     GGML_ABORT("fatal error");
 }
 
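For context on why a dedicated contiguous shader is worthwhile: the generic cpy shaders have to rebuild 4-D coordinates from each flat invocation index and apply independent source/destination byte strides, while a contiguous copy is a plain linear load/store. A minimal C++ sketch of that difference follows; it is only an illustration, the real shaders are GLSL and none of these names exist in the repository.

    #include <cstdint>
    #include <cstring>

    // Strided variant: recover 4-D coordinates (ggml's ne/nb convention) from the
    // flat element index, then address source and destination through their own
    // byte strides. Several divisions/modulos per element.
    static void copy_element_strided(const char * src, char * dst, uint32_t i,
                                     const uint32_t ne[4], const uint32_t snb[4],
                                     const uint32_t dnb[4], uint32_t type_size) {
        const uint32_t i0 =  i % ne[0];
        const uint32_t i1 = (i / ne[0]) % ne[1];
        const uint32_t i2 = (i / (ne[0] * ne[1])) % ne[2];
        const uint32_t i3 =  i / (ne[0] * ne[1] * ne[2]);
        std::memcpy(dst + i0*dnb[0] + i1*dnb[1] + i2*dnb[2] + i3*dnb[3],
                    src + i0*snb[0] + i1*snb[1] + i2*snb[2] + i3*snb[3], type_size);
    }

    // Contiguous variant: both tensors are dense, so the flat index is the address
    // and neighbouring invocations touch neighbouring memory.
    static void copy_element_contig(const float * src, float * dst, uint32_t i) {
        dst[i] = src[i];
    }

Per element, the strided form costs index arithmetic and potentially scattered accesses, whereas the contiguous form coalesces trivially, which is presumably what the new contig_cpy_* pipelines exploit.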
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
 
     const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0.0f, 0.0f,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
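Note on the elements shape added above: the copy pipelines use a 512-wide workgroup and Vulkan implementations are only required to support 65535 workgroups per grid dimension, so a flat { ne, 1, 1 } dispatch can overflow the x dimension for very large tensors. Spreading the range over y and z keeps every dimension small; for example ne = 1,000,000 gives { 512, 512, CEIL_DIV(1000000, 262144) } = { 512, 512, 4 }, i.e. 512 * 512 * 4 = 1,048,576 >= ne invocations, assuming (as with the other element-wise ops in this file) that the shader bounds-checks its flat index against the ne push constant.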
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
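Here, and in the analogous mul_mat_vec / mul_mat_id hunks below, the copy produced by ggml_vk_cpy_to_contiguous lands in a temporary dense buffer, so dst is passed as nullptr; ggml_vk_get_cpy_pipeline treats a null dst as contiguous, and the choice between the contig and strided variants then depends only on the source layout.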
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_COS: