@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
411411    uint32_t  ne;
412412    uint32_t  ne00; uint32_t  ne01; uint32_t  ne02; uint32_t  ne03; uint32_t  nb00; uint32_t  nb01; uint32_t  nb02; uint32_t  nb03;
413413    uint32_t  ne10; uint32_t  ne11; uint32_t  ne12; uint32_t  ne13; uint32_t  nb10; uint32_t  nb11; uint32_t  nb12; uint32_t  nb13;
414-     uint32_t  d_offset ;
414+     uint32_t  misalign_offsets ;
415415    float  param1; float  param2;
416416    uint32_t  ne0_012mp; uint32_t  ne0_012L;
417417    uint32_t  ne0_01mp;  uint32_t  ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
459459    uint32_t  ne00; uint32_t  ne01; uint32_t  ne02; uint32_t  ne03; uint32_t  nb00; uint32_t  nb01; uint32_t  nb02; uint32_t  nb03;
460460    uint32_t  ne10; uint32_t  ne11; uint32_t  ne12; uint32_t  ne13; uint32_t  nb10; uint32_t  nb11; uint32_t  nb12; uint32_t  nb13;
461461    uint32_t  ne20; uint32_t  ne21; uint32_t  ne22; uint32_t  ne23; uint32_t  nb20; uint32_t  nb21; uint32_t  nb22; uint32_t  nb23;
462-     uint32_t  d_offset ;
462+     uint32_t  misalign_offsets ;
463463    float  param1; float  param2; int32_t  param3;
464464};
465465
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
546546};
547547
548548struct  vk_op_upscale_push_constants  {
549-     uint32_t  ne; uint32_t  d_offset;
549+     uint32_t  ne; uint32_t  a_offset;  uint32_t   d_offset;
550550    uint32_t  nb00; uint32_t  nb01; uint32_t  nb02; uint32_t  nb03;
551551    uint32_t  ne10; uint32_t  ne11; uint32_t  ne12; uint32_t  ne13;
552552    float  sf0; float  sf1; float  sf2; float  sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
50765076    }
50775077}
50785078
5079+ static  uint32_t  get_misalign_bytes (ggml_backend_vk_context * ctx, const  ggml_tensor * t)
5080+ {
5081+     return  ((vk_tensor_offset (t) + t->view_offs ) & (ctx->device ->properties .limits .minStorageBufferOffsetAlignment  - 1 ));;
5082+ }
5083+ 
5084+ template  <typename  T> void  init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, T &p, const  ggml_tensor * src0, const  ggml_tensor * src1, const  ggml_tensor * src2, ggml_tensor * dst) {
5085+     GGML_UNUSED (p);
5086+     GGML_UNUSED (src0);
5087+     GGML_UNUSED (src1);
5088+     GGML_UNUSED (src2);
5089+     GGML_UNUSED (dst);
5090+     static_assert (!std::is_const<T>::value, " unexpected type"  );
5091+     GGML_ASSERT (!src0 || get_misalign_bytes (ctx, src0) == 0 );
5092+     GGML_ASSERT (!src1 || get_misalign_bytes (ctx, src1) == 0 );
5093+     GGML_ASSERT (!src2 || get_misalign_bytes (ctx, src2) == 0 );
5094+     GGML_ASSERT (!dst  || get_misalign_bytes (ctx, dst) == 0 );
5095+ }
5096+ 
5097+ template  <> void  init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const  ggml_tensor * src0, const  ggml_tensor * src1, const  ggml_tensor * src2, ggml_tensor * dst) {
5098+     const  uint32_t  a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5099+     const  uint32_t  d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5100+ 
5101+     p.misalign_offsets  = (a_offset << 16 ) | d_offset;
5102+ 
5103+     GGML_UNUSED (src1);
5104+     GGML_UNUSED (src2);
5105+ }
5106+ 
5107+ template  <> void  init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const  ggml_tensor * src0, const  ggml_tensor * src1, const  ggml_tensor * src2, ggml_tensor * dst) {
5108+     const  uint32_t  a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5109+     const  uint32_t  b_offset = get_misalign_bytes (ctx, src1) / ggml_type_size (src1->type );
5110+     const  uint32_t  d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5111+ 
5112+     GGML_ASSERT (dst->op  != GGML_OP_GET_ROWS || (a_offset == 0  && b_offset == 0  && d_offset == 0 ));
5113+ 
5114+     p.misalign_offsets  = (a_offset << 16 ) | (b_offset << 8 ) | d_offset;
5115+ 
5116+     GGML_UNUSED (src2);
5117+ }
5118+ 
5119+ template  <> void  init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const  ggml_tensor * src0, const  ggml_tensor * src1, const  ggml_tensor * src2, ggml_tensor * dst) {
5120+     const  uint32_t  a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5121+     const  uint32_t  d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5122+ 
5123+     p.a_offset  = a_offset;
5124+     p.d_offset  = d_offset;
5125+ 
5126+     GGML_UNUSED (src1);
5127+     GGML_UNUSED (src2);
5128+ }
5129+ 
50795130template <typename  PC>
50805131static  void  ggml_vk_op_f32 (ggml_backend_vk_context * ctx, vk_context& subctx, const  ggml_tensor * src0, const  ggml_tensor * src1, const  ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool  dryrun = false ) {
50815132    VK_LOG_DEBUG (" ggml_vk_op_f32(("   << src0 << " , name="   << src0->name  << " , type="   << src0->type  << " , ne0="   << src0->ne [0 ] << " , ne1="   << src0->ne [1 ] << " , ne2="   << src0->ne [2 ] << " , ne3="   << src0->ne [3 ] << " , nb0="   << src0->nb [0 ] << " , nb1="   << src0->nb [1 ] << " , nb2="   << src0->nb [2 ] << " , nb3="   << src0->nb [3 ];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
51795230    }
51805231
51815232    GGML_ASSERT (d_D != nullptr );
5182-     uint64_t  d_buf_offset = ((vk_tensor_offset (dst) + dst->view_offs ) / ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) * ctx->device ->properties .limits .minStorageBufferOffsetAlignment ;
5183-     GGML_ASSERT (d_buf_offset == vk_tensor_offset (dst) || op == GGML_OP_CPY);  //  NOLINT
5233+     uint64_t  d_buf_offset = vk_tensor_offset (dst) + dst->view_offs ;
51845234    if (!src0_uma) {
51855235        d_X = src0_buf_ctx->dev_buffer ;
51865236        x_buf_offset = vk_tensor_offset (src0) + src0->view_offs ;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
51965246        z_buf_offset = vk_tensor_offset (src2) + src2->view_offs ;
51975247        GGML_ASSERT (d_Z != nullptr );
51985248    }
5249+     //  Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
5250+     init_pushconst_tensor_offsets (ctx, pc, src0, src1, src2, dst);
5251+     x_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment  - 1 );
5252+     y_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment  - 1 );
5253+     z_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment  - 1 );
5254+     d_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment  - 1 );
51995255
52005256    if  (op_supports_incontiguous) {
52015257        x_sz = ggml_nbytes (src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
53835439    const  uint32_t  src0_type_size = ggml_type_size (src0->type );
53845440    const  uint32_t  src1_type_size = ggml_type_size (src1->type );
53855441    const  uint32_t  dst_type_size = ggml_type_size (dst->type );
5386-     const  uint32_t  d_offset = ((vk_tensor_offset (dst) + dst->view_offs ) % ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) / dst_type_size;
53875442
53885443    int  nb1 = dst->op_params [0 ] / 4 ; //  4 bytes of float32
53895444    int  nb2 = dst->op_params [1 ] / 4 ; //  4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
53955450        (uint32_t )src0->ne [0 ], (uint32_t )src0->ne [1 ], (uint32_t )src0->ne [2 ],(uint32_t )src0->ne [3 ], (uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )nb1, (uint32_t )nb2, (uint32_t )src0->nb [3 ] / src0_type_size,
53965451        (uint32_t )src1->ne [0 ], (uint32_t )src1->ne [1 ], (uint32_t )src1->ne [2 ],(uint32_t )src1->ne [3 ], (uint32_t )src1->nb [0 ] / src1_type_size, (uint32_t )src1->nb [1 ] / src1_type_size, (uint32_t )src1->nb [2 ] / src1_type_size, (uint32_t )src1->nb [3 ] / src1_type_size,
53975452        (uint32_t ) dst->ne [0 ], (uint32_t ) dst->ne [1 ], (uint32_t ) dst->ne [2 ],(uint32_t ) dst->ne [3 ], (uint32_t ) dst->nb [0 ] /  dst_type_size, (uint32_t )nb1, (uint32_t )nb2, (uint32_t ) dst->nb [3 ] /  dst_type_size,
5398-         d_offset ,
5453+         0 ,
53995454        0 .0f , 0 .0f , offset,
54005455    }, dryrun);
54015456}
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
55995654    const  float  sf3 = (float )dst->ne [3 ] / src0->ne [3 ];
56005655
56015656    ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr , nullptr , dst, GGML_OP_UPSCALE, {
5602-         (uint32_t )ggml_nelements (dst), 0 ,
5657+         (uint32_t )ggml_nelements (dst), 0 ,  0 , 
56035658        (uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )src0->nb [1 ] / src0_type_size, (uint32_t )src0->nb [2 ] / src0_type_size, (uint32_t )src0->nb [3 ] / src0_type_size,
56045659        (uint32_t )dst->ne [0 ], (uint32_t )dst->ne [1 ], (uint32_t )dst->ne [2 ],(uint32_t )dst->ne [3 ],
56055660        sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
57095764static  void  ggml_vk_cpy (ggml_backend_vk_context * ctx, vk_context& subctx, const  ggml_tensor * src0, ggml_tensor * dst, bool  dryrun = false ) {
57105765    const  uint32_t  src0_type_size = ggml_type_size (src0->type );
57115766    const  uint32_t  dst_type_size = ggml_type_size (dst->type );
5712-     const  uint32_t  d_offset = ((vk_tensor_offset (dst) + dst->view_offs ) % ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) / dst_type_size;
57135767
57145768    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr , nullptr , dst, GGML_OP_CPY, {
57155769        (uint32_t )ggml_nelements (src0),
57165770        (uint32_t )src0->ne [0 ], (uint32_t )src0->ne [1 ], (uint32_t )src0->ne [2 ], (uint32_t )src0->ne [3 ], (uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )src0->nb [1 ] / src0_type_size, (uint32_t )src0->nb [2 ] / src0_type_size, (uint32_t )src0->nb [3 ] / src0_type_size,
57175771        (uint32_t ) dst->ne [0 ], (uint32_t ) dst->ne [1 ], (uint32_t ) dst->ne [2 ], (uint32_t ) dst->ne [3 ], (uint32_t ) dst->nb [0 ] /  dst_type_size, (uint32_t ) dst->nb [1 ] /  dst_type_size, (uint32_t ) dst->nb [2 ] /  dst_type_size, (uint32_t ) dst->nb [3 ] /  dst_type_size,
5718-         d_offset ,
5772+         0 ,
57195773        0 .0f , 0 .0f ,
57205774        0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
57215775    }, dryrun);
0 commit comments