@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
     uint32_t ne;
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
-    uint32_t d_offset;
+    uint32_t misalign_offsets;
     float param1; float param2;
     uint32_t ne0_012mp; uint32_t ne0_012L;
     uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
-    uint32_t d_offset;
+    uint32_t misalign_offsets;
     float param1; float param2; int32_t param3;
 };
 
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
 };
 
 struct vk_op_upscale_push_constants {
-    uint32_t ne; uint32_t d_offset;
+    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
     uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
     float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     }
 }
 
+static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
+{
+    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
+}
+
+template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    GGML_UNUSED(p);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+    GGML_UNUSED(dst);
+    static_assert(!std::is_const<T>::value, "unexpected type");
+    GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
+    GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
+    GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
+    GGML_ASSERT(!dst  || get_misalign_bytes(ctx, dst)  == 0);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst)  / ggml_type_size(dst->type);
+
+    p.misalign_offsets = (a_offset << 16) | d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst)  / ggml_type_size(dst->type);
+
+    GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
+
+    p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
+
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst)  / ggml_type_size(dst->type);
+
+    p.a_offset = a_offset;
+    p.d_offset = d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
 template <typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
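For context (not part of the patch): the packed misalign_offsets word works because each per-tensor misalignment is strictly smaller than minStorageBufferOffsetAlignment, which the Vulkan limits cap at 256 bytes, so the element offsets fit comfortably in 16/8/8 bits. A consumer, such as the corresponding compute shader, would recover them roughly as in the sketch below; the helper names are hypothetical, not taken from the patch.

    // Hypothetical decode helpers: the inverse of the packing done in the specializations above.
    static inline uint32_t unary_a_offset(uint32_t m)  { return m >> 16; }          // src0, bits 31..16
    static inline uint32_t unary_d_offset(uint32_t m)  { return m & 0xFFFF; }       // dst,  bits 15..0
    static inline uint32_t binary_a_offset(uint32_t m) { return m >> 16; }          // src0, bits 31..16
    static inline uint32_t binary_b_offset(uint32_t m) { return (m >> 8) & 0xFF; }  // src1, bits 15..8
    static inline uint32_t binary_d_offset(uint32_t m) { return m & 0xFF; }         // dst,  bits 7..0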
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
-    GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
+    uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     if (!src0_uma) {
         d_X = src0_buf_ctx->dev_buffer;
         x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
+    // Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
+    init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
+    x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
 
     if (op_supports_incontiguous) {
         x_sz = ggml_nbytes(src0);
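Because minStorageBufferOffsetAlignment is required to be a power of two, the masks above split each byte offset exactly into an aligned base for the descriptor plus the remainder that get_misalign_bytes() reports. A worked example with assumed values:

    // Assumed for illustration: alignment = 64 bytes, tensor data starting 4128 bytes into its buffer.
    const uint64_t align      = 64;                         // minStorageBufferOffsetAlignment (power of two)
    const uint64_t buf_offset = 4128;                       // vk_tensor_offset(t) + t->view_offs
    const uint64_t base       = buf_offset & ~(align - 1);  // 4096: aligned offset bound in the descriptor
    const uint64_t misalign   = buf_offset &  (align - 1);  //   32: what get_misalign_bytes() returns
    // base + misalign == buf_offset; misalign / ggml_type_size(t->type) is the element offset placed in the push constants.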
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
-        d_offset,
+        0,
         0.0f, 0.0f, offset,
     }, dryrun);
 }
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
     const float sf3 = (float)dst->ne[3] / src0->ne[3];
 
     ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
-        (uint32_t)ggml_nelements(dst), 0,
+        (uint32_t)ggml_nelements(dst), 0, 0,
         (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
         sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        d_offset,
+        0,
         0.0f, 0.0f,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     }, dryrun);
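Net effect for the callers (illustrative numbers, not from the patch): ggml_vk_acc and ggml_vk_cpy no longer derive a destination offset themselves; they pass 0 and ggml_vk_op_f32 overwrites that slot via init_pushconst_tensor_offsets. For example, a CPY into an F16 view that starts 2576 bytes into its buffer, on a device with a 256-byte alignment requirement and an aligned src0, would work out as:

    // get_misalign_bytes(ctx, dst)  == 2576 & (256 - 1)                 == 16 bytes
    // d_offset                      == 16 / ggml_type_size(GGML_TYPE_F16) == 8 elements
    // pc.misalign_offsets           == (0 << 16) | 8                    == 8    (unary packing: src0 high, dst low)
    // d_buf_offset                  == 2576 & ~(256 - 1)                == 2560 (aligned descriptor offset)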