@@ -765,11 +765,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
765765#endif
766766 backend_ctx->program_cvt =
767767 build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
768- // q4_0
768+
769769 CL_CHECK ((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_q4_0_noshuffle" , &err), err));
770770 CL_CHECK ((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_q4_0" , &err), err));
771771 CL_CHECK ((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_q4_0" , &err), err));
772- // mxfp4
773772 CL_CHECK ((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_mxfp4" , &err), err));
774773 CL_CHECK ((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_mxfp4" , &err), err));
775774 GGML_LOG_CONT (" ." );
@@ -2430,7 +2429,6 @@ struct ggml_tensor_extra_cl_q4_0 {
24302429 }
24312430};
24322431
2433- // Additional tensor extra structs for mxfp4 tensors.
24342432struct ggml_tensor_extra_cl_mxfp4 {
24352433 // Quantized values.
24362434 cl_mem q = nullptr ;
@@ -3403,7 +3401,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
34033401
34043402 return ;
34053403
3406- } else if (tensor->type == GGML_TYPE_MXFP4) {
3404+ }
3405+ if (tensor->type == GGML_TYPE_MXFP4) {
34073406 ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra ;
34083407 GGML_ASSERT (extra_orig && " Tesnors in OpenCL backend should have been allocated and initialized" );
34093408
@@ -3423,27 +3422,12 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
34233422 queue, data_device, CL_TRUE, 0 ,
34243423 ggml_nbytes (tensor), data, 0 , NULL , NULL ));
34253424
3426- // We consider the specified offset arg as always, although For weights
3427- // the offset arg should be 0 (we do not assert this).
3428- // GGML_ASSERT(offset == 0);
3429-
3430- // We create subbuffers from the original tensor buffer for scales and
3431- // quants - i.e., scales and quants are aliases into the buffer obejct
3432- // that backs the original tensor. This is a cleaner way to adapt to the
3433- // new memory management.
3434- // In the old code, we allocate new buffers for scales and quants
3435- // respectively, which could still be done but would result in double
3436- // allocation; properly deallocating the preallocated buffer that backs
3437- // the tensors is tricky and would leak the backend specific information
3438- // into the general backend code.
3439- // Does this create misaligned subbuffers (alignment is 1024) in certain
3440- // cases ?
3441- cl_buffer_region region;
3442-
34433425 // The original tensor memory is divided into scales and quants, i.e.,
34443426 // we first store scales, then quants.
3427+ cl_buffer_region region;
3428+
34453429 // Create subbuffer for scales.
3446- region.origin = extra_orig-> offset + tensor-> view_offs + offset; // align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3430+ region.origin = align_to (extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment );
34473431 region.size = size_e;
34483432 extra->e = clCreateSubBuffer (
34493433 extra_orig->data_device , CL_MEM_READ_WRITE,
@@ -3452,7 +3436,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
34523436 auto previous_origin = region.origin ;
34533437
34543438 // Create subbuffer for quants.
3455- region.origin = previous_origin + size_e; // align_to(previous_origin + size_e, backend_ctx->alignment);
3439+ region.origin = align_to (previous_origin + size_e, backend_ctx->alignment );
34563440 region.size = size_q;
34573441 extra->q = clCreateSubBuffer (
34583442 extra_orig->data_device , CL_MEM_READ_WRITE,
@@ -3475,7 +3459,12 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
34753459
34763460 // Create image for Q
34773461 cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
3478- cl_image_desc img_desc_q = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast <size_t >(tensor->ne [0 ] * tensor->ne [1 ] * tensor->ne [2 ] / 32 * 2 ), 0 ,0 ,0 ,0 ,0 ,0 ,0 , extra->q };
3462+ cl_image_desc img_desc_q = {
3463+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
3464+ static_cast <size_t >(tensor->ne [0 ] * tensor->ne [1 ] * tensor->ne [2 ] / 32 * 2 ),
3465+ 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
3466+ { extra->q }
3467+ };
34793468 extra->q_img = clCreateImage (context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL , &err);
34803469
34813470 tensor->extra = extra;
@@ -6275,8 +6264,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
62756264 cl_ulong offset1 = extra1->offset + src1->view_offs ;
62766265 cl_ulong offsetd = extrad->offset + dst->view_offs ;
62776266
6267+ #ifdef GGML_OPENCL_SOA_Q
62786268 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra ;
62796269 ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra ;
6270+ #endif
62806271
62816272 const int ne00 = src0 ? src0->ne [0 ] : 0 ;
62826273 const int ne01 = src0 ? src0->ne [1 ] : 0 ;
@@ -7112,8 +7103,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
71127103 cl_ulong offset2 = extra2->offset + src2->view_offs ;
71137104 cl_ulong offsetd = extrad->offset + dst->view_offs ;
71147105
7106+ #ifdef GGML_OPENCL_SOA_Q
71157107 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra ;
71167108 ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra ;
7109+ #endif
71177110
71187111 const int ne00 = src0->ne [0 ];
71197112 const int ne01 = src0->ne [1 ];
@@ -7171,7 +7164,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
71717164 } else {
71727165 GGML_ASSERT (false && " TODO: Unknown GPU" );
71737166 }
7174-
7167+
71757168 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0_q4_0->q ));
71767169 CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &extra0_q4_0->d ));
71777170 CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
0 commit comments