@@ -67,12 +67,20 @@ inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_
6767
6868 default :
6969 {
70+ // For 4D tensors, GGML uses NCHW layout. However, because zDNN
71+ // automatically transforms everything to NHWC, we will use it
72+ // directly to avoid the performance penalty of changing the
73+ // layout and reshaping the tensor.
7074 zdnn_init_pre_transformed_desc (
7175 ZDNN_NHWC,
7276 ggml_zdnn_type_mapping (tensor->type ),
7377 &buffer->pre_tfm_desc ,
7478 tensor->ne [3 ], tensor->ne [2 ], tensor->ne [1 ], tensor->ne [0 ]
7579 );
80+
81+ // TODO: Consider adding a ggml check.
82+ // TODO: If tensor = 4D, use ZDNN_NCHW by default.
83+ // TODO: If tensor = 2D, use ZDNN_NHWC by default.
7684 } break ;
7785 }
7886
@@ -108,11 +116,8 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
108116 ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra ;
109117 ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra ;
110118
111- zdnn_tensor_desc ptd_weights, td_weights;
112- zdnn_tensor_desc ptd_inputs, td_inputs;
113- zdnn_tensor_desc ptd_bias, td_bias;
114- zdnn_tensor_desc ptd_output, td_output;
115- zdnn_ztensor zt_weights, zt_inputs, zt_bias, zt_output;
119+ zdnn_tensor_desc ptd_bias, td_bias;
120+ zdnn_ztensor zt_bias;
116121
117122 const int64_t weights_rows = ne01;
118123 const int64_t weights_cols = ne00;
@@ -129,8 +134,7 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
129134 const int64_t bias_dim [GGML_MAX_DIMS] = { 1 , 1 , 1 , output_cols };
130135 const int64_t output_dim[GGML_MAX_DIMS] = { 1 , 1 , output_cols, output_rows };
131136
132- ggml_zdnn_create_tensor (ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
133- // ggml_zdnn_create_tensor(ptd_output, td_output, zt_output, output, output_dim, ZDNN_2D);
137+ ggml_zdnn_create_tensor (ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
134138
135139 void * bias_data = (void *)calloc (ne0, ggml_element_size (output));
136140 if (weights_extra->ztensor .is_transformed == false ) {
@@ -140,8 +144,7 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
140144 if (inputs_extra->ztensor .is_transformed == false ) {
141145 ggml_zdnn_load_tensor (inputs_extra->ztensor , inputs->data );
142146 }
143- ggml_zdnn_load_tensor (zt_bias, bias_data);
144- // ggml_zdnn_load_tensor(output_extra->ztensor, output->data);
147+ ggml_zdnn_load_tensor (zt_bias, bias_data);
145148
146149 // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
147150 // __func__, weights_extra->name,
@@ -159,21 +162,17 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
159162 // inputs_extra->pre_tfm_desc.dim3,
160163 // inputs_extra->pre_tfm_desc.dim4);
161164
162- // GGML_ASSERT(weights_extra->pre_tfm_desc.layout == ZDNN_2D && "weights_extra->pre_tfm_desc.layout must be ZDNN_2D");
163- // GGML_ASSERT(inputs_extra->pre_tfm_desc.layout == ZDNN_2D && "inputs_extra->pre_tfm_desc.layout must be ZDNN_2D");
164165 GGML_ASSERT (weights_extra->pre_tfm_desc .dim1 == weights->ne [0 ] && " weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]" );
165166 GGML_ASSERT (weights_extra->pre_tfm_desc .dim2 == weights->ne [1 ] && " weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]" );
166- GGML_ASSERT (inputs_extra->pre_tfm_desc .dim1 == inputs->ne [0 ] && " inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]" );
167- GGML_ASSERT (inputs_extra->pre_tfm_desc .dim2 == inputs->ne [1 ] && " inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]" );
168-
169- std::raise (SIGINT);
167+ GGML_ASSERT (inputs_extra->pre_tfm_desc .dim1 == inputs->ne [0 ] && " inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]" );
168+ GGML_ASSERT (inputs_extra->pre_tfm_desc .dim2 == inputs->ne [1 ] && " inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]" );
170169
171170 ZDNN_CHECK (zdnn_matmul_transpose_op (&inputs_extra->ztensor , &weights_extra->ztensor , &zt_bias,
172171 false , true , MATMUL_OP_ADDITION, &output_extra->ztensor ));
172+ // TODO: Remove in the future. The output is currently converted DLF16 -> FP32 here and then FP32 -> DLF16 again in the next op, which is inefficient.
173173 ZDNN_CHECK (zdnn_transform_origtensor (&output_extra->ztensor , output->data ));
174174
175175 ZDNN_CHECK (zdnn_free_ztensor_buffer (&zt_bias));
176-
177176 free (bias_data);
178177}
179178
0 commit comments