@@ -52,8 +52,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     }
 
     // Convert adata to fp16 format
-    // std::vector<ggml_fp16_t> hadata(KW * KH * IC * OC);
-    // ggml_fp32_to_fp16_row(adata.data(), hadata.data(), KW * KH * IC * OC);
+    std::vector<ggml_fp16_t> hadata(KW * KH * IC * OC);
+    ggml_fp32_to_fp16_row(adata.data(), hadata.data(), KW * KH * IC * OC);
 
     // Initialize bdata
     std::vector<float> bdata(IW * IH * IC * N);
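Note on the hunk above: the fp32-to-fp16 conversion of the weight data is now active instead of commented out. A minimal standalone sketch of the same conversion, assuming only ggml.h (the helper name to_fp16 is illustrative, not part of this patch):

    #include "ggml.h"
    #include <vector>

    // Convert a contiguous fp32 buffer to ggml's half-precision type.
    static std::vector<ggml_fp16_t> to_fp16(const std::vector<float> & src) {
        std::vector<ggml_fp16_t> dst(src.size());
        // ggml_fp32_to_fp16_row(x, y, n) converts n floats starting at x
        ggml_fp32_to_fp16_row(src.data(), dst.data(), (int64_t) src.size());
        return dst;
    }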
@@ -63,7 +63,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
 
     size_t buffer_size = 0;
     {
-        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
+        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
+        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
         buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
         buffer_size += 1024; // overhead
     }
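Why the buffer estimate changes: ggml_type_size(GGML_TYPE_F16) is 2 bytes against 4 for GGML_TYPE_F32, so the weight tensor's share of the buffer halves. A quick sketch of the arithmetic; the 3x3x640x640 dimensions are illustrative, not taken from this test:

    size_t n_elem = 3 * 3 * 640 * 640;                       // 3,686,400 weights
    size_t a_f32  = n_elem * ggml_type_size(GGML_TYPE_F32);  // ~14.06 MiB
    size_t a_f16  = n_elem * ggml_type_size(GGML_TYPE_F16);  // ~7.03 MiB, half of a_f32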
@@ -111,7 +112,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     model.ctx = ggml_init(params);
 
     // create tensors
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
+    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
 
     // create an allocator
@@ -122,9 +123,9 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
 
     // load data to buffer
     if (ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
+        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
     } else {
-        ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
+        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
     }
 
     // alloc memory
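With model.a now GGML_TYPE_F16, ggml_nbytes(model.a) reports the half-precision size, so the source buffer must be hadata; copying from adata would push fp32 bit patterns into an fp16 tensor and only cover half the weights. A hedged sanity-check sketch, using the names from load_model above:

    #include <cassert>

    // The fp16 staging buffer must match the tensor's byte size exactly.
    assert(ggml_nbytes(model.a) == hadata.size() * sizeof(ggml_fp16_t));
    // adata holds the same element count in fp32, i.e. twice the bytes.
    assert(ggml_nbytes(model.a) == adata.size() * sizeof(float) / 2);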
@@ -208,6 +209,48 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
 
 
 
+    // recalculate to avoid fragmentation
+    // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    // ggml_set_name(conv2d_res, "conv2d_res");
+    // ggml_build_forward_expand(gf, conv2d_res);
+    // int64_t *ne = conv2d_res->ne;
+    // printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
+
+
+    // struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    ggml_set_name(wino_res, "wino_res");
+    ggml_build_forward_expand(gf, wino_res);
+    // ne = wino_res->ne;
+    // printf("wino: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
+    ggml_free(ctx0);
+    return gf;
+}
+
+struct ggml_cgraph * build_graph_2(const test_model& model) {
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    int s0 = 1;
+    int s1 = 1;
+    int p0 = 1;
+    int p1 = 1;
+    int d0 = 1;
+    int d1 = 1;
+
+
+
     // recalculate to avoid fragmentation
     // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     // ggml_set_name(conv2d_res, "conv2d_res");
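build_graph_1 and the new build_graph_2 are identical except for the conv op they dispatch: ggml_conv_2d_direct versus ggml_conv_2d_implicitgemm, the latter being the op this patch series adds. A sketch of how the duplication could be factored out with a function pointer, assuming the same graph API used above; build_conv_graph is a hypothetical helper, not part of this commit:

    // conv2d_fn matches the signature shared by ggml_conv_2d,
    // ggml_conv_2d_direct and ggml_conv_2d_implicitgemm.
    typedef struct ggml_tensor * (*conv2d_fn)(struct ggml_context *,
            struct ggml_tensor *, struct ggml_tensor *,
            int s0, int s1, int p0, int p1, int d0, int d1);

    static struct ggml_cgraph * build_conv_graph(const test_model & model, conv2d_fn conv) {
        static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
        static std::vector<uint8_t> buf(buf_size);

        struct ggml_init_params params0 = {
            /*.mem_size   =*/ buf_size,
            /*.mem_buffer =*/ buf.data(),
            /*.no_alloc   =*/ true, // tensors allocated later by ggml_gallocr_alloc_graph()
        };
        struct ggml_context * ctx0 = ggml_init(params0);
        struct ggml_cgraph  * gf   = ggml_new_graph(ctx0);

        // stride, padding and dilation all fixed to 1, as in the builders above
        struct ggml_tensor * res = conv(ctx0, model.a, model.b, 1, 1, 1, 1, 1, 1);
        ggml_set_name(res, "wino_res");
        ggml_build_forward_expand(gf, res);
        ggml_free(ctx0);
        return gf;
    }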
@@ -217,6 +260,7 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
 
 
     struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    // struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     ggml_set_name(wino_res, "wino_res");
     ggml_build_forward_expand(gf, wino_res);
     // ne = wino_res->ne;
@@ -353,16 +397,39 @@ int main(void)
         double run_time1;
         std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
 
+
+        ggml_gallocr_free(allocr);
+
+        allocr = NULL;
+
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+        // create the worst case graph for memory usage estimation
+        gf = build_graph_2(model);
+
+        // compute the required memory
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size2 = ggml_gallocr_get_buffer_size(allocr, 0);
+        // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
+
+
+        struct ggml_cgraph * gf_res_2 = NULL;
+
+        double run_time2;
+        wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
+
+
         if (k == 0) {
             k = 1;
-            fprintf(stderr, "| (IC, OC, IW, IH) | im2col+GEMM TIME | im2col+GEMM VRAM | implicit GEMM TIME | implicit GEMM VRAM\n");
-            fprintf(stderr, "| --- | --- | --- | --- | ---\n");
+            fprintf(stderr, "| (IC, OC, IW, IH) | im2col+GEMM TIME | im2col+GEMM VRAM | direct TIME | direct VRAM | implicit GEMM TIME | implicit GEMM VRAM\n");
+            fprintf(stderr, "| --- | --- | --- | --- | --- | --- | ---\n");
         }
 
-        fprintf(stderr, "| (%d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
+        fprintf(stderr, "| (%d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
             std::get<0>(c), std::get<1>(c), std::get<2>(c), std::get<3>(c),
             run_time0, mem_size0/1024.0f/1024.0f,
-            run_time1, mem_size1/1024.0f/1024.0f);
+            run_time1, mem_size1/1024.0f/1024.0f,
+            run_time2, mem_size2/1024.0f/1024.0f);
 
 
         // for(int i = 0; i < ggml_nelements(wino_res); i++) {
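In the hunk above, the allocator is freed and recreated between the two timed variants so that ggml_gallocr_get_buffer_size reports a reservation sized for each graph alone, rather than one reused from the previous variant. A sketch of the per-variant pattern; measure_variant and the graph_builder typedef are hypothetical, while compute_graph and the build_graph_* functions are the ones defined in this test:

    typedef struct ggml_cgraph * (*graph_builder)(const test_model &);

    static double measure_variant(test_model & model, graph_builder build,
                                  int iterations, size_t * mem_size) {
        ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
        struct ggml_cgraph * gf = build(model);   // worst-case graph for sizing
        ggml_gallocr_reserve(allocr, gf);
        *mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
        double run_time = 0.0;
        compute_graph(model, allocr, build, iterations, &run_time);
        ggml_gallocr_free(allocr);                // fresh allocator per variant
        return run_time;
    }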