 #include <iomanip>
 #include <chrono>
 
-#define NR_DPUS 8
+#include <vector>
+
+#define NR_DPUS 2048
 #define NR_LAYER 2
 #define DPU_BINARY "./dpu/gemv_dpu"
 
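+// Each DPU receives w->ne[1] / NR_DPUS rows of the weight matrix (remainder rows are still a
+// TODO below), so raising NR_DPUS from 8 to 2048 shrinks the per-DPU slice transferred to MRAM.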
@@ -21,6 +23,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     uint32_t pim_offset = 0;
     struct dpu_set_t dpu;
 
+    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
+
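+    // DPU MRAM heap layout implied by input_offset below: the fp16->fp32 lookup table first,
+    // then struct pim_meta, then NR_LAYER weight slices, then the quantized input vector.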
     // ggml_table_f32_f16 tbl is transferred to pim
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
     pim_offset += sizeof(ggml_table_f32_f16);
@@ -39,7 +43,7 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
     context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
 
-    std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
+    // std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
 
     // TODO: dispatch NR_DPUS metadata contexts to the DPUs individually (the remainder rows differ per DPU)
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
@@ -61,6 +65,15 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
         DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
     }
 
+    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
+
+    std::cout << "dpu: w transfer time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+    std::cout << "dpu: w transfer time: " << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
+
+    ex_tp1 = std::chrono::high_resolution_clock::now();
+
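+    // The same ex_tp1/ex_tp2/dur variables are reused below to time the input transfer and the
+    // kernel launch separately.
+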
     // Transfer input into DPUs
     pim_matrix_des input_descript;
     input_descript.type = (int32_t)in_q->type;
@@ -77,14 +90,22 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
     input_offset += bclen;
 
-    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
+    ex_tp2 = std::chrono::high_resolution_clock::now();
+
+    dur = ex_tp2 - ex_tp1;
+
+    std::cout << "dpu: in_q transfer time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+    std::cout << "dpu: in_q transfer time: " << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
+
+    ex_tp1 = std::chrono::high_resolution_clock::now();
     // Launch DPU kernel
     DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
-    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
+    ex_tp2 = std::chrono::high_resolution_clock::now();
 
-    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
+    dur = ex_tp2 - ex_tp1;
 
-    std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+    std::cout << "dpu: execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+    std::cout << "dpu: execution time: " << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
 
     // Check results
     float *mul_mat_res = (float *)res->data;
@@ -96,6 +117,49 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     return 0;
 }
 
+
+void gemv_cpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res_comp) {
+
+    // Initialize a ggml context
+    ggml_init_params params = {.mem_size = 256*1024*1024};
+    ggml_context* ctx = ggml_init(params);
+
+    // Create tensors with the same shapes and types as the imported weight and input
+    ggml_tensor* A = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
+    ggml_tensor* B = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 4096, 1);
+
+    assert(A->ne[0] == w->ne[0] && A->ne[1] == w->ne[1] && A->ne[2] == w->ne[2] && A->ne[3] == w->ne[3]);
+    assert(B->ne[0] == in_q->ne[0] && B->ne[1] == in_q->ne[1] && B->ne[2] == in_q->ne[2] && B->ne[3] == in_q->ne[3]);
+
+    memcpy(A->data, w->data, ggml_nbytes(w));
+    memcpy(B->data, in_q->data, ggml_nbytes(in_q));
+
+    // Build the compute graph
+    ggml_tensor* C = ggml_mul_mat(ctx, A, B);
+    ggml_cgraph* gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, C);
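+    // ggml_mul_mat(A, B) produces C with ne = {A->ne[1], B->ne[1], ...}, i.e. a 4096 x 1
+    // result vector for this GEMV.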
+
+    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
+    // Run the computation
+    ggml_graph_compute_with_ctx(ctx, gf, 64);  // use 64 threads
+    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
+
+    std::cout << "cpu: execution time: " << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
+    std::cout << "cpu: execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+
+
+    // Print the result
+    print_tensor(C, stdout);
+
+    std::cout << "error between cpu and dpu gemv results:" << std::endl;
+    compare_tensor(C, res_comp);
+
+    // Free resources
+    ggml_free(ctx);
+}
+
 int main(int argc, char** argv) {
     // init fp table for fp16 dump
     fp_table_init();
@@ -106,15 +170,15 @@ int main(int argc, char** argv) {
     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &pqcontext->dpu_set));
     DPU_ASSERT(dpu_load(pqcontext->dpu_set, DPU_BINARY, NULL));
 
-    const char * filenamea = "tensor-files/a.tensor";
-    const char * filenameb = "tensor-files/b.tensor";
-    const char * filenamebq = "tensor-files/b_quant.tensor";
-    const char * filenamec = "tensor-files/c.tensor";
+    const char * filenamea = "tensor-files/a.tensor";
+    const char * filenameb = "tensor-files/b.tensor";
+    const char * filenamebq = "tensor-files/b_quant.tensor";
+    const char * filenamec = "tensor-files/c.tensor";
     const char * filenamec_p = "tensor-files/c_pim.tensor";
-    struct ggml_tensor * ts_a = tensor_import(filenamea);
-    struct ggml_tensor * ts_b = tensor_import(filenameb);
-    struct ggml_tensor * ts_bq = tensor_import(filenamebq);
-    struct ggml_tensor * ts_c = tensor_import(filenamec);
+    struct ggml_tensor * ts_a = tensor_import(filenamea);
+    struct ggml_tensor * ts_b = tensor_import(filenameb);
+    struct ggml_tensor * ts_bq = tensor_import(filenamebq);
+    struct ggml_tensor * ts_c = tensor_import(filenamec);
     struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p);
     // std::cout<<"ts_a:"<<std::endl;
     // dump_tensor(ts_a, stdout);
@@ -126,20 +190,29 @@ int main(int argc, char** argv) {
     // dump_tensor(ts_c, stdout);
     // std::cout<<"ts_c_pim:"<<std::endl;
     // dump_tensor(ts_c_pim, stdout);
+    // #define IS_CONTIGUOUS(t, tn) {\
+    //     if(ggml_is_contiguous(t)) printf("%s is contiguous\n", tn);\
+    // }\
 
-    std::cout << "ts_a:" << std::endl;
-    print_tensor(ts_a, stdout);
-    std::cout << "ts_b:" << std::endl;
-    print_tensor(ts_b, stdout);
+    // IS_CONTIGUOUS(ts_a, "ts_a");
+    // IS_CONTIGUOUS(ts_b, "ts_b");
+    // IS_CONTIGUOUS(ts_bq, "ts_bq");
+    // IS_CONTIGUOUS(ts_c, "ts_c");
+    // IS_CONTIGUOUS(ts_c_pim, "ts_c_pim");
+    // #undef IS_CONTIGUOUS
 
-    gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
-    // std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
-    // dump_tensor(ts_c_pim, stdout);
 
-    float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
-    std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
+    // dpu code
+    // gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
+    // std::cout << "error between c and c_pim:" << std::endl;
+    // compare_tensor(ts_c, ts_c_pim);
+
+    // cpu code
+    gemv_cpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
+
+    // float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
+    // std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
 
-    std::cout << "error between c and c_pim:" << std::endl;
-    compare_tensor(ts_c, ts_c_pim);
+
     return 0;
 }