11#include " trace_driver.h"
22#include < iostream>
33#include < iomanip>
4+ #include < chrono>
45
56#define NR_DPUS 8
67#define NR_LAYER 2
@@ -24,6 +25,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
2425 DPU_ASSERT (dpu_broadcast_to (context->dpu_set , DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof (ggml_table_f32_f16), DPU_XFER_DEFAULT));
2526 pim_offset += sizeof (ggml_table_f32_f16);
2627
28+ std::cout << " ggml_table_f32_f16 len = " << sizeof (ggml_table_f32_f16) << std::endl;
29+
2730 // Transfer pim_metadata into DPUs
2831 context->pim_metadata .layer_num = NR_LAYER;
2932 context->pim_metadata .weight_type = (uint16_t )(w->type );
@@ -36,6 +39,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
3639 context->pim_metadata .layer_len = w->nb [1 ] * (context->pim_metadata .rows_per_dpu );
3740 context->pim_metadata .input_offset = sizeof (ggml_table_f32_f16) + sizeof (struct pim_meta ) + context->pim_metadata .layer_len * NR_LAYER;
3841
42+ std::cout << " layer_num = " << NR_LAYER << " , weight_type = " << (uint16_t )(w->type ) << " , rows_per_dpu = " << w->ne [1 ] / NR_DPUS << " , rest_rows = " << w->ne [1 ] % NR_DPUS << " , layer_len = " << context->pim_metadata .layer_len << " , input_offset = " << context->pim_metadata .input_offset << std::endl;
43+
3944 // TODO: dispatch NR_DPUS per-DPU contexts instead of one broadcast (the leftover row count differs between DPUs)
4045 DPU_ASSERT (dpu_broadcast_to (context->dpu_set , DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata ), sizeof (struct pim_meta ), DPU_XFER_DEFAULT));
4146 pim_offset += sizeof (struct pim_meta );
@@ -50,10 +55,10 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
5055 uint32_t prev_rows_dpu = i * context->pim_metadata .rows_per_dpu ;
5156
5257 // every dpu's data
53- DPU_ASSERT (dpu_prepare_xfer (dpu, ((unsigned char *)w->data ) + prev_rows_dpu* size_per_row));
58+ DPU_ASSERT (dpu_prepare_xfer (dpu, ((unsigned char *)w->data ) + prev_rows_dpu * size_per_row));
5459 }
5560
56- DPU_ASSERT (dpu_push_xfer (context->dpu_set , DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len* layeridx, layer_len, DPU_XFER_DEFAULT));
61+ DPU_ASSERT (dpu_push_xfer (context->dpu_set , DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
5762 }
5863
5964 // Transfer input into DPUs
@@ -72,8 +77,14 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
7277 DPU_ASSERT (dpu_broadcast_to (context->dpu_set , DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data , bclen, DPU_XFER_DEFAULT));
7378 input_offset += bclen;
7479
80+ std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now ();
7581 // Launch DPU kernel
7682 DPU_ASSERT (dpu_launch (context->dpu_set , DPU_SYNCHRONOUS));
83+ std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now ();
84+
85+ std::chrono::duration<size_t , std::nano> dur = ex_tp2 - ex_tp1;
86+
87+ std::cout << " 执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count () << " ms" << std::endl;
7788
7889 // Check results
7990 float *mul_mat_res = (float *)res->data ;
@@ -105,23 +116,30 @@ int main(int argc, char** argv) {
105116 struct ggml_tensor * ts_bq = tensor_import (filenamebq);
106117 struct ggml_tensor * ts_c = tensor_import (filenamec);
107118 struct ggml_tensor * ts_c_pim = tensor_import (filenamec_p);
108- std::cout<<" ts_a:" <<std::endl;
109- dump_tensor (ts_a, stdout);
110- std::cout<<" ts_b:" <<std::endl;
111- dump_tensor (ts_b, stdout);
112- std::cout<<" ts_bq:" <<std::endl;
113- dump_tensor (ts_bq, stdout);
114- std::cout<<" ts_c:" <<std::endl;
115- dump_tensor (ts_c, stdout);
116- std::cout<<" ts_c_pim:" <<std::endl;
117- dump_tensor (ts_c_pim, stdout);
118-
119+ // std::cout<<"ts_a:"<<std::endl;
120+ // dump_tensor(ts_a, stdout);
121+ // std::cout<<"ts_b:"<<std::endl;
122+ // dump_tensor(ts_b, stdout);
123+ // std::cout<<"ts_bq:"<<std::endl;
124+ // dump_tensor(ts_bq, stdout);
125+ // std::cout<<"ts_c:"<<std::endl;
126+ // dump_tensor(ts_c, stdout);
127+ // std::cout<<"ts_c_pim:"<<std::endl;
128+ // dump_tensor(ts_c_pim, stdout);
129+
130+ std::cout << " ts_a: " << std::endl;
131+ print_tensor (ts_a, stdout);
132+ std::cout << " ts_b: " << std::endl;
133+ print_tensor (ts_b, stdout);
119134
120135 gemv_dpu_kernel (pqcontext, ts_a, ts_bq, ts_c_pim);
121- std::cout<<" ts_c_pim calculated by DPUs:" <<std::endl;
122- dump_tensor (ts_c_pim, stdout);
136+ // std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
137+ // dump_tensor(ts_c_pim, stdout);
123138
124139 float first_res = mul_add_q4_0_q8_0 (ts_a, ts_bq);
125140 std::cout<<" first element: " <<std::fixed << std::setprecision (6 )<<first_res<<std::endl;
141+
142+ std::cout << " error between c and c_pim:" << std::endl;
143+ compare_tensor (ts_c, ts_c_pim);
126144 return 0 ;
127145}
0 commit comments