#include <iostream>
#include <iomanip>

+ #define NR_DPUS 8
+ #define NR_LAYER 2
+ #define DPU_BINARY "./dpu/gemv_dpu"
+
void fp_table_init(void) {
    for (int i = 0; i < (1 << 16); ++i) {
        union {
@@ -11,15 +15,91 @@ void fp_table_init(void) {
        ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
    }
}
+
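+ // Host-side PIM types used below. They are assumed to be declared in a shared PIM
+ // header that this diff does not show; the fields listed here are only the ones
+ // gemv_dpu_kernel() touches, with types inferred from how they are used:
+ //   struct pim_meta       { layer_num, weight_type, rows_per_dpu, rest_rows, layer_len, input_offset }
+ //   struct pim_context    { struct dpu_set_t dpu_set; struct pim_meta pim_metadata; }
+ //   struct pim_matrix_des { int32_t type; int32_t layerid; int64_t ne[4]; }
+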
+ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res) {
+     uint32_t pim_offset = 0;
+     struct dpu_set_t dpu;
+
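+     // MRAM heap layout assembled by the transfers below (pim_offset, and later
+     // input_offset, advance through it in this order):
+     //   [fp16->fp32 table][pim_meta][weights: NR_LAYER x layer_len][input descriptor][input data][output]
+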
+     // Transfer the ggml_table_f32_f16 conversion table to PIM
+     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
+     pim_offset += sizeof(ggml_table_f32_f16);
+
+     // Fill in pim_metadata and transfer it to the DPUs
+     context->pim_metadata.layer_num = NR_LAYER;
+     context->pim_metadata.weight_type = (uint16_t)(w->type);
+
+     // In ggml, ne[1] is the row count and ne[0] is the column count (elements per row)
+     context->pim_metadata.rows_per_dpu = w->ne[1] / NR_DPUS;
+     context->pim_metadata.rest_rows = w->ne[1] % NR_DPUS;
+     GGML_ASSERT(context->pim_metadata.rest_rows == 0);
+
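+     // layer_len is the per-DPU byte size of one layer's weight slice; input_offset marks
+     // where the input region starts in MRAM, just past the fp16 table, the metadata
+     // block, and all NR_LAYER weight slices.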
+     context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
+     context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
+
+     // TODO: send a per-DPU metadata struct instead of a broadcast (the leftover rows can differ between DPUs)
+     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
+     pim_offset += sizeof(struct pim_meta);
+
+     // Transfer the weights into the DPUs, one layer at a time
+     uint32_t layer_len = context->pim_metadata.layer_len;
+     uint32_t i;
+     for (uint32_t layeridx = 0; layeridx < NR_LAYER; layeridx++) {
+         uint32_t size_per_row = w->nb[1];
+         // Each DPU receives its own contiguous block of rows
+         DPU_FOREACH(context->dpu_set, dpu, i) {
+             uint32_t prev_rows_dpu = i * context->pim_metadata.rows_per_dpu;
+
+             // Point this DPU's transfer at its slice of the weight data
+             DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu * size_per_row));
+         }
+
+         DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
+     }
+
+     // Transfer the input into the DPUs: the descriptor and the data are broadcast,
+     // since every DPU multiplies its own weight rows by the same input
+     pim_matrix_des input_descript;
+     input_descript.type = (int32_t)in_q->type;
+     input_descript.layerid = 0; // TODO: pick the target layer (any value in 0 .. NR_LAYER - 1)
+     memcpy(input_descript.ne, in_q->ne, sizeof(in_q->ne));
+
+     uint32_t input_offset = context->pim_metadata.input_offset;
+     // Broadcast the input metadata
+     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, &input_descript, sizeof(pim_matrix_des), DPU_XFER_DEFAULT));
+     input_offset += sizeof(pim_matrix_des);
+
+     // Broadcast the input data (bclen is the total byte size across all four dims)
+     uint32_t bclen = ggml_row_size(in_q->type, in_q->ne[0]) * in_q->ne[1] * in_q->ne[2] * in_q->ne[3];
+     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
+     input_offset += bclen;
+
+     // Launch the DPU kernel and wait for completion
+     DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
+
+     // Retrieve the results: each DPU returns its rows_per_dpu * in_q->ne[1] output floats
+     float *mul_mat_res = (float *)res->data;
+     DPU_FOREACH(context->dpu_set, dpu, i) {
+         DPU_ASSERT(dpu_prepare_xfer(dpu, mul_mat_res + i * context->pim_metadata.rows_per_dpu * in_q->ne[1]));
+     }
+     DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_offset, context->pim_metadata.rows_per_dpu * in_q->ne[1] * sizeof(float), DPU_XFER_DEFAULT));
+
+     return 0;
+ }
+
int main(int argc, char ** argv) {
    // Init the fp16 -> fp32 conversion table used for tensor dumps
    fp_table_init();

-     const char * filenamea = "a.tensor";
-     const char * filenameb = "b.tensor";
-     const char * filenamebq = "b_quant.tensor";
-     const char * filenamec = "c.tensor";
-     const char * filenamec_p = "c_pim.tensor";
+     // WQ-PIM: allocate the DPU set and load the DPU kernel binary
+     struct pim_context *pqcontext = (struct pim_context *)malloc(sizeof(struct pim_context));
+     memset(pqcontext, 0, sizeof(struct pim_context));
+     DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &pqcontext->dpu_set));
+     DPU_ASSERT(dpu_load(pqcontext->dpu_set, DPU_BINARY, NULL));
+
+     const char * filenamea = "tensor-files/a.tensor";
+     const char * filenameb = "tensor-files/b.tensor";
+     const char * filenamebq = "tensor-files/b_quant.tensor";
+     const char * filenamec = "tensor-files/c.tensor";
+     const char * filenamec_p = "tensor-files/c_pim.tensor";
    struct ggml_tensor * ts_a = tensor_import(filenamea);
    struct ggml_tensor * ts_b = tensor_import(filenameb);
    struct ggml_tensor * ts_bq = tensor_import(filenamebq);
@@ -36,8 +116,10 @@ int main(int argc, char** argv) {
    std::cout << "ts_c_pim:" << std::endl;
    dump_tensor(ts_c_pim, stdout);

-     // dump_tensor_first_n(ts_a, 4096, stdout);
-     // dump_tensor_first_n(ts_bq, 4096, stdout);
+
+     gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
+     std::cout << "ts_c_pim calculated by DPUs:" << std::endl;
+     dump_tensor(ts_c_pim, stdout);

    float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
    std::cout << "first element: " << std::fixed << std::setprecision(6) << first_res << std::endl;