Commit a7b3792

Yinan committed
Refactor ts.cpp as a scaffold for dpu kernel
1 parent fa98763 commit a7b3792

File tree

7 files changed: +91 −9 lines changed


examples/tensor/ts.cpp

Lines changed: 89 additions & 7 deletions
@@ -2,6 +2,10 @@
 #include <iostream>
 #include <iomanip>
 
+#define NR_DPUS 8
+#define NR_LAYER 2
+#define DPU_BINARY "./dpu/gemv_dpu"
+
 void fp_table_init(void) {
     for (int i = 0; i < (1 << 16); ++i) {
         union {
@@ -11,15 +15,91 @@ void fp_table_init(void) {
         ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
     }
 }
+
+int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res) {
+    uint32_t pim_offset = 0;
+    struct dpu_set_t dpu;
+
+    // the ggml_table_f32_f16 lookup table is transferred to PIM
+    DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
+    pim_offset += sizeof(ggml_table_f32_f16);
+
+    // Transfer pim_metadata into DPUs
+    context->pim_metadata.layer_num = NR_LAYER;
+    context->pim_metadata.weight_type = (uint16_t)(w->type);
+
+    // ne[1] is the row count, ne[0] is the column count
+    context->pim_metadata.rows_per_dpu = w->ne[1] / NR_DPUS;
+    context->pim_metadata.rest_rows = w->ne[1] % NR_DPUS;
+    GGML_ASSERT(context->pim_metadata.rest_rows == 0);
+
+    context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
+    context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
+
+    // TODO: dispatch a separate context to each of the NR_DPUS DPUs (the leftover rows can differ per DPU)
+    DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
+    pim_offset += sizeof(struct pim_meta);
+
+    // Transfer weights into DPUs
+    uint32_t layer_len = context->pim_metadata.layer_len;
+    uint32_t i;
+    for (uint32_t layeridx = 0; layeridx < NR_LAYER; layeridx++) {
+        uint32_t size_per_row = w->nb[1];
+        // weight rows are scattered across the DPUs
+        DPU_FOREACH(context->dpu_set, dpu, i) {
+            uint32_t prev_rows_dpu = i * context->pim_metadata.rows_per_dpu;
+
+            // this DPU's block of rows
+            DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu*size_per_row));
+        }
+
+        DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len*layeridx, layer_len, DPU_XFER_DEFAULT));
+    }
+
+    // Transfer input into DPUs
+    pim_matrix_des input_descript;
+    input_descript.type = (int32_t)in_q->type;
+    input_descript.layerid = 0; // TODO: set this to any value from 0 to NR_LAYER - 1
+    memcpy(input_descript.ne, in_q->ne, sizeof(in_q->ne));
+
+    uint32_t input_offset = context->pim_metadata.input_offset;
+    // broadcast input metadata
+    DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, &input_descript, sizeof(pim_matrix_des), DPU_XFER_DEFAULT));
+    input_offset += sizeof(pim_matrix_des);
+
+    // broadcast input data
+    uint32_t bclen = ggml_row_size(in_q->type, in_q->ne[0])*in_q->ne[1]*in_q->ne[2]*in_q->ne[3];
+    DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
+    input_offset += bclen;
+
+    // Launch DPU kernel
+    DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
+
+    // Retrieve results from the DPUs
+    float *mul_mat_res = (float *)res->data;
+    DPU_FOREACH(context->dpu_set, dpu, i) {
+        DPU_ASSERT(dpu_prepare_xfer(dpu, mul_mat_res + i * context->pim_metadata.rows_per_dpu*in_q->ne[1]));
+    }
+    DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_offset, context->pim_metadata.rows_per_dpu*in_q->ne[1]*sizeof(float), DPU_XFER_DEFAULT));
+
+    return 0;
+}
+
 int main(int argc, char** argv) {
     // init fp table for fp16 dump
     fp_table_init();
 
-    const char* filenamea = "a.tensor";
-    const char* filenameb = "b.tensor";
-    const char* filenamebq = "b_quant.tensor";
-    const char* filenamec = "c.tensor";
-    const char* filenamec_p = "c_pim.tensor";
+    // WQ-PIM: allocate DPUs and load the kernel binary
+    struct pim_context *pqcontext = (struct pim_context *)malloc(sizeof(struct pim_context));
+    memset(pqcontext, 0, sizeof(struct pim_context));
+    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &pqcontext->dpu_set));
+    DPU_ASSERT(dpu_load(pqcontext->dpu_set, DPU_BINARY, NULL));
+
+    const char* filenamea = "tensor-files/a.tensor";
+    const char* filenameb = "tensor-files/b.tensor";
+    const char* filenamebq = "tensor-files/b_quant.tensor";
+    const char* filenamec = "tensor-files/c.tensor";
+    const char* filenamec_p = "tensor-files/c_pim.tensor";
     struct ggml_tensor * ts_a = tensor_import(filenamea);
     struct ggml_tensor * ts_b = tensor_import(filenameb);
     struct ggml_tensor * ts_bq = tensor_import(filenamebq);
@@ -36,8 +116,10 @@ int main(int argc, char** argv) {
     std::cout<<"ts_c_pim:"<<std::endl;
     dump_tensor(ts_c_pim, stdout);
 
-    //dump_tensor_first_n(ts_a, 4096, stdout);
-    //dump_tensor_first_n(ts_bq, 4096, stdout);
+
+    gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
+    std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
+    dump_tensor(ts_c_pim, stdout);
 
     float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
     std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;

ggml/src/ggml.c

Lines changed: 2 additions & 2 deletions
@@ -17394,7 +17394,7 @@ static __inline__ void dpu_kernel_barrier(struct dpu_set_t dpu_set) {
 
 static __inline__ int dpu_get_gemv_res(struct ggml_tensor *input, struct ggml_tensor *w, struct ggml_tensor *res) {
     struct dpu_set_t dpu_set, dpu;
-    float *mul_max_res = (float *)res->data;
+    float *mul_mat_res = (float *)res->data;
     uint32_t output_offset = res->inout_offset;
     static bool offset_printed = false;
     if( !offset_printed) {
@@ -17411,7 +17411,7 @@ static __inline__ int dpu_get_gemv_res(struct ggml_tensor *input, struct ggml_te
 
     uint32_t i;
     DPU_FOREACH(dpu_set, dpu, i) {
-        DPU_ASSERT(dpu_prepare_xfer(dpu, mul_max_res + i * rows_per_dpu*input->ne[1]));
+        DPU_ASSERT(dpu_prepare_xfer(dpu, mul_mat_res + i * rows_per_dpu*input->ne[1]));
     }
     DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, output_offset, rows_per_dpu*input->ne[1]*sizeof(float), DPU_XFER_DEFAULT));
     return 0;
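
The ggml.c change is only the mul_max_res -> mul_mat_res rename, but the surrounding gather follows the same pattern as gemv_dpu_kernel above: DPU i produces rows_per_dpu * ne[1] floats, which land at offset i * rows_per_dpu * ne[1] in the flat result buffer. Below is a minimal host-side sketch of that indexing, assuming the per-DPU buffers have already been copied out; gather_gemv_results and dpu_results are hypothetical names used only for this illustration.

#include <cstdint>
#include <cstring>
#include <vector>

// Copy each DPU's block of results into the flat output buffer.
// dpu_results[i] stands in for the rows_per_dpu * ne1 floats read back from DPU i.
static void gather_gemv_results(const std::vector<std::vector<float>> &dpu_results,
                                uint32_t rows_per_dpu, uint32_t ne1,
                                float *mul_mat_res) {
    const uint32_t block = rows_per_dpu * ne1;   // floats produced per DPU
    for (uint32_t i = 0; i < dpu_results.size(); ++i) {
        // DPU i owns rows [i*rows_per_dpu, (i+1)*rows_per_dpu), so its block
        // is copied to the matching offset in the result buffer.
        std::memcpy(mul_mat_res + i * block, dpu_results[i].data(), block * sizeof(float));
    }
}

int main() {
    // Illustrative shapes: 8 DPUs, 4 rows per DPU, ne1 = 1 (a single GEMV output column).
    const uint32_t nr_dpus = 8, rows_per_dpu = 4, ne1 = 1;
    std::vector<std::vector<float>> dpu_results(nr_dpus, std::vector<float>(rows_per_dpu * ne1, 1.0f));
    std::vector<float> result(nr_dpus * rows_per_dpu * ne1);
    gather_gemv_results(dpu_results, rows_per_dpu, ne1, result.data());
    return 0;
}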

tensor-files/a.tensor

9 MB
Binary file not shown.

tensor-files/b.tensor

16.1 KB
Binary file not shown.

tensor-files/b_quant.tensor

4.39 KB
Binary file not shown.

tensor-files/c.tensor

16.1 KB
Binary file not shown.

tensor-files/c_pim.tensor

16.1 KB
Binary file not shown.

0 commit comments