Skip to content

Commit bff5aae

Browse files
committed
完成dpu的多线程并行 & 增加tensor的误差对比函数 (Implement multi-threaded DPU parallelism & add a tensor error-comparison function)
1 parent a7b3792 commit bff5aae

File tree

5 files changed

+156
-25
lines changed

5 files changed

+156
-25
lines changed

dpu/dpu_main.c

file mode changed: 100755 → 100644
Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,25 +91,37 @@ int wram2mram(__mram_ptr void *pmram,void *pwram,uint32_t size)
9191
}
9292

9393

94-
// main
95-
int main() {
96-
unsigned int tasklet_id = me();
94+
static float *psumf = NULL;
95+
96+
void init(unsigned int tasklet_id) {
9797
#if PRINT
98-
// printf("tasklet_id = %u\n", tasklet_id);
98+
printf("tasklet_id = %u\n", tasklet_id);
9999
#endif
100100
if (tasklet_id == 0){ // Initialize once the cycle counter
101101
mem_reset(); // Reset the heap
102+
103+
ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
102104
}
103105
// Barrier
104106
barrier_wait(&my_barrier);
105107

108+
// ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
109+
}
110+
111+
// main
112+
int main() {
113+
114+
unsigned int tasklet_id = me();
115+
116+
init(tasklet_id);
117+
106118
//fp32->fp16 table
107119
ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
108120
uint32_t table_f32_f16_len = (1 << 16)*sizeof(float);
109121
uint32_t offset = table_f32_f16_len;
110122
int input_row_size = 0;
111123
int input_cols = 0;
112-
float *psumf = NULL;
124+
113125

114126
#if PRINT
115127
printf("table_f32_f16_len=%d\n",table_f32_f16_len);
@@ -129,6 +141,11 @@ int main() {
129141
cache_meta->layer_num,cache_meta->weight_type,cache_meta->rows_per_dpu,cache_meta->rest_rows,cache_meta->input_offset);
130142
#endif
131143

144+
// 先不考虑尾行
145+
uint16_t weight_rows_per_thread = cache_meta->rows_per_dpu / NR_TASKLETS;
146+
uint16_t weight_start_row = tasklet_id * weight_rows_per_thread;
147+
uint16_t weight_end_row = weight_start_row + weight_rows_per_thread;
148+
132149
// todo:rest row is existed, first thread in every dpu can one more row
133150
uint16_t weight_rows_cur_thread;
134151
if (cache_meta->rest_rows) {
@@ -169,7 +186,13 @@ int main() {
169186
input_row_size = nb*sizeof(block_q8_0);
170187
__mram_ptr void *pweight_base = (__mram_ptr void *)(weightmetadatabase + sizeof(struct pim_meta));
171188
__mram_ptr void *pinput_base = DPU_MRAM_HEAP_POINTER + cache_meta->input_offset + sizeof(pim_matrix_des);
172-
psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
189+
190+
if (tasklet_id == 0) {
191+
psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
192+
}
193+
barrier_wait(&my_barrier);
194+
195+
// psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
173196
memset(psumf, 0 ,sizeof(float)*input_cols*weight_rows_cur_thread);
174197
#if PRINT
175198
printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
@@ -179,7 +202,7 @@ int main() {
179202

180203
// weight_rows_cur_thread = 16;
181204
for(int l = 0;l < input_cols;l++) {
182-
__mram_ptr block_q8_0 *pinput = pinput_base + l*nb*sizeof(block_q8_0);
205+
__mram_ptr block_q8_0 *pinput = pinput_base + l * nb * sizeof(block_q8_0);
183206
mram2wram(pinput, pinput_cache, sizeof(block_q8_0)*nb);
184207
#if PRINT
185208
printf("input:\n");
@@ -192,8 +215,9 @@ int main() {
192215
}
193216
printf("pweight_base: %p\n", pweight_base);
194217
#endif
195-
for(int k = 0;k < weight_rows_cur_thread;k++) {
196-
__mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid*cache_meta->layer_len + k*nb*sizeof(block_q4_0);
218+
// for(int k = 0;k < weight_rows_cur_thread;k++) {
219+
for (int k = weight_start_row; k < weight_end_row; ++k) {
220+
__mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid * cache_meta->layer_len + k * nb * sizeof(block_q4_0);
197221
mram2wram(pweight, pweight_cache, sizeof(block_q4_0)*nb);
198222
#if PRINT
199223
if (k % 64 == 0) {

dpu/pim_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
#!/bin/bash
2-
dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=1 -DBL=11 -o gemv_dpu dpu_main.c
2+
dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=8 -DBL=11 -o gemv_dpu dpu_main.c

examples/tensor/ts.cpp

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "trace_driver.h"
22
#include <iostream>
33
#include <iomanip>
4+
#include <chrono>
45

56
#define NR_DPUS 8
67
#define NR_LAYER 2
@@ -24,6 +25,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
2425
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
2526
pim_offset += sizeof(ggml_table_f32_f16);
2627

28+
std::cout << "ggml_table_f32_f16 len = " << sizeof(ggml_table_f32_f16) << std::endl;
29+
2730
// Transfer pim_metadata into DPUs
2831
context->pim_metadata.layer_num = NR_LAYER;
2932
context->pim_metadata.weight_type = (uint16_t)(w->type);
@@ -36,6 +39,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
3639
context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
3740
context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
3841

42+
std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
43+
3944
//Todo: NR_DPUS contexts are dispatched to different dpus(rest row is different on different dpu)
4045
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
4146
pim_offset += sizeof(struct pim_meta);
@@ -50,10 +55,10 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
5055
uint32_t prev_rows_dpu = i * context->pim_metadata.rows_per_dpu;
5156

5257
// every dpu's data
53-
DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu*size_per_row));
58+
DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu * size_per_row));
5459
}
5560

56-
DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len*layeridx, layer_len, DPU_XFER_DEFAULT));
61+
DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
5762
}
5863

5964
// Transfer input into DPUs
@@ -72,8 +77,14 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
7277
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
7378
input_offset += bclen;
7479

80+
std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
7581
// Launch DPU kernel
7682
DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
83+
std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
84+
85+
std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
86+
87+
std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
7788

7889
// Check results
7990
float *mul_mat_res = (float *)res->data;
@@ -105,23 +116,30 @@ int main(int argc, char** argv) {
105116
struct ggml_tensor * ts_bq = tensor_import(filenamebq);
106117
struct ggml_tensor * ts_c = tensor_import(filenamec);
107118
struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p);
108-
std::cout<<"ts_a:"<<std::endl;
109-
dump_tensor(ts_a, stdout);
110-
std::cout<<"ts_b:"<<std::endl;
111-
dump_tensor(ts_b, stdout);
112-
std::cout<<"ts_bq:"<<std::endl;
113-
dump_tensor(ts_bq, stdout);
114-
std::cout<<"ts_c:"<<std::endl;
115-
dump_tensor(ts_c, stdout);
116-
std::cout<<"ts_c_pim:"<<std::endl;
117-
dump_tensor(ts_c_pim, stdout);
118-
119+
// std::cout<<"ts_a:"<<std::endl;
120+
// dump_tensor(ts_a, stdout);
121+
// std::cout<<"ts_b:"<<std::endl;
122+
// dump_tensor(ts_b, stdout);
123+
// std::cout<<"ts_bq:"<<std::endl;
124+
// dump_tensor(ts_bq, stdout);
125+
// std::cout<<"ts_c:"<<std::endl;
126+
// dump_tensor(ts_c, stdout);
127+
// std::cout<<"ts_c_pim:"<<std::endl;
128+
// dump_tensor(ts_c_pim, stdout);
129+
130+
std::cout << "ts_a: " << std::endl;
131+
print_tensor(ts_a, stdout);
132+
std::cout << "ts_b: " << std::endl;
133+
print_tensor(ts_b, stdout);
119134

120135
gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
121-
std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
122-
dump_tensor(ts_c_pim, stdout);
136+
// std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
137+
// dump_tensor(ts_c_pim, stdout);
123138

124139
float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
125140
std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
141+
142+
std::cout << "error between c and c_pim:" << std::endl;
143+
compare_tensor(ts_c, ts_c_pim);
126144
return 0;
127145
}

include/trace_driver.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ extern "C" {
2020
void tensor_export(const struct ggml_tensor * tensor, const char * fname);
2121
struct ggml_tensor * tensor_import(const char * fname);
2222
void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout);
23+
void compare_tensor(const struct ggml_tensor *a, const struct ggml_tensor *b);
24+
void print_q4_tensor(const struct ggml_tensor *a);
25+
void print_tensor(const struct ggml_tensor *tensor, FILE *fout);
2326
void dump_tensor(const struct ggml_tensor * tensor, FILE * fout);
2427

2528
float mul_add_q4_0_q8_0(struct ggml_tensor * a, struct ggml_tensor * b);

src/trace_driver.c

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <assert.h>
#include <float.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml-quants.h"
#include "trace_driver.h"
45

56
void tensor_export(const struct ggml_tensor * tensor, const char * fname) {
@@ -167,6 +168,91 @@ void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout)
167168
}
168169
}
169170

171+
void compare_tensor(const struct ggml_tensor *a, const struct ggml_tensor *b) {
172+
assert(a->type == b->type);
173+
assert(ggml_nelements(a) == ggml_nelements(b));
174+
175+
int nelems = ggml_nelements(a);
176+
177+
float max_err = 0.0f;
178+
float min_err = 100000.0f;
179+
float total_err = 0.0f;
180+
float avg_err = 0.0f;
181+
float *a_f32;
182+
float *b_f32;
183+
switch (a->type) {
184+
case GGML_TYPE_F32:
185+
case GGML_TYPE_F16:
186+
case GGML_TYPE_BF16: {
187+
a_f32 = (float *)a->data;
188+
b_f32 = (float *)b->data;
189+
break;
190+
}
191+
case GGML_TYPE_Q4_0: {
192+
void *a_data = a->data;
193+
void *b_data = b->data;
194+
195+
a_f32 = malloc(nelems * sizeof(float));
196+
b_f32 = malloc(nelems * sizeof(float));
197+
198+
dequantize_row_q4_0(a_data, a_f32, nelems);
199+
dequantize_row_q4_0(b_data, b_f32, nelems);
200+
break;
201+
}
202+
case GGML_TYPE_Q8_0: {
203+
void *a_data = a->data;
204+
void *b_data = b->data;
205+
206+
a_f32 = malloc(nelems * sizeof(float));
207+
b_f32 = malloc(nelems * sizeof(float));
208+
209+
dequantize_row_q8_0(a_data, a_f32, nelems);
210+
dequantize_row_q8_0(b_data, b_f32, nelems);
211+
break;
212+
}
213+
default: {
214+
assert(false);
215+
}
216+
}
217+
218+
for (int i = 0; i < nelems; ++i) {
219+
float err = abs(a_f32[i] - b_f32[i]);
220+
total_err += err;
221+
if (max_err < err) {
222+
max_err = err;
223+
}
224+
if (min_err > err) {
225+
min_err = err;
226+
}
227+
}
228+
229+
avg_err = total_err / nelems;
230+
231+
printf("max_error = %.5f, min_error = %.5f, total_error = %.5f, avg_error = %.5f\n", max_err, min_err, total_err, avg_err);
232+
}
233+
234+
void print_q4_tensor(const struct ggml_tensor *a) {
235+
void *d4_data = a->data;
236+
int nelems = ggml_nelements(a);
237+
printf("nelems = %d\n", nelems);
238+
239+
float *fp32_data = malloc(nelems * sizeof(float));
240+
dequantize_row_q4_0(d4_data, fp32_data, nelems);
241+
242+
for (int i = 0; i < 10; ++i) {
243+
printf("fp32_data[%d] = %f\n", i, fp32_data[i]);
244+
}
245+
}
246+
247+
void print_tensor(const struct ggml_tensor *tensor, FILE *fout) {
248+
const int64_t *ne = tensor->ne;
249+
const size_t *nb = tensor->nb;
250+
251+
fprintf(fout, "shape = {%d, %d, %d, %d}, stride = {%d, %d, %d, %d}\n",
252+
ne[0], ne[1], ne[2], ne[3],
253+
nb[0], nb[1], nb[2], nb[3]);
254+
}
255+
170256
void dump_tensor(const struct ggml_tensor * tensor, FILE * fout) {
171257
const int64_t * ne = tensor->ne;
172258
const size_t * nb = tensor->nb;

0 commit comments

Comments
 (0)