Commit 80572b3

Merge pull request #1 from lanhin/dev_pim_multithread: Dev pim multithread

2 parents: a7b3792 + 4541374

File tree: 5 files changed, +161 −27 lines


dpu/dpu_main.c (file mode 100755 → 100644)

38 additions, 11 deletions
@@ -91,25 +91,35 @@ int wram2mram(__mram_ptr void *pmram,void *pwram,uint32_t size)
 }
 
 
-// main
-int main() {
-    unsigned int tasklet_id = me();
+// keep psumf global so every tasklet can access it
+static float *psumf = NULL;
+
+void init(unsigned int tasklet_id) {
 #if PRINT
-    // printf("tasklet_id = %u\n", tasklet_id);
+    printf("tasklet_id = %u\n", tasklet_id);
 #endif
     if (tasklet_id == 0){ // Initialize once the cycle counter
         mem_reset(); // Reset the heap
+        // the first tasklet sets up the fp32->fp16 table pointer
+        ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
     }
     // Barrier
     barrier_wait(&my_barrier);
+}
 
-    //fp32->fp16 table
-    ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
+// main
+int main() {
+
+    unsigned int tasklet_id = me();
+
+    init(tasklet_id);
+
+    // fp32->fp16 table configuration
     uint32_t table_f32_f16_len = (1 << 16)*sizeof(float);
     uint32_t offset = table_f32_f16_len;
     int input_row_size = 0;
     int input_cols = 0;
-    float *psumf = NULL;
+
 
 #if PRINT
     printf("table_f32_f16_len=%d\n",table_f32_f16_len);
@@ -129,6 +139,11 @@ int main() {
            cache_meta->layer_num,cache_meta->weight_type,cache_meta->rows_per_dpu,cache_meta->rest_rows,cache_meta->input_offset);
 #endif
 
+    // compute the start row, end row and row count for each tasklet
+    uint16_t weight_rows_per_thread = cache_meta->rows_per_dpu / NR_TASKLETS;
+    uint16_t weight_start_row = tasklet_id * weight_rows_per_thread;
+    uint16_t weight_end_row = weight_start_row + weight_rows_per_thread;
+
     // TODO: if rest rows exist, the first tasklet in each DPU should take one more row
     uint16_t weight_rows_cur_thread;
     if (cache_meta->rest_rows) {
@@ -142,14 +157,17 @@
 
     //input metadata
     offset += (cache_meta->layer_len * cache_meta->layer_num);
+
 #if PRINT
     printf("layer_len=%d, input metadata offset=%d\n",cache_meta->layer_len,offset);
 #endif
+
     uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
     pim_matrix_des *pinputcache = (pim_matrix_des *) mem_alloc(sizeof(pim_matrix_des));
     mram_read((__mram_ptr void const*) (inputmetadatabase), pinputcache, sizeof(pim_matrix_des));
     input_cols = pinputcache->ne[1];
     assert(input_cols == 1 && "Only support vector as input.");
+
 #if PRINT
     printf("input_type=%d, layerID=%d\n",pinputcache->type,pinputcache->layerid);
     for(int nn=0;nn<GGML_MAX_DIMS;nn++) {
@@ -158,6 +176,7 @@ int main() {
 #endif
 
     assert(cache_meta->weight_type == ((uint16_t)GGML_TYPE_Q4_0) && "Only support Q4_0 weight.");
+
     //weight info: GGML_TYPE_Q4_0 default
     if (cache_meta->weight_type == ((uint16_t)GGML_TYPE_Q4_0)) {
         if (pinputcache->type != GGML_TYPE_Q8_0) {
@@ -169,8 +188,15 @@
         input_row_size = nb*sizeof(block_q8_0);
         __mram_ptr void *pweight_base = (__mram_ptr void *)(weightmetadatabase + sizeof(struct pim_meta));
         __mram_ptr void *pinput_base = DPU_MRAM_HEAP_POINTER + cache_meta->input_offset + sizeof(pim_matrix_des);
-        psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
+
+        if (tasklet_id == 0) { // only the first tasklet allocates the shared result buffer
+            psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
+        }
+        barrier_wait(&my_barrier);
+
+        // psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
         memset(psumf, 0, sizeof(float)*input_cols*weight_rows_cur_thread);
+
 #if PRINT
         printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
 #endif
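Note on the allocation above: this is the usual "one tasklet allocates, everyone waits" publication pattern — tasklet 0 grabs the WRAM buffer, and the barrier guarantees no other tasklet dereferences psumf before the pointer is set. (Afterwards every tasklet memsets the whole buffer; since they all write zeros that is harmless, just redundant.) A minimal self-contained sketch of the pattern, using the standard UPMEM headers but a hypothetical buffer size and barrier name:

    #include <alloc.h>    // mem_alloc: WRAM heap allocator
    #include <barrier.h>  // BARRIER_INIT, barrier_wait
    #include <defs.h>     // me(): current tasklet id

    #define POOL_ELEMS 64 // hypothetical size, for illustration only

    BARRIER_INIT(pub_barrier, NR_TASKLETS);

    static float *shared_buf = NULL; // one copy, visible to all tasklets

    int main() {
        if (me() == 0) {
            shared_buf = (float *)mem_alloc(POOL_ELEMS * sizeof(float));
        }
        barrier_wait(&pub_barrier); // publish: no tasklet reads shared_buf before it is set
        shared_buf[me() % POOL_ELEMS] = (float)me(); // now each tasklet can safely write its own slot
        return 0;
    }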
@@ -179,7 +205,7 @@
 
         // weight_rows_cur_thread = 16;
         for(int l = 0;l < input_cols;l++) {
-            __mram_ptr block_q8_0 *pinput = pinput_base + l*nb*sizeof(block_q8_0);
+            __mram_ptr block_q8_0 *pinput = pinput_base + l * nb * sizeof(block_q8_0);
             mram2wram(pinput, pinput_cache, sizeof(block_q8_0)*nb);
 #if PRINT
             printf("input:\n");
@@ -192,8 +218,9 @@
             }
             printf("pweight_base: %p\n", pweight_base);
 #endif
-            for(int k = 0;k < weight_rows_cur_thread;k++) {
-                __mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid*cache_meta->layer_len + k*nb*sizeof(block_q4_0);
+            // for(int k = 0;k < weight_rows_cur_thread;k++) {
+            for (int k = weight_start_row; k < weight_end_row; ++k) {
+                __mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid * cache_meta->layer_len + k * nb * sizeof(block_q4_0);
                 mram2wram(pweight, pweight_cache, sizeof(block_q4_0)*nb);
 #if PRINT
                 if (k % 64 == 0) {
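The core of the multithread change is this even split of each DPU's rows across NR_TASKLETS tasklets: every tasklet walks only [weight_start_row, weight_end_row). A host-compilable sketch of the arithmetic, with a hypothetical row count (the rest-rows remainder is still unhandled, per the TODO above):

    #include <stdint.h>
    #include <stdio.h>

    #define NR_TASKLETS 8

    int main(void) {
        uint16_t rows_per_dpu = 512; // hypothetical; would come from cache_meta->rows_per_dpu
        uint16_t rows_per_thread = rows_per_dpu / NR_TASKLETS;
        for (unsigned id = 0; id < NR_TASKLETS; ++id) {
            unsigned start = id * rows_per_thread;  // weight_start_row
            unsigned end = start + rows_per_thread; // weight_end_row
            printf("tasklet %u: rows [%u, %u)\n", id, start, end);
        }
        return 0;
    }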

dpu/pim_build.sh

1 addition, 1 deletion
@@ -1,2 +1,2 @@
 #!/bin/bash
-dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=1 -DBL=11 -o gemv_dpu dpu_main.c
+dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=8 -DBL=11 -o gemv_dpu dpu_main.c
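NR_TASKLETS is the standard UPMEM compile-time knob; raising it from 1 to 8 makes the runtime launch eight tasklets per DPU, and the same macro sizes the barrier in the DPU code. A sketch of the matching declaration (dpu_main.c presumably declares my_barrier along these lines):

    #include <barrier.h>

    // BARRIER_INIT picks up the same -DNR_TASKLETS value passed by pim_build.sh,
    // so one compile flag controls both the tasklet count and the barrier width.
    BARRIER_INIT(my_barrier, NR_TASKLETS);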

examples/tensor/ts.cpp

33 additions, 15 deletions
@@ -1,6 +1,7 @@
 #include "trace_driver.h"
 #include <iostream>
 #include <iomanip>
+#include <chrono>
 
 #define NR_DPUS 8
 #define NR_LAYER 2
@@ -24,6 +25,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
     pim_offset += sizeof(ggml_table_f32_f16);
 
+    std::cout << "ggml_table_f32_f16 len = " << sizeof(ggml_table_f32_f16) << std::endl;
+
     // Transfer pim_metadata into DPUs
     context->pim_metadata.layer_num = NR_LAYER;
     context->pim_metadata.weight_type = (uint16_t)(w->type);
@@ -36,6 +39,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
     context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
     context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
 
+    std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
+
     //Todo: NR_DPUS contexts are dispatched to different dpus(rest row is different on different dpu)
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
     pim_offset += sizeof(struct pim_meta);
@@ -50,10 +55,10 @@
         uint32_t prev_rows_dpu = i * context->pim_metadata.rows_per_dpu;
 
         // every dpu's data
-        DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu*size_per_row));
+        DPU_ASSERT(dpu_prepare_xfer(dpu, ((unsigned char *)w->data) + prev_rows_dpu * size_per_row));
     }
 
-    DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len*layeridx, layer_len, DPU_XFER_DEFAULT));
+    DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
     }
 
     // Transfer input into DPUs
@@ -72,8 +77,14 @@
     DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
     input_offset += bclen;
 
+    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
     // Launch DPU kernel
     DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
+    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
+
+    std::cout << "Execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
 
     // Check results
     float *mul_mat_res = (float *)res->data;
@@ -105,23 +116,30 @@ int main(int argc, char** argv) {
     struct ggml_tensor * ts_bq = tensor_import(filenamebq);
     struct ggml_tensor * ts_c = tensor_import(filenamec);
     struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p);
-    std::cout<<"ts_a:"<<std::endl;
-    dump_tensor(ts_a, stdout);
-    std::cout<<"ts_b:"<<std::endl;
-    dump_tensor(ts_b, stdout);
-    std::cout<<"ts_bq:"<<std::endl;
-    dump_tensor(ts_bq, stdout);
-    std::cout<<"ts_c:"<<std::endl;
-    dump_tensor(ts_c, stdout);
-    std::cout<<"ts_c_pim:"<<std::endl;
-    dump_tensor(ts_c_pim, stdout);
-
+    // std::cout<<"ts_a:"<<std::endl;
+    // dump_tensor(ts_a, stdout);
+    // std::cout<<"ts_b:"<<std::endl;
+    // dump_tensor(ts_b, stdout);
+    // std::cout<<"ts_bq:"<<std::endl;
+    // dump_tensor(ts_bq, stdout);
+    // std::cout<<"ts_c:"<<std::endl;
+    // dump_tensor(ts_c, stdout);
+    // std::cout<<"ts_c_pim:"<<std::endl;
+    // dump_tensor(ts_c_pim, stdout);
+
+    std::cout << "ts_a: " << std::endl;
+    print_tensor(ts_a, stdout);
+    std::cout << "ts_b: " << std::endl;
+    print_tensor(ts_b, stdout);
 
     gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
-    std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
-    dump_tensor(ts_c_pim, stdout);
+    // std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
+    // dump_tensor(ts_c_pim, stdout);
 
     float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
     std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
+
+    std::cout << "error between c and c_pim:" << std::endl;
+    compare_tensor(ts_c, ts_c_pim);
     return 0;
 }
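The timing added here uses high_resolution_clock; for measuring an interval, steady_clock is the usual recommendation because it is guaranteed monotonic. A drop-in standalone variant (plain C++, nothing project-specific; the loop is a stand-in for the DPU launch):

    #include <chrono>
    #include <iostream>

    int main() {
        auto t0 = std::chrono::steady_clock::now();
        volatile long sink = 0;
        for (long i = 0; i < 10000000; ++i) sink += i; // stand-in for dpu_launch()
        auto t1 = std::chrono::steady_clock::now();

        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
        std::cout << "Execution time: " << ms << " ms" << std::endl;
        return 0;
    }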

include/trace_driver.h

3 additions, 0 deletions
@@ -20,6 +20,9 @@ extern "C" {
 void tensor_export(const struct ggml_tensor * tensor, const char * fname);
 struct ggml_tensor * tensor_import(const char * fname);
 void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout);
+void compare_tensor(const struct ggml_tensor *a, const struct ggml_tensor *b);
+void print_q4_tensor(const struct ggml_tensor *a);
+void print_tensor(const struct ggml_tensor *tensor, FILE *fout);
 void dump_tensor(const struct ggml_tensor * tensor, FILE * fout);
 
 float mul_add_q4_0_q8_0(struct ggml_tensor * a, struct ggml_tensor * b);

src/trace_driver.c

86 additions, 0 deletions
@@ -1,5 +1,8 @@
 #include <stdlib.h>
 #include <stdio.h>
+#include <math.h>    // fabsf(), used in compare_tensor()
+#include <assert.h>  // assert(), used in compare_tensor()
+#include "ggml-quants.h"
 #include "trace_driver.h"
 
 void tensor_export(const struct ggml_tensor * tensor, const char * fname) {
@@ -167,6 +170,92 @@ void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout)
 }
 }
 
+void compare_tensor(const struct ggml_tensor *a, const struct ggml_tensor *b) {
+    assert(a->type == b->type);
+    assert(ggml_nelements(a) == ggml_nelements(b));
+
+    int nelems = ggml_nelements(a);
+
+    float max_err = 0.0f;
+    float min_err = 100000.0f;
+    float total_err = 0.0f;
+    float avg_err = 0.0f;
+    float *a_f32;
+    float *b_f32;
+    switch (a->type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16: { // note: the raw data is reinterpreted as float, which is only correct for F32
+            a_f32 = (float *)a->data;
+            b_f32 = (float *)b->data;
+            break;
+        }
+        case GGML_TYPE_Q4_0: {
+            void *a_data = a->data;
+            void *b_data = b->data;
+
+            a_f32 = malloc(nelems * sizeof(float));
+            b_f32 = malloc(nelems * sizeof(float));
+
+            dequantize_row_q4_0(a_data, a_f32, nelems);
+            dequantize_row_q4_0(b_data, b_f32, nelems);
+            break;
+        }
+        case GGML_TYPE_Q8_0: {
+            void *a_data = a->data;
+            void *b_data = b->data;
+
+            a_f32 = malloc(nelems * sizeof(float));
+            b_f32 = malloc(nelems * sizeof(float));
+
+            dequantize_row_q8_0(a_data, a_f32, nelems);
+            dequantize_row_q8_0(b_data, b_f32, nelems);
+            break;
+        }
+        default: {
+            assert(false);
+        }
+    }
+
+    for (int i = 0; i < nelems; ++i) {
+        float err = fabsf(a_f32[i] - b_f32[i]); // fabsf, not abs(): abs() would truncate the float difference to int
+        total_err += err;
+        if (max_err < err) {
+            max_err = err;
+        }
+        if (min_err > err) {
+            min_err = err;
+        }
+    }
+
+    avg_err = total_err / nelems;
+
+    printf("max_error = %.5f, min_error = %.5f, total_error = %.5f, avg_error = %.5f\n", max_err, min_err, total_err, avg_err);
+}
+
+void print_q4_tensor(const struct ggml_tensor *a) {
+    void *d4_data = a->data;
+    int nelems = ggml_nelements(a);
+    printf("nelems = %d\n", nelems);
+
+    float *fp32_data = malloc(nelems * sizeof(float));
+    dequantize_row_q4_0(d4_data, fp32_data, nelems);
+
+    for (int i = 0; i < 10; ++i) {
+        printf("fp32_data[%d] = %f\n", i, fp32_data[i]);
+    }
+    free(fp32_data);
+}
+
+void print_tensor(const struct ggml_tensor *tensor, FILE *fout) {
+    const int64_t *ne = tensor->ne;
+    const size_t *nb = tensor->nb;
+
+    fprintf(fout, "shape = {%lld, %lld, %lld, %lld}, stride = {%zu, %zu, %zu, %zu}\n",
+            (long long)ne[0], (long long)ne[1], (long long)ne[2], (long long)ne[3],
+            nb[0], nb[1], nb[2], nb[3]);
+}
+
 void dump_tensor(const struct ggml_tensor * tensor, FILE * fout) {
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;
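Taken together, the new helpers give a small verification workflow: import two tensors, print their shapes, and report the elementwise error. A hypothetical caller (file names are placeholders; the functions are the ones declared in trace_driver.h):

    #include <stdio.h>
    #include "trace_driver.h"

    int main(void) {
        // placeholders for a reference result and the PIM-computed result
        struct ggml_tensor *ref = tensor_import("tensor_c.bin");
        struct ggml_tensor *pim = tensor_import("tensor_c_pim.bin");

        print_tensor(ref, stdout);  // shape/stride summary
        compare_tensor(ref, pim);   // prints max/min/total/avg absolute error
        return 0;
    }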
