
Commit 9cd4190

Author: Yinan (committed)
Bug fix for DPU kernel weight offset and layer id for multiple layers.
Parent: 093131b

File tree (4 files changed: +29 -23 lines)

  dpu/dpu_main.c
  ggml/src/ggml.c
  include/llama.h
  src/llama.cpp

dpu/dpu_main.c (16 additions, 15 deletions)

@@ -107,8 +107,9 @@ int main() {
     ptable_f32_f16 = (__mram_ptr float *)DPU_MRAM_HEAP_POINTER;
     uint32_t table_f32_f16_len = (1 << 16)*sizeof(float);
     uint32_t offset = table_f32_f16_len;
-    int input_row_size,input_cols;
-    float *psumf;
+    int input_row_size = 0;
+    int input_cols = 0;
+    float *psumf = NULL;
 
 #if PRINT
     printf("table_f32_f16_len=%d\n",table_f32_f16_len);
@@ -124,7 +125,7 @@ int main() {
     mram_read((__mram_ptr void const*) (weightmetadatabase), cache_meta, sizeof(struct pim_meta));
 
 #if PRINT
-    printf("layer_num: %d, weight_type=%d,rows_per_dpu=%d,rest_rows=%d,input_offset=%d",
+    printf("layer_num: %d, weight_type=%d, rows_per_dpu=%d, rest_rows=%d, input_offset=%d",
         cache_meta->layer_num,cache_meta->weight_type,cache_meta->rows_per_dpu,cache_meta->rest_rows,cache_meta->input_offset);
 #endif
 
@@ -142,14 +143,15 @@ int main() {
     //input metadata
     offset += (cache_meta->layer_len * cache_meta->layer_num);
 #if PRINT
-    printf("layer_len=%d,offset=%d\n",cache_meta->layer_len,offset);
+    printf("layer_len=%d, input metadata offset=%d\n",cache_meta->layer_len,offset);
 #endif
     uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
     pim_matrix_des *pinputcache = (pim_matrix_des *) mem_alloc(sizeof(pim_matrix_des));
     mram_read((__mram_ptr void const*) (inputmetadatabase), pinputcache, sizeof(pim_matrix_des));
     input_cols = pinputcache->ne[1];
+    assert(input_cols == 1 && "Only support vector as input.");
 #if PRINT
-    printf("input_type=%d,layerID=%d\n",pinputcache->type,pinputcache->layerid);
+    printf("input_type=%d, layerID=%d\n",pinputcache->type,pinputcache->layerid);
     for(int nn=0;nn<GGML_MAX_DIMS;nn++) {
         printf("ne[%d]=%lld\n",nn,pinputcache->ne[nn]);
     }
@@ -165,19 +167,19 @@ int main() {
     int nb = pinputcache->ne[0]/QK8_0;
     int qk = QK8_0;
     input_row_size = nb*sizeof(block_q8_0);
-    __mram_ptr block_q4_0 *pweight_base = (__mram_ptr block_q4_0 *)(weightmetadatabase + sizeof(struct pim_meta));
-    __mram_ptr block_q8_0 *pinput_base = (__mram_ptr block_q8_0 *)(DPU_MRAM_HEAP_POINTER + cache_meta->input_offset + sizeof(pim_matrix_des));
+    __mram_ptr void *pweight_base = (__mram_ptr void *)(weightmetadatabase + sizeof(struct pim_meta));
+    __mram_ptr void *pinput_base = DPU_MRAM_HEAP_POINTER + cache_meta->input_offset + sizeof(pim_matrix_des);
     psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
     memset(psumf, 0 ,sizeof(float)*input_cols*weight_rows_cur_thread);
 #if PRINT
-    printf("input_cols=%d,rows_cur_thread=%d,nb=%d,input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
+    printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
 #endif
     block_q4_0 *pweight_cache = (block_q4_0 *) mem_alloc(sizeof(block_q4_0)*nb);
     block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
 
     // weight_rows_cur_thread = 16;
     for(int l = 0;l < input_cols;l++) {
-        __mram_ptr block_q8_0 *pinput = pinput_base + l*nb;
+        __mram_ptr block_q8_0 *pinput = pinput_base + l*nb*sizeof(block_q8_0);
         mram2wram(pinput, pinput_cache, sizeof(block_q8_0)*nb);
 #if PRINT
         printf("input:\n");
@@ -191,8 +193,7 @@ int main() {
         printf("pweight_base: %p\n", pweight_base);
 #endif
         for(int k = 0;k < weight_rows_cur_thread;k++) {
-            //block_q4_0 *pqlayer0weight = (block_q4_0 *)(weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len*k);
-            __mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid*cache_meta->layer_len + k*nb;
+            __mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid*cache_meta->layer_len + k*nb*sizeof(block_q4_0);
             mram2wram(pweight, pweight_cache, sizeof(block_q4_0)*nb);
 #if PRINT
             if (k % 64 == 0) {
@@ -207,11 +208,10 @@ int main() {
 #endif
 
             for (int i = 0; i < nb; i++) {
-                //printf("input_col:%d,weight_row:%d\n",l,k);
+                //printf("input_col:%d, current inner weight row idx:%d\n",l,k);
 
                 int sumi = 0;
                 for (int j = 0; j < qk/2; ++j) {
-                    //printf("nb:%d,qk=%d,qs=%d\n",i,j,pweight_cache[i].qs[j]);
                     const int v0 = (pweight_cache[i].qs[j] & 0x0F) - 8;
                     const int v1 = (pweight_cache[i].qs[j] >> 4) - 8;
 
@@ -230,9 +230,10 @@ int main() {
         printf("psumf[%d]=%f\n",iii,psumf[iii]);
     }
 
-    printf("offset=%d\n",offset);
+    printf("output offset=%d\n",offset);
 #endif
     // Write C Matrix to current MRAM block
-    wram2mram((__mram_ptr void *) (DPU_MRAM_HEAP_POINTER + offset),psumf,sizeof(float)*input_cols*weight_rows_cur_thread);
+    // Note: with input_cols > 1, the results should be rearranged on host
+    wram2mram((__mram_ptr void *) (DPU_MRAM_HEAP_POINTER + offset), psumf, sizeof(float)*input_cols*weight_rows_cur_thread);
     return 0;
 }
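
The weight-offset fix above turns pweight_base into an untyped MRAM pointer and scales the row term by sizeof(block_q4_0), so that the per-layer offset (layer_len, a byte count) and the per-row offset compose in the same unit. A minimal standalone sketch of that arithmetic, with a hypothetical helper name and the assumption (as in the kernel) that nb is the number of block_q4_0 blocks per weight row:

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper, not part of the commit: byte offset of weight row
 * `row` in layer `layerid`, relative to the first layer's weight base.
 * Both terms are byte counts, mirroring the corrected kernel expression
 * pweight_base + layerid*layer_len + k*nb*sizeof(block_q4_0). */
static inline size_t pim_weight_byte_offset(uint32_t layerid, size_t layer_len,
                                            uint32_t row, size_t nb,
                                            size_t block_q4_0_size) {
    return (size_t)layerid * layer_len           /* skip whole layers (bytes) */
         + (size_t)row * nb * block_q4_0_size;   /* skip rows within a layer  */
}
```

With the previous typed block_q4_0 * base, pointer arithmetic scaled the byte-valued layer_len term by sizeof(block_q4_0) a second time, which appears to be the weight-offset bug named in the commit message.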

ggml/src/ggml.c (6 additions, 2 deletions)

@@ -12541,7 +12541,8 @@ static void ggml_compute_forward_mul_mat(
     // compute by src0 rows
 
     // export the first gemv's tensor
-    if (type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32 &&
+    if (dst->flags & GGML_TENSOR_FLAG_PIM &&
+        type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32 &&
         ne00 == 4096 && ne01 == 4096 &&
         ne02 == 1 && ne03 == 1 &&
         ne10 == 4096 && ne11 == 1 &&
@@ -12613,7 +12614,7 @@ UseGgmlGemm1:;
     }
 
     if ((dst->flags & GGML_TENSOR_FLAG_PIM)) {
-        dpu_launch_gemv_async(src1, wdata, src0, dst, 0);
+        dpu_launch_gemv_async(src1, wdata, src0, dst, dst->layerid);
         dpu_kernel_barrier(*(dst->dpu_set));
 
         pim_res->type = dst->type;
@@ -17405,6 +17406,9 @@ static __inline__ int dpu_get_gemv_res(struct ggml_tensor *input, struct ggml_te
     dpu_get_nr_dpus(dpu_set, &nr_dpus);
     int rows_per_dpu = w->ne[1] / nr_dpus;
 
+    // Only support vector as input
+    GGML_ASSERT(input->ne[1] == 1);
+
     uint32_t i;
     DPU_FOREACH(dpu_set, dpu, i) {
         DPU_ASSERT(dpu_prepare_xfer(dpu, mul_max_res + i * rows_per_dpu*input->ne[1]));
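
The GGML_ASSERT(input->ne[1] == 1) added in dpu_get_gemv_res matches the assert added in the DPU kernel: the result transfer above assumes each DPU's partial outputs sit contiguously in the host buffer. A hedged sketch of that layout using the names from the diff (the helper itself is not part of the commit):

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: where DPU `dpu_idx` deposits its partial results in the
 * host buffer, matching the dpu_prepare_xfer offset above. With a single
 * input column (the asserted case) each DPU's rows_per_dpu outputs land back
 * to back in row order; with more columns the per-DPU blocks would have to be
 * rearranged on the host, as the note in dpu/dpu_main.c also says. */
static inline float *gemv_result_slot(float *mul_max_res, uint32_t dpu_idx,
                                      int rows_per_dpu, int input_cols) {
    return mul_max_res + (size_t)dpu_idx * rows_per_dpu * input_cols;
}
```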

include/llama.h (3 additions, 0 deletions)

@@ -424,6 +424,9 @@ extern "C" {
                      struct llama_context_params params);
 
 #ifdef PIM_KERNEL
+#define NR_DPUS 64
+#define NR_LAYER 2
+#define DPU_BINARY "./dpu/gemv_dpu"
     enum WeightId {
         WQ,
         WCNT
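
NR_DPUS and DPU_BINARY move out of llama_load2dpu (they are deleted there in the src/llama.cpp hunks below) and into the public header alongside the new NR_LAYER, so the DPU loader and the graph builder share one set of limits. A rough usage sketch, assuming the loader brings the DPUs up through the standard UPMEM host API; the surrounding function is illustrative, not taken from the commit:

```c
#include <dpu.h>       /* UPMEM host API: dpu_alloc, dpu_load, DPU_ASSERT */
#include "llama.h"     /* NR_DPUS, NR_LAYER, DPU_BINARY (PIM_KERNEL builds) */

#ifdef PIM_KERNEL
/* Illustrative bring-up: allocate the DPU set and load the GEMV kernel using
 * the constants now exposed by include/llama.h. */
static int pim_bring_up(struct dpu_set_t *dpu_set) {
    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, dpu_set));    /* 64 DPUs        */
    DPU_ASSERT(dpu_load(*dpu_set, DPU_BINARY, NULL)); /* ./dpu/gemv_dpu */
    return 0;
}
#endif
```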

src/llama.cpp (4 additions, 6 deletions)

@@ -9348,7 +9348,7 @@ int load_weight2dpu(enum WeightId w_id, struct dpu_set_t dpu_set, struct llama_m
     uint32_t nr_dpus;
     dpu_get_nr_dpus(dpu_set, &nr_dpus);
     ggml_tensor *w = NULL;
-    for (uint32_t layeridx = 0; layeridx < 1; layeridx++) {
+    for (uint32_t layeridx = 0; layeridx < pim_metadata->layer_num; layeridx++) {
         switch (w_id) {
         case WQ:
             w = model->layers[layeridx].wq;
@@ -9366,7 +9366,7 @@ int load_weight2dpu(enum WeightId w_id, struct dpu_set_t dpu_set, struct llama_m
     uint32_t layer_len = pim_metadata->layer_len;
     uint32_t i;
 
-    printf("%s: size_per_row: %d, rows_per_dpu: %d, offset_base: %d, layer_len: %d\n", __FUNCTION__, size_per_row, pim_metadata->rows_per_dpu, offset_base, layer_len);
+    printf("%s: size_per_row: %d, rows_per_dpu: %d, offset_base (the 1st weight base): %d, layer_len: %d, layer_id: %d/%d\n", __FUNCTION__, size_per_row, pim_metadata->rows_per_dpu, offset_base, layer_len, layeridx, pim_metadata->layer_num);
 
     // row is send to dpu
     DPU_FOREACH(dpu_set, dpu, i) {
@@ -9382,8 +9382,6 @@ int load_weight2dpu(enum WeightId w_id, struct dpu_set_t dpu_set, struct llama_m
 }
 
 int llama_load2dpu(struct llama_context *ctx, struct llama_model *model) {
-#define NR_DPUS 64
-#define DPU_BINARY "./dpu/gemv_dpu"
     uint32_t nr_of_dpus;
     uint32_t pim_offset = 0;
     int i;
@@ -9409,7 +9407,7 @@ int llama_load2dpu(struct llama_context *ctx, struct llama_model *model) {
     // WQ metadata is loaded to dpu WRAM, make WQ's param in every layer is same
 
     //uint32_t n_layer = model->layers.size();
-    uint32_t n_layer = 1;
+    uint32_t n_layer = NR_LAYER;
     uint32_t il = 0;
     dpu_get_nr_dpus(pqcontext->dpu_set, &nr_of_dpus);
     pqcontext->pim_metadata.layer_num = n_layer;
@@ -10755,7 +10753,7 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 
-                if (il == 0 && Qcur->op == GGML_OP_MUL_MAT && n_tokens == 1) {
+                if (il < NR_LAYER && Qcur->op == GGML_OP_MUL_MAT && n_tokens == 1) {
                     Qcur->flags |= GGML_TENSOR_FLAG_PIM;
                     Qcur->dpu_set = &(lctx.pim_context_map[WQ]->dpu_set);
                     Qcur->inout_offset = (lctx.pim_context_map[WQ]->pim_metadata).input_offset;
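
The last hunk widens the PIM gate from layer 0 only to the first NR_LAYER layers. Written out as a predicate (a hypothetical helper using the same names as the diff), the condition is:

```c
#include <stdbool.h>
#include <stdint.h>
#include "ggml.h"    /* struct ggml_tensor, GGML_OP_MUL_MAT */
#include "llama.h"   /* NR_LAYER (PIM_KERNEL builds) */

/* Hypothetical predicate, not in the commit: a layer's Q projection is routed
 * to the PIM path only when its weights were loaded to the DPUs
 * (il < NR_LAYER), the node is a mat-mul, and a single token is being
 * decoded, i.e. the operation is a GEMV. */
static bool qcur_goes_to_pim(uint32_t il, const struct ggml_tensor *Qcur,
                             int n_tokens) {
    return il < NR_LAYER
        && Qcur->op == GGML_OP_MUL_MAT
        && n_tokens == 1;
}
```

The n_tokens == 1 check keeps the offload to the matrix-vector case, which is also what the DPU kernel and dpu_get_gemv_res now assert.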
