Skip to content

Commit c9a3797

Browse files
committed
添加测试cpu的执行时间 (Add timing test for CPU execution)
1 parent 4541374 commit c9a3797

File tree

1 file changed

+98
-25
lines changed

1 file changed

+98
-25
lines changed

examples/tensor/ts.cpp

Lines changed: 98 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
#include <iomanip>
44
#include <chrono>
55

6-
#define NR_DPUS 8
6+
#include <vector>
7+
8+
#define NR_DPUS 2048
79
#define NR_LAYER 2
810
#define DPU_BINARY "./dpu/gemv_dpu"
911

@@ -21,6 +23,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
2123
uint32_t pim_offset = 0;
2224
struct dpu_set_t dpu;
2325

26+
std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
27+
2428
//ggml_table_f32_f16 tbl is transferred to pim
2529
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
2630
pim_offset += sizeof(ggml_table_f32_f16);
@@ -39,7 +43,7 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
3943
context->pim_metadata.layer_len = w->nb[1] * (context->pim_metadata.rows_per_dpu);
4044
context->pim_metadata.input_offset = sizeof(ggml_table_f32_f16) + sizeof(struct pim_meta) + context->pim_metadata.layer_len * NR_LAYER;
4145

42-
std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
46+
// std::cout << "layer_num = " << NR_LAYER << ", weight_type = " << (uint16_t)(w->type) << ", rows_per_dpu = " << w->ne[1] / NR_DPUS << ", rest_rows = " << w->ne[1] % NR_DPUS << ", layer_len = " << context->pim_metadata.layer_len << ", input_offset = " << context->pim_metadata.input_offset << std::endl;
4347

4448
//Todo: NR_DPUS contexts are dispatched to different dpus(rest row is different on different dpu)
4549
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, &(context->pim_metadata), sizeof(struct pim_meta), DPU_XFER_DEFAULT));
@@ -61,6 +65,15 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
6165
DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, pim_offset + layer_len * layeridx, layer_len, DPU_XFER_DEFAULT));
6266
}
6367

68+
std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
69+
70+
std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
71+
72+
std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
73+
std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
74+
75+
ex_tp1 = std::chrono::high_resolution_clock::now();
76+
6477
// Transfer input into DPUs
6578
pim_matrix_des input_descript;
6679
input_descript.type = (int32_t)in_q->type;
@@ -77,14 +90,22 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
7790
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, in_q->data, bclen, DPU_XFER_DEFAULT));
7891
input_offset += bclen;
7992

80-
std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
93+
ex_tp2 = std::chrono::high_resolution_clock::now();
94+
95+
dur = ex_tp2 - ex_tp1;
96+
97+
std::cout << "dpu: in_q传输用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
98+
std::cout << "dpu: in_q传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
99+
100+
ex_tp1 = std::chrono::high_resolution_clock::now();
81101
// Launch DPU kernel
82102
DPU_ASSERT(dpu_launch(context->dpu_set, DPU_SYNCHRONOUS));
83-
std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
103+
ex_tp2 = std::chrono::high_resolution_clock::now();
84104

85-
std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
105+
dur = ex_tp2 - ex_tp1;
86106

87-
std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
107+
std::cout << "dpu: 执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
108+
std::cout << "dpu: 执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
88109

89110
// Check results
90111
float *mul_mat_res = (float *)res->data;
@@ -96,6 +117,49 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
96117
return 0;
97118
}
98119

120+
121+
void gemv_cpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res_comp) {
    // Reference CPU GEMV: copy the weight tensor `w` and the quantized input
    // `in_q` into a scratch ggml context, compute C = mul_mat(A, B) on the CPU,
    // time the computation, print the result, and compare it against
    // `res_comp` (presumably the DPU-produced tensor — confirm with caller).
    // `context` is unused here; it is kept so the signature mirrors
    // gemv_dpu_kernel.
    (void)context;

    // Scratch context sized for the tensors plus the compute graph.
    ggml_init_params params = {.mem_size = 256*1024*1024};
    ggml_context* ctx = ggml_init(params);

    // Create tensors with the same shape as the inputs (previously hard-coded
    // to 4096x4096) so the kernel works for any GEMV problem size. The asserts
    // below still verify that the shapes match exactly.
    ggml_tensor* A = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, w->ne[0], w->ne[1]);
    ggml_tensor* B = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, in_q->ne[0], in_q->ne[1]);

    assert(A->ne[0] == w->ne[0] && A->ne[1] == w->ne[1] && A->ne[2] == w->ne[2] && A->ne[3] == w->ne[3]);
    assert(B->ne[0] == in_q->ne[0] && B->ne[1] == in_q->ne[1] && B->ne[2] == in_q->ne[2] && B->ne[3] == in_q->ne[3]);

    // Raw byte copy is valid because source and destination have identical
    // shape and quantization type.
    memcpy(A->data, w->data, ggml_nbytes(w));
    memcpy(B->data, in_q->data, ggml_nbytes(in_q));

    // Build the compute graph: C = A * B.
    ggml_tensor* C = ggml_mul_mat(ctx, A, B);
    ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, C);

    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
    // Run the graph with 64 threads. (The original comment claimed 4 threads,
    // contradicting the actual argument — fixed here.)
    ggml_graph_compute_with_ctx(ctx, gf, 64);
    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();

    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;

    // Report elapsed time in both microseconds and milliseconds.
    std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
    std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;

    // Dump the CPU result and its deviation from the reference tensor.
    print_tensor(C, stdout);

    std::cout << "error between cpu and dpu before gemv:" << std::endl;
    compare_tensor(C, res_comp);

    // Release the scratch context (frees A, B, C and the graph with it).
    ggml_free(ctx);
}
162+
99163
int main(int argc, char** argv) {
100164
// init fp table for fp16 dump
101165
fp_table_init();
@@ -106,15 +170,15 @@ int main(int argc, char** argv) {
106170
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &pqcontext->dpu_set));
107171
DPU_ASSERT(dpu_load(pqcontext->dpu_set, DPU_BINARY, NULL));
108172

109-
const char* filenamea = "tensor-files/a.tensor";
110-
const char* filenameb = "tensor-files/b.tensor";
111-
const char* filenamebq = "tensor-files/b_quant.tensor";
112-
const char* filenamec = "tensor-files/c.tensor";
173+
const char* filenamea = "tensor-files/a.tensor";
174+
const char* filenameb = "tensor-files/b.tensor";
175+
const char* filenamebq = "tensor-files/b_quant.tensor";
176+
const char* filenamec = "tensor-files/c.tensor";
113177
const char* filenamec_p = "tensor-files/c_pim.tensor";
114-
struct ggml_tensor * ts_a = tensor_import(filenamea);
115-
struct ggml_tensor * ts_b = tensor_import(filenameb);
116-
struct ggml_tensor * ts_bq = tensor_import(filenamebq);
117-
struct ggml_tensor * ts_c = tensor_import(filenamec);
178+
struct ggml_tensor * ts_a = tensor_import(filenamea);
179+
struct ggml_tensor * ts_b = tensor_import(filenameb);
180+
struct ggml_tensor * ts_bq = tensor_import(filenamebq);
181+
struct ggml_tensor * ts_c = tensor_import(filenamec);
118182
struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p);
119183
// std::cout<<"ts_a:"<<std::endl;
120184
// dump_tensor(ts_a, stdout);
@@ -126,20 +190,29 @@ int main(int argc, char** argv) {
126190
// dump_tensor(ts_c, stdout);
127191
// std::cout<<"ts_c_pim:"<<std::endl;
128192
// dump_tensor(ts_c_pim, stdout);
193+
// #define IS_CONTIGUOUS(t, tn) {\
194+
// if(ggml_is_contiguous(t)) printf("%s is contiguous\n", tn);\
195+
// }\
129196
130-
std::cout << "ts_a: " << std::endl;
131-
print_tensor(ts_a, stdout);
132-
std::cout << "ts_b: " << std::endl;
133-
print_tensor(ts_b, stdout);
197+
// IS_CONTIGUOUS(ts_a, "ts_a");
198+
// IS_CONTIGUOUS(ts_b, "ts_b");
199+
// IS_CONTIGUOUS(ts_bq, "ts_bq");
200+
// IS_CONTIGUOUS(ts_c, "ts_c");
201+
// IS_CONTIGUOUS(ts_c_pim, "ts_ac_pim");
202+
// #undef IS_CONTIGUOUS
134203

135-
gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
136-
// std::cout<<"ts_c_pim calculated by DPUs:"<<std::endl;
137-
// dump_tensor(ts_c_pim, stdout);
138204

139-
float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
140-
std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
205+
// dpu code
206+
// gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
207+
// std::cout << "error between c and c_pim:" << std::endl;
208+
// compare_tensor(ts_c, ts_c_pim);
209+
210+
// cpu code
211+
gemv_cpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
212+
213+
// float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
214+
// std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;
141215

142-
std::cout << "error between c and c_pim:" << std::endl;
143-
compare_tensor(ts_c, ts_c_pim);
216+
144217
return 0;
145218
}

0 commit comments

Comments
 (0)