|
| 1 | +#include "trace_driver.h" |
| 2 | +#include <iostream> |
| 3 | +#include <iomanip> |
| 4 | +#include <chrono> |
| 5 | + |
| 6 | +extern "C" { |
| 7 | +#include "../../PIM-tensorStore/host/pim_llm.h" |
| 8 | +} |
| 9 | + |
| 10 | + |
// Number of DPUs requested from dpu_alloc(); weight rows are split evenly across them.
#define NR_DPUS 512
// Not referenced in the visible part of this file.
#define NR_LAYER 2
// Path of the DPU program passed to dpu_load().
#define DPU_BINARY "./PIM-tensorStore/build/dpu_task"
// When defined, gemv_dpu_kernel() and the DPU path in main() are compiled in.
#define PIM_KERNEL

// Product lookup table indexed as [i][j]; filled by mul_table_int4_int8_init()
// with (i - 8) * (j + INT8_MIN) and broadcast to the DPU symbol of the same name.
int16_t mul_table_int4_int8[1<<4][1<<8];
| 17 | + |
| 18 | +void fp_table_init(void) { |
| 19 | + for (int i = 0; i < (1 << 16); ++i) { |
| 20 | + union { |
| 21 | + uint16_t u16; |
| 22 | + ggml_fp16_t fp16; |
| 23 | + } u = {i}; |
| 24 | + ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); |
| 25 | + } |
| 26 | +} |
| 27 | + |
| 28 | +void mul_table_int4_int8_init(void) { |
| 29 | + for(int i = 0; i < (1 << 4); ++i){ |
| 30 | + for(int j = 0; j< (1 << 8); ++j){ |
| 31 | + mul_table_int4_int8[i][j] = (i - 8) * (j + INT8_MIN); |
| 32 | + } |
| 33 | + } |
| 34 | +} |
| 35 | + |
| 36 | +#ifdef PIM_KERNEL |
| 37 | +int gemv_dpu_kernel(struct dpu_set_t dpu_set, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res) { |
| 38 | + struct dpu_set_t dpu; |
| 39 | + |
| 40 | + std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now(); |
| 41 | + |
| 42 | + DPU_ASSERT(dpu_broadcast_to(dpu_set, "mul_table_int4_int8", 0, (void *)(mul_table_int4_int8), sizeof(mul_table_int4_int8), DPU_XFER_DEFAULT)); |
| 43 | + //ggml_table_f32_f16 tbl is transferred to pim |
| 44 | + |
| 45 | + all_dpu_mm_reset(); |
| 46 | + remote_ptr table_f32_f16_pim_ptr = all_dpu_alloc(sizeof(ggml_table_f32_f16)); |
| 47 | + assert(table_f32_f16_pim_ptr.dpu_id == ALL_DPU && table_f32_f16_pim_ptr.dpu_addr == FREE_STORAGE_OFFSET); |
| 48 | + dpu_broadcast_direct(dpu_set, table_f32_f16_pim_ptr, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16)); |
| 49 | + // DPU_ASSERT(dpu_broadcast_to(dpu_set, "table_f32_f16", 0, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT)); |
| 50 | + std::cout << "ggml_table_f32_f16 len = " << sizeof(ggml_table_f32_f16) << std::endl; |
| 51 | + |
| 52 | + assert(w->ne[1] % NR_DPUS == 0); |
| 53 | + |
| 54 | + remote_ptr w_pim_ptr = all_dpu_alloc(w->nb[1] * (w->ne[1] / NR_DPUS)); |
| 55 | + assert(w_pim_ptr.dpu_id == ALL_DPU && w_pim_ptr.dpu_addr == FREE_STORAGE_OFFSET + sizeof(ggml_table_f32_f16)); |
| 56 | + |
| 57 | + void *src_w_ptrs[NR_DPUS]; |
| 58 | + for (int i = 0; i < NR_DPUS; i++) |
| 59 | + { |
| 60 | + src_w_ptrs[i] = (void *)((unsigned char *)w->data + i * w->nb[1] * (w->ne[1] / NR_DPUS)); |
| 61 | + } |
| 62 | + |
| 63 | + dpu_send_direct(dpu_set, w_pim_ptr, src_w_ptrs, w->nb[1] * (w->ne[1] / NR_DPUS)); |
| 64 | + |
| 65 | + std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now(); |
| 66 | + |
| 67 | + std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1; |
| 68 | + |
| 69 | + std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl; |
| 70 | + std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl; |
| 71 | + |
| 72 | + ex_tp1 = std::chrono::high_resolution_clock::now(); |
| 73 | + |
| 74 | + msg_block_des msg_gemv; |
| 75 | + printf("%d\n", table_f32_f16_pim_ptr.dpu_addr); |
| 76 | + msg_block_builder_op_gemv_q4_q8(&msg_gemv, w_pim_ptr, w->ne[0], w->ne[1] / NR_DPUS, in_q->ne[0], in_q->data, in_q->nb[1], table_f32_f16_pim_ptr); |
| 77 | + |
| 78 | + msg_buffer buffer; |
| 79 | + msg_buffer_init(&buffer); |
| 80 | + msg_buffer_clear(&buffer); |
| 81 | + msg_buffer_append(&buffer, &msg_gemv); |
| 82 | + msg_buffer_finish(&buffer); |
| 83 | + // msg_buffer_dump_int32(&buffer); |
| 84 | + msg_buffer_send(&buffer, dpu_set); |
| 85 | + |
| 86 | + ex_tp2 = std::chrono::high_resolution_clock::now(); |
| 87 | + |
| 88 | + dur = ex_tp2 - ex_tp1; |
| 89 | + |
| 90 | + std::cout << "dpu: in_q传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl; |
| 91 | + |
| 92 | + ex_tp1 = std::chrono::high_resolution_clock::now(); |
| 93 | + dpu_set_launch(dpu_set); |
| 94 | + ex_tp2 = std::chrono::high_resolution_clock::now(); |
| 95 | + |
| 96 | + dur = ex_tp2 - ex_tp1; |
| 97 | + |
| 98 | + std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl; |
| 99 | + |
| 100 | + // dpu_set_log_read(dpu_set); |
| 101 | + // Check results |
| 102 | + float *mul_mat_res = (float *)res->data; |
| 103 | + |
| 104 | + void *dst_ptrs[NR_DPUS]; |
| 105 | + for (int i = 0; i < NR_DPUS; i++) |
| 106 | + { |
| 107 | + dst_ptrs[i] = (void *)(mul_mat_res + i * w->ne[1] / NR_DPUS); |
| 108 | + } |
| 109 | + |
| 110 | + ex_tp1 = std::chrono::high_resolution_clock::now(); |
| 111 | + msg_buffer_recv(dpu_set, dst_ptrs, w->ne[1] / NR_DPUS * sizeof(float)); |
| 112 | + ex_tp2 = std::chrono::high_resolution_clock::now(); |
| 113 | + |
| 114 | + dur = ex_tp2 - ex_tp1; |
| 115 | + |
| 116 | + std::cout << "传回结果用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl; |
| 117 | + return 0; |
| 118 | +} |
| 119 | +#endif |
| 120 | + |
| 121 | + |
| 122 | +void gemv_cpu_kernel(struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res_comp) { |
| 123 | + |
| 124 | + // 初始化上下文 |
| 125 | + ggml_init_params params = {.mem_size = 256*1024*1024}; |
| 126 | + ggml_context* ctx = ggml_init(params); |
| 127 | + |
| 128 | + // 创建tensor |
| 129 | + ggml_tensor* A = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096); |
| 130 | + ggml_tensor* B = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 4096, 1); |
| 131 | + |
| 132 | + assert(A->ne[0] == w->ne[0] && A->ne[1] == w->ne[1] && A->ne[2] == w->ne[2] && A->ne[3] == w->ne[3]); |
| 133 | + assert(B->ne[0] == in_q->ne[0] && B->ne[1] == in_q->ne[1] && B->ne[2] == in_q->ne[2] && B->ne[3] == in_q->ne[3]); |
| 134 | + |
| 135 | + memcpy(A->data, w->data, ggml_nbytes(w)); |
| 136 | + memcpy(B->data, in_q->data, ggml_nbytes(in_q)); |
| 137 | + |
| 138 | + // 构建计算图 |
| 139 | + ggml_tensor* C = ggml_mul_mat(ctx, A, B); |
| 140 | + ggml_cgraph* gf = ggml_new_graph(ctx); |
| 141 | + ggml_build_forward_expand(gf, C); |
| 142 | + |
| 143 | + std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now(); |
| 144 | + // 执行计算 |
| 145 | + ggml_graph_compute_with_ctx(ctx, gf, 64); // 使用4线程 |
| 146 | + std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now(); |
| 147 | + |
| 148 | + std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1; |
| 149 | + |
| 150 | + std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl; |
| 151 | + std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl; |
| 152 | + |
| 153 | + |
| 154 | + // 保存结果 |
| 155 | + print_tensor(C, stdout); |
| 156 | + |
| 157 | + std::cout << "error between cpu and dpu before gemv:" << std::endl; |
| 158 | + compare_tensor(C, res_comp); |
| 159 | + |
| 160 | + // 释放资源 |
| 161 | + ggml_free(ctx); |
| 162 | +} |
| 163 | + |
| 164 | +int main(int argc, char** argv) { |
| 165 | + // init fp table for fp16 dump |
| 166 | + fp_table_init(); |
| 167 | + mul_table_int4_int8_init(); |
| 168 | + |
| 169 | +#ifdef PIM_KERNEL |
| 170 | + // WQ-PIM allocate dpu |
| 171 | + struct dpu_set_t dpu_set; |
| 172 | + DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set)); |
| 173 | + DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL)); |
| 174 | + |
| 175 | + const char* filenamea = "tensor-files/a.tensor"; |
| 176 | + const char* filenameb = "tensor-files/b.tensor"; |
| 177 | + const char* filenamebq = "tensor-files/b_quant.tensor"; |
| 178 | + const char* filenamec = "tensor-files/c.tensor"; |
| 179 | + const char* filenamec_p = "tensor-files/c_pim.tensor"; |
| 180 | + struct ggml_tensor * ts_a = tensor_import(filenamea); |
| 181 | + struct ggml_tensor * ts_b = tensor_import(filenameb); |
| 182 | + struct ggml_tensor * ts_bq = tensor_import(filenamebq); |
| 183 | + struct ggml_tensor * ts_c = tensor_import(filenamec); |
| 184 | + struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p); |
| 185 | + |
| 186 | + std::cout << "ts_a: " << std::endl; |
| 187 | + print_tensor(ts_a, stdout); |
| 188 | + std::cout << "ts_b: " << std::endl; |
| 189 | + print_tensor(ts_b, stdout); |
| 190 | + |
| 191 | + gemv_dpu_kernel(dpu_set, ts_a, ts_bq, ts_c_pim); |
| 192 | + |
| 193 | + float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq); |
| 194 | + std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl; |
| 195 | + |
| 196 | + std::cout << "error between c and c_pim:" << std::endl; |
| 197 | + compare_tensor(ts_c, ts_c_pim); |
| 198 | + |
| 199 | +#endif |
| 200 | + return 0; |
| 201 | +} |
0 commit comments