Skip to content

Commit 5bda436

Browse files
committed
delete pim dir & rebuild ts.cpp
1 parent 63db21b commit 5bda436

33 files changed

+268
-878
lines changed

Makefile

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
3737
llama-speculative \
3838
llama-tokenize \
3939
llama-ts \
40+
llama-ts-rebuild \
4041
llama-vdot \
4142
llama-cvector-generator \
4243
llama-gen-docs \
@@ -952,7 +953,15 @@ OBJ_COMMON = \
952953
common/build-info.o \
953954
common/json-schema-to-grammar.o
954955

955-
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
956+
# Object files built from the PIM-tensorStore host sources (mm/, msg/ and util/).
OBJ_PIM_LLM = \
957+
PIM-tensorStore/host/mm/pim_mm.o \
958+
PIM-tensorStore/host/mm/pim_direct_comm.o \
959+
PIM-tensorStore/host/msg/msg_block.o \
960+
PIM-tensorStore/host/msg/msg_buffer.o \
961+
PIM-tensorStore/host/msg/msg_comm.o \
962+
PIM-tensorStore/host/util/util.o
963+
964+
# OBJ_ALL is a prerequisite of the example targets, so the PIM objects are
# linked into every binary that depends on it.
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) ${OBJ_PIM_LLM}
956965

957966
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
958967
LIB_GGML_S = $(LIB_PRE)ggml.a
@@ -1238,12 +1247,55 @@ $(LIB_COMMON_S): \
12381247
$(OBJ_COMMON)
12391248
ar rcs $(LIB_COMMON_S) $^
12401249

1250+
# pim_llm
#
# Each PIM-tensorStore host object is compiled from its .c source and is
# rebuilt whenever the matching header changes.

PIM-tensorStore/host/mm/pim_mm.o: \
	PIM-tensorStore/host/mm/pim_mm.c \
	PIM-tensorStore/host/mm/pim_mm.h
	$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/mm/pim_direct_comm.o: \
	PIM-tensorStore/host/mm/pim_direct_comm.c \
	PIM-tensorStore/host/mm/pim_direct_comm.h
	$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_block.o: \
	PIM-tensorStore/host/msg/msg_block.c \
	PIM-tensorStore/host/msg/msg_block.h
	$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_buffer.o: \
	PIM-tensorStore/host/msg/msg_buffer.c \
	PIM-tensorStore/host/msg/msg_buffer.h
	$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_comm.o: \
	PIM-tensorStore/host/msg/msg_comm.c \
	PIM-tensorStore/host/msg/msg_comm.h
	$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/util/util.o: \
	PIM-tensorStore/host/util/util.c \
	PIM-tensorStore/host/util/util.h
	$(CC) $(CFLAGS) -c $< -o $@

# Shared library made of all PIM host objects.
# The prerequisite list must NOT end with a backslash: a trailing "\" would
# splice the recipe line into the prerequisites and the link step would never
# run.
# NOTE(review): LIB_PIM_LLM / LIB_PIM_LLM_S are not defined in the visible part
# of this Makefile - confirm they are set elsewhere.
$(LIB_PIM_LLM): \
	$(OBJ_PIM_LLM)
	$(CC) $(CFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

# Static archive made of all PIM host objects.
$(LIB_PIM_LLM_S): \
	$(OBJ_PIM_LLM)
	ar rcs $(LIB_PIM_LLM_S) $^
1289+
12411290
clean:
12421291
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
12431292
rm -rvf src/*.o
12441293
rm -rvf tests/*.o
12451294
rm -rvf examples/*.o
12461295
rm -rvf common/*.o
1296+
rm -rvf PIM-tensorStore/host/mm/*.o
1297+
rm -rvf PIM-tensorStore/host/msg/*.o
1298+
rm -rvf PIM-tensorStore/host/util/*.o
12471299
rm -rvf *.a
12481300
rm -rvf *.dll
12491301
rm -rvf *.so
@@ -1287,6 +1339,11 @@ llama-ts: examples/tensor/ts.cpp \
12871339
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
12881340
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
12891341

1342+
# llama-ts-rebuild: compiles examples/tensor/ts-rebuild.cpp to its object file
# (first recipe line), then links that object with every non-header
# prerequisite from OBJ_ALL (second recipe line).
llama-ts-rebuild: examples/tensor/ts-rebuild.cpp \
1343+
$(OBJ_ALL)
1344+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1345+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1346+
12901347
llama-infill: examples/infill/infill.cpp \
12911348
$(OBJ_ALL)
12921349
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

examples/tensor/ts-rebuild.cpp

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#include "trace_driver.h"
2+
#include <iostream>
3+
#include <iomanip>
4+
#include <chrono>
5+
6+
extern "C" {
7+
#include "../../PIM-tensorStore/host/pim_llm.h"
8+
}
9+
10+
11+
#define NR_DPUS 512
12+
#define NR_LAYER 2
13+
#define DPU_BINARY "./PIM-tensorStore/build/dpu_task"
14+
#define PIM_KERNEL
15+
16+
int16_t mul_table_int4_int8[1<<4][1<<8];
17+
18+
void fp_table_init(void) {
19+
for (int i = 0; i < (1 << 16); ++i) {
20+
union {
21+
uint16_t u16;
22+
ggml_fp16_t fp16;
23+
} u = {i};
24+
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
25+
}
26+
}
27+
28+
void mul_table_int4_int8_init(void) {
29+
for(int i = 0; i < (1 << 4); ++i){
30+
for(int j = 0; j< (1 << 8); ++j){
31+
mul_table_int4_int8[i][j] = (i - 8) * (j + INT8_MIN);
32+
}
33+
}
34+
}
35+
36+
#ifdef PIM_KERNEL
37+
int gemv_dpu_kernel(struct dpu_set_t dpu_set, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res) {
38+
struct dpu_set_t dpu;
39+
40+
std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
41+
42+
DPU_ASSERT(dpu_broadcast_to(dpu_set, "mul_table_int4_int8", 0, (void *)(mul_table_int4_int8), sizeof(mul_table_int4_int8), DPU_XFER_DEFAULT));
43+
//ggml_table_f32_f16 tbl is transferred to pim
44+
45+
all_dpu_mm_reset();
46+
remote_ptr table_f32_f16_pim_ptr = all_dpu_alloc(sizeof(ggml_table_f32_f16));
47+
assert(table_f32_f16_pim_ptr.dpu_id == ALL_DPU && table_f32_f16_pim_ptr.dpu_addr == FREE_STORAGE_OFFSET);
48+
dpu_broadcast_direct(dpu_set, table_f32_f16_pim_ptr, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16));
49+
// DPU_ASSERT(dpu_broadcast_to(dpu_set, "table_f32_f16", 0, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
50+
std::cout << "ggml_table_f32_f16 len = " << sizeof(ggml_table_f32_f16) << std::endl;
51+
52+
assert(w->ne[1] % NR_DPUS == 0);
53+
54+
remote_ptr w_pim_ptr = all_dpu_alloc(w->nb[1] * (w->ne[1] / NR_DPUS));
55+
assert(w_pim_ptr.dpu_id == ALL_DPU && w_pim_ptr.dpu_addr == FREE_STORAGE_OFFSET + sizeof(ggml_table_f32_f16));
56+
57+
void *src_w_ptrs[NR_DPUS];
58+
for (int i = 0; i < NR_DPUS; i++)
59+
{
60+
src_w_ptrs[i] = (void *)((unsigned char *)w->data + i * w->nb[1] * (w->ne[1] / NR_DPUS));
61+
}
62+
63+
dpu_send_direct(dpu_set, w_pim_ptr, src_w_ptrs, w->nb[1] * (w->ne[1] / NR_DPUS));
64+
65+
std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();
66+
67+
std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;
68+
69+
std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
70+
std::cout << "dpu: w传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
71+
72+
ex_tp1 = std::chrono::high_resolution_clock::now();
73+
74+
msg_block_des msg_gemv;
75+
printf("%d\n", table_f32_f16_pim_ptr.dpu_addr);
76+
msg_block_builder_op_gemv_q4_q8(&msg_gemv, w_pim_ptr, w->ne[0], w->ne[1] / NR_DPUS, in_q->ne[0], in_q->data, in_q->nb[1], table_f32_f16_pim_ptr);
77+
78+
msg_buffer buffer;
79+
msg_buffer_init(&buffer);
80+
msg_buffer_clear(&buffer);
81+
msg_buffer_append(&buffer, &msg_gemv);
82+
msg_buffer_finish(&buffer);
83+
// msg_buffer_dump_int32(&buffer);
84+
msg_buffer_send(&buffer, dpu_set);
85+
86+
ex_tp2 = std::chrono::high_resolution_clock::now();
87+
88+
dur = ex_tp2 - ex_tp1;
89+
90+
std::cout << "dpu: in_q传输用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
91+
92+
ex_tp1 = std::chrono::high_resolution_clock::now();
93+
dpu_set_launch(dpu_set);
94+
ex_tp2 = std::chrono::high_resolution_clock::now();
95+
96+
dur = ex_tp2 - ex_tp1;
97+
98+
std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
99+
100+
// dpu_set_log_read(dpu_set);
101+
// Check results
102+
float *mul_mat_res = (float *)res->data;
103+
104+
void *dst_ptrs[NR_DPUS];
105+
for (int i = 0; i < NR_DPUS; i++)
106+
{
107+
dst_ptrs[i] = (void *)(mul_mat_res + i * w->ne[1] / NR_DPUS);
108+
}
109+
110+
ex_tp1 = std::chrono::high_resolution_clock::now();
111+
msg_buffer_recv(dpu_set, dst_ptrs, w->ne[1] / NR_DPUS * sizeof(float));
112+
ex_tp2 = std::chrono::high_resolution_clock::now();
113+
114+
dur = ex_tp2 - ex_tp1;
115+
116+
std::cout << "传回结果用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
117+
return 0;
118+
}
119+
#endif
120+
121+
122+
// Recomputes the GEMV with ggml and compares it against a reference result.
//
// Copies `w` (expected shape 4096 x 4096, Q4_0) and `in_q` (expected shape
// 4096 x 1, Q8_0) into a private ggml context, runs ggml_mul_mat on them,
// prints the elapsed time and the result tensor, then calls compare_tensor
// on the result and `res_comp`.
//
// w        : weight tensor; its ne[] must match the 4096 x 4096 tensor built here.
// in_q     : input tensor; its ne[] must match the 4096 x 1 tensor built here.
// res_comp : tensor the freshly computed result is compared against.
void gemv_cpu_kernel(struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res_comp) {

    // Initialise the context. Value-initialise first and then set the one field
    // we need: designated initialisers are not standard C++ before C++20.
    ggml_init_params params = {};
    params.mem_size = 256*1024*1024;
    ggml_context* ctx = ggml_init(params);

    // Create the tensors.
    ggml_tensor* A = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
    ggml_tensor* B = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 4096, 1);

    // The memcpy calls below are only safe when the shapes agree.
    assert(A->ne[0] == w->ne[0] && A->ne[1] == w->ne[1] && A->ne[2] == w->ne[2] && A->ne[3] == w->ne[3]);
    assert(B->ne[0] == in_q->ne[0] && B->ne[1] == in_q->ne[1] && B->ne[2] == in_q->ne[2] && B->ne[3] == in_q->ne[3]);

    memcpy(A->data, w->data, ggml_nbytes(w));
    memcpy(B->data, in_q->data, ggml_nbytes(in_q));

    // Build the compute graph.
    ggml_tensor* C = ggml_mul_mat(ctx, A, B);
    ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, C);

    // Run the computation with 64 threads.
    const int n_threads = 64;
    std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
    ggml_graph_compute_with_ctx(ctx, gf, n_threads);
    std::chrono::high_resolution_clock::time_point ex_tp2 = std::chrono::high_resolution_clock::now();

    std::chrono::duration<size_t, std::nano> dur = ex_tp2 - ex_tp1;

    std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
    std::cout << "执行用时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;

    // Dump the result.
    print_tensor(C, stdout);

    std::cout << "error between cpu and dpu before gemv:" << std::endl;
    compare_tensor(C, res_comp);

    // Release the context.
    ggml_free(ctx);
}
163+
164+
// Entry point of the ts-rebuild example.
//
// Initialises the lookup tables and, when PIM_KERNEL is defined, allocates
// NR_DPUS DPUs, loads DPU_BINARY on them, imports the tensors under
// tensor-files/, runs gemv_dpu_kernel and compares its output with the
// imported reference tensor. Command-line arguments are not used.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    // init fp table for fp16 dump
    fp_table_init();
    mul_table_int4_int8_init();

#ifdef PIM_KERNEL
    // WQ-PIM allocate dpu
    struct dpu_set_t dpu_set;
    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));

    const char* filenamea = "tensor-files/a.tensor";
    const char* filenameb = "tensor-files/b.tensor";
    const char* filenamebq = "tensor-files/b_quant.tensor";
    const char* filenamec = "tensor-files/c.tensor";
    const char* filenamec_p = "tensor-files/c_pim.tensor";
    // NOTE(review): the imported tensors are used without a null check -
    // confirm how tensor_import reports a missing or unreadable file.
    struct ggml_tensor * ts_a = tensor_import(filenamea);
    struct ggml_tensor * ts_b = tensor_import(filenameb);
    struct ggml_tensor * ts_bq = tensor_import(filenamebq);
    struct ggml_tensor * ts_c = tensor_import(filenamec);
    struct ggml_tensor * ts_c_pim = tensor_import(filenamec_p);

    std::cout << "ts_a: " << std::endl;
    print_tensor(ts_a, stdout);
    std::cout << "ts_b: " << std::endl;
    print_tensor(ts_b, stdout);

    // The DPU kernel writes its result into ts_c_pim's data buffer.
    gemv_dpu_kernel(dpu_set, ts_a, ts_bq, ts_c_pim);

    float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
    std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;

    std::cout << "error between c and c_pim:" << std::endl;
    compare_tensor(ts_c, ts_c_pim);

    // Release the DPUs obtained from dpu_alloc above.
    DPU_ASSERT(dpu_free(dpu_set));
#endif
    return 0;
}

examples/tensor/ts.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,16 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
119119

120120
// Check results
121121
float *mul_mat_res = (float *)res->data;
122+
ex_tp1 = std::chrono::high_resolution_clock::now();
122123
DPU_FOREACH(context->dpu_set, dpu, i) {
123124
DPU_ASSERT(dpu_prepare_xfer(dpu, mul_mat_res + i * context->pim_metadata.rows_per_dpu*in_q->ne[1]));
124125
}
125126
DPU_ASSERT(dpu_push_xfer(context->dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_offset, context->pim_metadata.rows_per_dpu*in_q->ne[1]*sizeof(float), DPU_XFER_DEFAULT));
127+
ex_tp2 = std::chrono::high_resolution_clock::now();
128+
129+
dur = ex_tp2 - ex_tp1;
126130

131+
std::cout << "传回结果用时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
127132
return 0;
128133
}
129134
#endif
@@ -216,12 +221,12 @@ int main(int argc, char** argv) {
216221

217222

218223
// dpu code
219-
// gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
220-
// std::cout << "error between c and c_pim:" << std::endl;
221-
// compare_tensor(ts_c, ts_c_pim);
224+
gemv_dpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
225+
std::cout << "error between c and c_pim:" << std::endl;
226+
compare_tensor(ts_c, ts_c_pim);
222227

223228
// cpu code
224-
gemv_cpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
229+
// gemv_cpu_kernel(pqcontext, ts_a, ts_bq, ts_c_pim);
225230

226231
// float first_res = mul_add_q4_0_q8_0(ts_a, ts_bq);
227232
// std::cout<<"first element: "<<std::fixed << std::setprecision(6)<<first_res<<std::endl;

pim/Makefile

Lines changed: 0 additions & 52 deletions
This file was deleted.

0 commit comments

Comments
 (0)