Skip to content

Commit c2b40d5

Browse files
committed
add pim-llm-framework
1 parent 70f9d88 commit c2b40d5

30 files changed

+873
-0
lines changed

pim/Makefile

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Build rules for the PIM framework: one host executable (host_app) and one
# DPU program (dpu_task), both written to ${BUILDDIR}.

BUILDDIR ?= build

HOST_BINARY=${BUILDDIR}/host_app
HOST_SOURCES=$(wildcard host/*.c)
HOST_SOURCES+=$(wildcard host/mm/*.c)
HOST_SOURCES+=$(wildcard host/msg/*.c)
HOST_HEADERS=$(wildcard host/*.h)
HOST_HEADERS+=$(wildcard host/mm/*.h)
HOST_HEADERS+=$(wildcard host/msg/*.h)

DPU_BINARY=${BUILDDIR}/dpu_task
DPU_SOURCES=$(wildcard dpu/*.c)
DPU_SOURCES+=$(wildcard dpu/ops/*.c)
DPU_SOURCES+=$(wildcard dpu/sto/*.c)
DPU_SOURCES+=$(wildcard dpu/util/*.c)
DPU_HEADERS=$(wildcard dpu/*.h)
DPU_HEADERS+=$(wildcard dpu/ops/*.h)
DPU_HEADERS+=$(wildcard dpu/sto/*.h)
DPU_HEADERS+=$(wildcard dpu/util/*.h)

# host/util is compiled into both the host application and the DPU program.
UTIL_HEADERS=$(wildcard host/util/*.h)
UTIL_SOURCES=$(wildcard host/util/*.c)

CHECK_FORMAT_FILES=${HOST_SOURCES} ${HOST_HEADERS} ${DPU_SOURCES} ${DPU_HEADERS} ${UTIL_HEADERS} ${UTIL_SOURCES}
CHECK_FORMAT_DEPENDENCIES=$(addsuffix -check-format,${CHECK_FORMAT_FILES})

# Passed to both builds as -DNR_TASKLETS.
NR_TASKLETS ?= 16

# Evaluated while the Makefile is parsed, so the output directory exists
# before any rule runs.
__dirs := $(shell mkdir -p ${BUILDDIR})

# NOTE: run, plotdata, check, check-format and tools have no rule in this
# Makefile; they are only declared phony.
.PHONY: all clean run plotdata check check-format tools

all: ${HOST_BINARY} ${DPU_BINARY} tools

clean:
	rm -rf ${BUILDDIR}

###
### HOST APPLICATION
###
# Compile-time flags only; everything the linker needs lives in LDFLAGS.
CFLAGS=-g -O3 -std=gnu99 -fgnu89-inline `dpu-pkg-config --cflags dpu` -DNR_TASKLETS=${NR_TASKLETS} -pthread
LDFLAGS=`dpu-pkg-config --libs dpu` -lstdc++

# The DPU binary is a prerequisite because its absolute path is baked into the
# host application through -DDPU_BINARY.
${HOST_BINARY}: ${HOST_SOURCES} ${HOST_HEADERS} ${UTIL_HEADERS} ${UTIL_SOURCES} ${DPU_BINARY}
	$(CC) $(CFLAGS) -DDPU_BINARY=\"$(realpath ${DPU_BINARY})\" -o $@ ${HOST_SOURCES} ${UTIL_SOURCES} $(LDFLAGS)

###
### DPU BINARY
###
DPU_FLAGS=-g -O3 -fgnu89-inline -DNR_TASKLETS=${NR_TASKLETS} -DSTACK_SIZE_DEFAULT=1024

${DPU_BINARY}: ${DPU_SOURCES} ${DPU_HEADERS} ${UTIL_HEADERS} ${UTIL_SOURCES}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} ${DPU_SOURCES} ${UTIL_SOURCES} -o $@

pim/dpu/main.c

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#include <assert.h>
2+
#include <stdint.h>
3+
#include <stdio.h>
4+
#include <string.h>
5+
#include <defs.h>
6+
#include <mram.h>
7+
#include <alloc.h>
8+
#include <barrier.h>
9+
#include <seqread.h>
10+
11+
#include "./sto/g_vars.h"
#include "./sto/mram_mm.h"
#include "./ops/gemv.h"
#include "./ops/tensor_add.h"
/* main() also dispatches to tensor_get_*(), so their prototypes must be visible. */
#include "./ops/tensor_get.h"

/* Fallback when the build does not pass -DNR_TASKLETS. */
#ifndef NR_TASKLETS
#define NR_TASKLETS 16
#endif

/* Set to 1 to compile in the printf tracing in init() and main(). */
#define DPU_MAIN_DEBUG_PRINT 0

/* Barrier shared by all NR_TASKLETS tasklets; used between the phases of each operation. */
BARRIER_INIT(my_barrier, NR_TASKLETS);
23+
24+
static void init()
25+
{
26+
mem_reset(); // reset heap in wram
27+
mram_read((__mram_ptr void const *)(MESSAGE_BUFFER_ADDR), &g_msg_header, sizeof(msg_buffer_header));
28+
29+
#if DPU_MAIN_DEBUG_PRINT
30+
printf("block_cnt: %d, total_size: %d\n", g_msg_header.block_cnt, g_msg_header.total_size);
31+
#endif
32+
33+
assert(g_msg_header.total_size <= 16 * 1024);
34+
g_msg_cache_ptr = (char *)mem_alloc(g_msg_header.total_size);
35+
mram2wram((__mram_ptr void const *)(MESSAGE_BUFFER_ADDR), g_msg_cache_ptr, g_msg_header.total_size);
36+
g_blocks_offset_ptr = (uint32_t *)(g_msg_cache_ptr + g_msg_header.total_size - align8(sizeof(uint32_t) * g_msg_header.block_cnt));
37+
38+
#if DPU_MAIN_DEBUG_PRINT
39+
for (int i = 0; i < g_msg_header.block_cnt; i++)
40+
{
41+
printf("blocks_offset: %d ", g_blocks_offset_ptr[i]);
42+
}
43+
printf("\n");
44+
#endif
45+
}
46+
47+
int main()
48+
{
49+
unsigned int tasklet_id = me();
50+
51+
if (tasklet_id == 0)
52+
{
53+
init();
54+
}
55+
56+
barrier_wait(&my_barrier);
57+
58+
for (int i = 0; i < g_msg_header.block_cnt; i++)
59+
{
60+
msg_block_header *header_ptr = (msg_block_header *)(g_msg_cache_ptr + g_blocks_offset_ptr[i]);
61+
#if DPU_MAIN_DEBUG_PRINT
62+
if (tasklet_id == 0)
63+
{
64+
printf("header_ptr->op: %d, src0.dpu_addr: %d, ne0: %d, ne1: %d\n", header_ptr->op, header_ptr->src0.ptr.dpu_addr,
65+
header_ptr->src0.ne[0], header_ptr->src0.ne[1]);
66+
}
67+
#endif
68+
switch (header_ptr->op)
69+
{
70+
case PIM_OP_GEMV:
71+
if (tasklet_id == 0)
72+
{
73+
gemv_prepare();
74+
}
75+
barrier_wait(&my_barrier);
76+
gemv_tasklets_run();
77+
barrier_wait(&my_barrier);
78+
if (tasklet_id == 0)
79+
{
80+
gemv_merge();
81+
}
82+
break;
83+
84+
case PIM_OP_TENSOR_ADD_FOR_TEST:
85+
if (tasklet_id == 0)
86+
{
87+
tensor_add_prepare(header_ptr);
88+
}
89+
barrier_wait(&my_barrier);
90+
tensor_add_tasklets_run();
91+
barrier_wait(&my_barrier);
92+
if (tasklet_id == 0)
93+
{
94+
tensor_add_merge();
95+
}
96+
break;
97+
98+
case PIM_OP_TENSOR_GET_FOR_TEST:
99+
if (tasklet_id == 0)
100+
{
101+
tensor_get_prepare(header_ptr);
102+
}
103+
barrier_wait(&my_barrier);
104+
tensor_get_tasklets_run();
105+
barrier_wait(&my_barrier);
106+
if (tasklet_id == 0)
107+
{
108+
tensor_get_merge();
109+
}
110+
break;
111+
112+
default:
113+
break;
114+
}
115+
}
116+
return 0;
117+
}

pim/dpu/ops/gemv.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#include "gemv.h"
2+
3+
/*
 * 16 x 256 table of int16_t values declared __host.
 * NOTE(review): presumably int4 * int8 products filled in by the host - confirm;
 * nothing in this file reads or writes it yet.
 */
__host int16_t mul_table_int4_int8[1 << 4][1 << 8];
4+
/*
 * 65536 floats kept in MRAM (not initialised by the DPU program).
 * lookup_fp16_to_fp32() indexes it directly with the 16-bit input value.
 */
__mram_noinit float table_f32_f16[1 << 16];
5+
6+
static float lookup_fp16_to_fp32(uint16_t f)
7+
{
8+
uint16_t s;
9+
memcpy(&s, &f, sizeof(uint16_t));
10+
uint16_t alignedOffset;
11+
float temp[8];
12+
13+
alignedOffset = s & 0xfff8;
14+
mram_read((__mram_ptr void const *)(table_f32_f16 + alignedOffset), temp, sizeof(float) * 8);
15+
return temp[s & 0x7];
16+
}
17+
18+
#define FP16_TO_FP32(x) lookup_fp16_to_fp32(x)
19+
20+
/*
 * First GEMV phase; main() calls it on tasklet 0 only.
 * Placeholder: currently does nothing.
 */
void gemv_prepare(void)
{
}
23+
24+
/*
 * Second GEMV phase; main() calls it on every tasklet, between two barriers.
 * Placeholder: currently does nothing.
 */
void gemv_tasklets_run(void)
{
}
27+
28+
/*
 * Final GEMV phase; main() calls it on tasklet 0 only.
 * Placeholder: currently does nothing.
 */
void gemv_merge(void)
{
}

pim/dpu/ops/gemv.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/* GEMV operation: the three phases dispatched by main() for PIM_OP_GEMV. */
#ifndef PIM_DPU_OPS_GEMV_H
#define PIM_DPU_OPS_GEMV_H

#include <mram.h>

/* Phase 1: called on tasklet 0 only. */
void gemv_prepare(void);
/* Phase 2: called on every tasklet, between two barriers. */
void gemv_tasklets_run(void);
/* Phase 3: called on tasklet 0 only. */
void gemv_merge(void);

#endif /* PIM_DPU_OPS_GEMV_H */

pim/dpu/ops/tensor_add.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#include "tensor_add.h"
2+
3+
#define OP_TENSOR_ADD_DEBUG_PRINT 0
4+
5+
static pim_tensor_des *src0 = NULL;
6+
static int32_t *num = 0;
7+
8+
void tensor_add_prepare(msg_block_header *header_ptr)
9+
{
10+
src0 = &header_ptr->src0;
11+
num = (int32_t *)((char *)header_ptr + sizeof(msg_block_header));
12+
13+
#if OP_TENSOR_ADD_DEBUG_PRINT
14+
printf("src0.dpu_addr: %d, ne0: %d, ne1: %d, num: %d\n", header_ptr->src0.ptr.dpu_addr,
15+
header_ptr->src0.ne[0], header_ptr->src0.ne[1], *num);
16+
#endif
17+
18+
__mram_ptr int32_t *src0_addr = (__mram_ptr int32_t *)(DPU_MRAM_HEAP_POINTER + header_ptr->src0.ptr.dpu_addr);
19+
for (int i = 0; i < header_ptr->src0.ne[0] * header_ptr->src0.ne[1]; i++)
20+
{
21+
src0_addr[i] += *num;
22+
}
23+
}
24+
25+
/*
 * Second tensor-add phase; main() calls it on every tasklet, between two barriers.
 * Placeholder: currently does nothing (the addition is done in tensor_add_prepare()).
 */
void tensor_add_tasklets_run(void)
{
}
28+
29+
/*
 * Final tensor-add phase; main() calls it on tasklet 0 only.
 * Placeholder: currently does nothing.
 */
void tensor_add_merge(void)
{
}

pim/dpu/ops/tensor_add.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/* Tensor-add test operation: the three phases dispatched by main() for PIM_OP_TENSOR_ADD_FOR_TEST. */
#ifndef PIM_DPU_OPS_TENSOR_ADD_H
#define PIM_DPU_OPS_TENSOR_ADD_H

#include <mram.h>
#include "../../host/msg/msg_block.h"

/* Phase 1: called on tasklet 0 only, with the header of the block being processed. */
void tensor_add_prepare(msg_block_header *header_ptr);
/* Phase 2: called on every tasklet, between two barriers. */
void tensor_add_tasklets_run(void);
/* Phase 3: called on tasklet 0 only. */
void tensor_add_merge(void);

#endif /* PIM_DPU_OPS_TENSOR_ADD_H */

pim/dpu/ops/tensor_get.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#include "tensor_get.h"
2+
3+
#define OP_TENSOR_GET_DEBUG_PRINT 0
4+
5+
static pim_tensor_des *src0 = NULL;
6+
7+
void tensor_get_prepare(msg_block_header *header_ptr)
8+
{
9+
src0 = &header_ptr->src0;
10+
11+
#if OP_TENSOR_GET_DEBUG_PRINT
12+
printf("src0.dpu_addr: %d, ne0: %d, ne1: %d\n", header_ptr->src0.ptr.dpu_addr,
13+
header_ptr->src0.ne[0], header_ptr->src0.ne[1]);
14+
#endif
15+
}
16+
17+
/*
 * Second tensor-get phase; main() calls it on every tasklet, between two barriers.
 * Placeholder: currently does nothing (the copy is done in tensor_get_merge()).
 */
void tensor_get_tasklets_run(void)
{
}
20+
21+
void tensor_get_merge()
22+
{
23+
__mram_ptr int32_t *src0_addr = (__mram_ptr int32_t *)(DPU_MRAM_HEAP_POINTER + src0->ptr.dpu_addr);
24+
__mram_ptr int32_t *dst = (__mram_ptr int32_t *)RESULT_BUFFER_ADDR;
25+
for (int i = 0; i < src0->ne[0] * src0->ne[1]; i++)
26+
{
27+
dst[i] = src0_addr[i];
28+
}
29+
}

pim/dpu/ops/tensor_get.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/* Tensor-get test operation: the three phases dispatched by main() for PIM_OP_TENSOR_GET_FOR_TEST. */
#ifndef PIM_DPU_OPS_TENSOR_GET_H
#define PIM_DPU_OPS_TENSOR_GET_H

#include <mram.h>
#include "../../host/msg/msg_block.h"
#include "../sto/mram_mm.h"

/* Phase 1: called on tasklet 0 only, with the header of the block being processed. */
void tensor_get_prepare(msg_block_header *header_ptr);
/* Phase 2: called on every tasklet, between two barriers. */
void tensor_get_tasklets_run(void);
/* Phase 3: called on tasklet 0 only; copies the tensor to the result buffer. */
void tensor_get_merge(void);

#endif /* PIM_DPU_OPS_TENSOR_GET_H */

pim/dpu/sto/g_vars.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#include "g_vars.h"
2+
3+
/* Message buffer header; init() in main.c fills it from MESSAGE_BUFFER_ADDR. */
__host msg_buffer_header g_msg_header;
4+
/* Start of the WRAM copy of the message buffer; allocated by init() in main.c. */
__host char *g_msg_cache_ptr = NULL;
5+
/* Block offset table inside the WRAM copy; one uint32_t offset per block, relative to g_msg_cache_ptr. */
__host uint32_t *g_blocks_offset_ptr = NULL;

pim/dpu/sto/g_vars.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/* Globals describing the message buffer cached in WRAM; defined in g_vars.c. */
#ifndef PIM_DPU_STO_G_VARS_H
#define PIM_DPU_STO_G_VARS_H

#include <mram.h>
#include <stdlib.h>
#include "../../host/msg/msg_buffer.h"

/* Message buffer header read from MRAM by init() in main.c. */
extern __host msg_buffer_header g_msg_header;
/* Start of the WRAM copy of the message buffer. */
extern __host char *g_msg_cache_ptr;
/* Per-block offsets, relative to g_msg_cache_ptr. */
extern __host uint32_t *g_blocks_offset_ptr;

#endif /* PIM_DPU_STO_G_VARS_H */

0 commit comments

Comments
 (0)