Skip to content

Commit 822a534

Browse files
author
Yinan
committed
Update PIM_README.md and some formats polished.
1 parent 9cd4190 commit 822a534

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

PIM_README.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# llama.cpp (PIM branch)
2+
3+
## 1. Build llama.cpp for PIM
4+
Make sure you have your PIM environment (e.g. UPMEM) set correctly already. Then try:
5+
```
6+
cd llama.cpp
7+
make LLAMA_PIM=1
8+
# make LLAMA_PIM=1 -j
9+
10+
# clean:
11+
# make clean
12+
```
13+
14+
## 2. Run llama.cpp with PIM
15+
Prepare your model files as the original README.md shows. A 4-bit-quantized model in gguf format is preferred.
16+
17+
```
18+
./llama-cli -m /mnt/LLM-models/chinese-alpaca-2-7b/gguf/chinese-alpaca-7b_q4_0.gguf --temp 0 -t 1 --no-warmup -p "列举5个北京经典美食。只列举名字,不要介绍。"
19+
```
20+
21+
## 3. llama-ts for tensor test
22+
A set of tensor utility functions have been implemented (as described in `include/trace_driver.h`), and `example/tensor/ts.cpp` is a good starting point to learn how to import tensors from data files and operate them.
23+
24+
Some snippets in `ggml/src/ggml.c` show how to export a tensor into data file, such as:
25+
```c
26+
#include "trace_driver.h"
27+
28+
struct ggml_tensor * src0 = ...
29+
...
30+
const char* filenamea = "a.tensor";
31+
tensor_export(src0, filenamea);
32+
```
33+
34+
`example/tensor/ts.cpp` will be built as `llama-ts` after the above `make` command.
35+
36+
37+
## 4. More details
38+
### 4.1 How we control the model layers computed on PIM
39+
There are several macros defined in `include/llama.h` that control the behavior of llama-cli:
40+
41+
```c++
42+
#ifdef PIM_KERNEL
43+
#define NR_DPUS 64
44+
#define NR_LAYER 2
45+
#define DPU_BINARY "./dpu/gemv_dpu"
46+
...
47+
#endif // PIM_KERNEL
48+
```
49+
50+
### 4.2 The PIM function(s) implementation
51+
The PIM binary `dpu/gemv_dpu` is built from `dpu/dpu_main.c` by typing:
52+
```shell
53+
cd dpu
54+
./pim_build.sh
55+
```
56+
So check `dpu/dpu_main.c` to find out how the kernel is implemented.

dpu/dpu_main.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
3939
/*
4040
DPU MRAM Memory:
4141
42-
|--Quantify-tbl-- |--DPU0-weight-Metadata-- |--layer0-subweight0--pading-- |--layer1-subweight0--pading-- |...|--layer31-subweight0--pading-- |--input-output-metadata--|--input-token--|---output0--pading--|
42+
|--Quantify-tbl-- |--DPU0-weight-Metadata-- |--layer0-subweight0--pading-- |--layer1-subweight0--pading-- |...|--layer31-subweight0--pading-- |--input-output-metadata--|--input-token--|---output0--pading--|
4343
|--Quantify-tbl-- |--DPU1-weight-Metadata-- |--layer0-subweight1--pading-- |--layer1-subweight1--pading-- |...|--layer31-subweight1--pading-- |--input-output-metadata--|--input-token--|---output1--pading--|
4444
......
4545
|--Quantify-tbl-- |--DPU127-weight-Metadata--|--layer0-subweight127--pading--|--layer1-subweight127--pading--|...|--layer31-subweight127--pading--|--input-output-metadata--|--input-token--|---output127--pading--|
@@ -145,7 +145,7 @@ int main() {
145145
#if PRINT
146146
printf("layer_len=%d, input metadata offset=%d\n",cache_meta->layer_len,offset);
147147
#endif
148-
uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
148+
uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
149149
pim_matrix_des *pinputcache = (pim_matrix_des *) mem_alloc(sizeof(pim_matrix_des));
150150
mram_read((__mram_ptr void const*) (inputmetadatabase), pinputcache, sizeof(pim_matrix_des));
151151
input_cols = pinputcache->ne[1];
@@ -175,7 +175,7 @@ int main() {
175175
printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
176176
#endif
177177
block_q4_0 *pweight_cache = (block_q4_0 *) mem_alloc(sizeof(block_q4_0)*nb);
178-
block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
178+
block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
179179

180180
// weight_rows_cur_thread = 16;
181181
for(int l = 0;l < input_cols;l++) {

include/trace_driver.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ extern "C" {
1818
extern float ggml_table_f32_f16[1 << 16];
1919

2020
void tensor_export(const struct ggml_tensor * tensor, const char * fname);
21-
struct ggml_tensor * tensor_import(const char * fname);
21+
struct ggml_tensor * tensor_import(const char * fname);
2222
void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout);
2323
void dump_tensor(const struct ggml_tensor * tensor, FILE * fout);
2424

0 commit comments

Comments
 (0)