Skip to content

Commit 822a534

Browse files
author
Yinan
committed
Update PIM_README.md and some formats polished.
1 parent 9cd4190 commit 822a534

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

PIM_README.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# llama.cpp (PIM branch)
2+
3+
## 1. Build llama.cpp for PIM
4+
Make sure you have your PIM environment (e.g. UPMEM) set correctly already. Then try:
5+
```
6+
cd llama.cpp
7+
make LLAMA_PIM=1
8+
# make LLAMA_PIM=1 -j
9+
10+
# clean:
11+
# make clean
12+
```
13+
14+
## 2. Run llama.cpp with PIM
15+
Prepare your model files as the original README.md shows. A 4-bit-quantized model in gguf format is preferred.
16+
17+
```
18+
./llama-cli -m /mnt/LLM-models/chinese-alpaca-2-7b/gguf/chinese-alpaca-7b_q4_0.gguf --temp 0 -t 1 --no-warmup -p "列举5个北京经典美食。只列举名字,不要介绍。"
19+
```
20+
21+
## 3. llama-ts for tensor test
22+
A set of tensor utility functions have been implemented (as described in `include/trace_driver.h`), and `example/tensor/ts.cpp` is a good starting point to learn how to import tensors from data files and operate them.
23+
24+
Some snippets in `ggml/src/ggml.c` show how to export a tensor into data file, such as:
25+
```c
26+
#include "trace_driver.h"
27+
28+
struct ggml_tensor * src0 = ...
29+
...
30+
const char* filenamea = "a.tensor";
31+
tensor_export(src0, filenamea);
32+
```
33+
34+
`example/tensor/ts.cpp` will be built as `llama-ts` after the above `make` command.
35+
36+
37+
## 4. More details
38+
### 4.1 How we control the model layers computed on PIM
39+
There are several macros defined in `include/llama.h` that control the behavior of llama-cli:
40+
41+
```c++
42+
#ifdef PIM_KERNEL
43+
#define NR_DPUS 64
44+
#define NR_LAYER 2
45+
#define DPU_BINARY "./dpu/gemv_dpu"
46+
...
47+
#endif // PIM_KERNEL
48+
```
49+
50+
### 4.2 The PIM function(s) implementation
51+
The PIM binary `dpu/gemv_dpu` is built from `dpu/dpu_main.c` by typing:
52+
```shell
53+
cd dpu
54+
./pim_build.sh
55+
```
56+
So check `dpu/dpu_main.c` to find out how the kernel is implemented.

dpu/dpu_main.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ BARRIER_INIT(my_barrier, NR_TASKLETS);
3939
/*
4040
DPU MRAM Memory:
4141
42-
|--Quantify-tbl-- |--DPU0-weight-Metadata-- |--layer0-subweight0--pading-- |--layer1-subweight0--pading-- |...|--layer31-subweight0--pading-- |--input-output-metadata--|--input-token--|---output0--pading--|
42+
|--Quantify-tbl-- |--DPU0-weight-Metadata-- |--layer0-subweight0--pading-- |--layer1-subweight0--pading-- |...|--layer31-subweight0--pading-- |--input-output-metadata--|--input-token--|---output0--pading--|
4343
|--Quantify-tbl-- |--DPU1-weight-Metadata-- |--layer0-subweight1--pading-- |--layer1-subweight1--pading-- |...|--layer31-subweight1--pading-- |--input-output-metadata--|--input-token--|---output1--pading--|
4444
......
4545
|--Quantify-tbl-- |--DPU127-weight-Metadata--|--layer0-subweight127--pading--|--layer1-subweight127--pading--|...|--layer31-subweight127--pading--|--input-output-metadata--|--input-token--|---output127--pading--|
@@ -145,7 +145,7 @@ int main() {
145145
#if PRINT
146146
printf("layer_len=%d, input metadata offset=%d\n",cache_meta->layer_len,offset);
147147
#endif
148-
uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
148+
uint32_t inputmetadatabase = weightmetadatabase + sizeof(struct pim_meta) + cache_meta->layer_len * cache_meta->layer_num;
149149
pim_matrix_des *pinputcache = (pim_matrix_des *) mem_alloc(sizeof(pim_matrix_des));
150150
mram_read((__mram_ptr void const*) (inputmetadatabase), pinputcache, sizeof(pim_matrix_des));
151151
input_cols = pinputcache->ne[1];
@@ -175,7 +175,7 @@ int main() {
175175
printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
176176
#endif
177177
block_q4_0 *pweight_cache = (block_q4_0 *) mem_alloc(sizeof(block_q4_0)*nb);
178-
block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
178+
block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
179179

180180
// weight_rows_cur_thread = 16;
181181
for(int l = 0;l < input_cols;l++) {

include/trace_driver.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ extern "C" {
1818
extern float ggml_table_f32_f16[1 << 16];
1919

2020
void tensor_export(const struct ggml_tensor * tensor, const char * fname);
21-
struct ggml_tensor * tensor_import(const char * fname);
21+
struct ggml_tensor * tensor_import(const char * fname);
2222
void dump_tensor_first_n(const struct ggml_tensor * tensor, int n, FILE * fout);
2323
void dump_tensor(const struct ggml_tensor * tensor, FILE * fout);
2424

0 commit comments

Comments
 (0)