33 commits
f472d76
Add parallelised gemv op
Nov 25, 2024
ef1c89b
Add graph building in QKV calculation and in FFN (up and gate).
Nov 26, 2024
2b4b28a
Add single (non-parallel) GEMV op for PIM.
Nov 26, 2024
f970a90
Fix build error
Nov 27, 2024
51e92c1
Utility for tensor export/import
Dec 4, 2024
a155d5e
Fix and add llama-ts for tensor import and dump test
Dec 4, 2024
3793749
Add mul_add for the 1st element of two quantized tensors
Dec 4, 2024
18bbe76
Build succeeds for the host app with PIM_KERNEL enabled.
Dec 17, 2024
a667bd6
Add dpu_main.c for gemv computing on DPU.
Dec 19, 2024
b52ebd1
UPMEM gemv op runs for a single call in each generate iteration. But …
Jan 11, 2025
093131b
Friendlier log printing; tensor diff function; bug fix for weight t…
Jan 13, 2025
9cd4190
Bug fix for DPU kernel weight offset and layer id for multiple layers.
Jan 15, 2025
822a534
Update PIM_README.md and polish some formatting.
Jan 23, 2025
fa98763
Update PIM_README.md
Jan 26, 2025
a7b3792
Refactor ts.cpp as a scaffold for the DPU kernel
Feb 8, 2025
bff5aae
Complete multi-threaded parallelism on the DPU & add a tensor error-comparison function
nonestack Feb 20, 2025
4541374
Added some comments
nonestack Mar 5, 2025
80572b3
Merge pull request #1 from lanhin/dev_pim_multithread
lanhin Mar 31, 2025
c9a3797
Add measurement of CPU execution time
nonestack Apr 10, 2025
3ec476b
Merge pull request #2 from lanhin/dev_pim_multithread
lanhin Apr 14, 2025
f835014
Turn off tensor_export by a temp macro TENSOR_EXPORT
lanhin Apr 7, 2025
867c4d0
Fix compilation when PIM_KERNEL is undefined
lanhin Apr 7, 2025
5c2859b
add PIM_DEBUG_PERF_PRINT
mryvae Apr 8, 2025
e41e802
Multi-threading for PIM calculation.
lanhin Apr 9, 2025
08cdc28
Fix ts compile when PIM_KERNEL is off
lanhin Apr 9, 2025
3fa5935
Enable offset printing for the first call to PIM functions.
lanhin Apr 9, 2025
7004727
Use row_segment to support 16 tasklets
mryvae Apr 11, 2025
46aaeba
use mul_table_int4_int8
mryvae Apr 14, 2025
70f9d88
Merge pull request #3 from lanhin/tensor_export_turnoff
lanhin Apr 16, 2025
c2b40d5
add pim-llm-framework
mryvae Apr 17, 2025
63db21b
Added PIM-tensorStore to the project.
mryvae Apr 18, 2025
5bda436
delete pim dir & rebuild ts.cpp
mryvae Apr 18, 2025
cbc8add
add ts-multi-thread
mryvae Apr 21, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -1,3 +1,6 @@
[submodule "kompute"]
path = ggml/src/kompute
url = https://github.com/nomic-ai/kompute.git
[submodule "PIM-tensorStore"]
path = PIM-tensorStore
url = https://github.com/mryvae/PIM-tensorStore.git
95 changes: 91 additions & 4 deletions Makefile
@@ -36,6 +36,9 @@ BUILD_TARGETS = \
llama-simple \
llama-speculative \
llama-tokenize \
llama-ts \
llama-ts-rebuild \
llama-ts-multi-thread \
llama-vdot \
llama-cvector-generator \
llama-gen-docs \
@@ -118,6 +121,11 @@ GGML_OPENBLAS := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_PIM
PIM_KERNEL := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_OPENBLAS64
GGML_OPENBLAS64 := 1
DEPRECATE_WARNING := 1
@@ -553,6 +561,12 @@ ifndef GGML_NO_OPENMP
endif # GGML_MUSA
endif # GGML_NO_OPENMP

ifdef PIM_KERNEL
MK_CPPFLAGS += -DPIM_KERNEL --std=c++11 `dpu-pkg-config --cflags --libs dpu`
MK_CFLAGS += -DPIM_KERNEL -Wall -Wextra `dpu-pkg-config --cflags --libs dpu`
MK_LDFLAGS += `dpu-pkg-config --libs dpu`
endif # PIM_KERNEL

ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -922,6 +936,7 @@ OBJ_GGML += \

OBJ_LLAMA = \
src/llama.o \
src/trace_driver.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
@@ -939,7 +954,15 @@ OBJ_COMMON = \
common/build-info.o \
common/json-schema-to-grammar.o

OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
OBJ_PIM_LLM = \
PIM-tensorStore/host/mm/pim_mm.o \
PIM-tensorStore/host/mm/pim_direct_comm.o \
PIM-tensorStore/host/msg/msg_block.o \
PIM-tensorStore/host/msg/msg_buffer.o \
PIM-tensorStore/host/msg/msg_comm.o \
PIM-tensorStore/host/util/util.o

OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) $(OBJ_PIM_LLM)

LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
LIB_GGML_S = $(LIB_PRE)ggml.a
@@ -1018,6 +1041,7 @@ $(info - LLAMA_RPC)
$(info - LLAMA_SYCL)
$(info - LLAMA_SYCL_F16)
$(info - LLAMA_OPENBLAS)
$(info - LLAMA_PIM)
$(info - LLAMA_OPENBLAS64)
$(info - LLAMA_BLIS)
$(info - LLAMA_NO_LLAMAFILE)
@@ -1129,6 +1153,11 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/trace_driver.o: \
src/trace_driver.c \
include/trace_driver.h
$(CC) $(CFLAGS) -c $< -o $@

src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
@@ -1219,12 +1248,55 @@ $(LIB_COMMON_S): \
$(OBJ_COMMON)
ar rcs $(LIB_COMMON_S) $^

# pim_llm

PIM-tensorStore/host/mm/pim_mm.o: \
PIM-tensorStore/host/mm/pim_mm.c \
PIM-tensorStore/host/mm/pim_mm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/mm/pim_direct_comm.o: \
PIM-tensorStore/host/mm/pim_direct_comm.c \
PIM-tensorStore/host/mm/pim_direct_comm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_block.o: \
PIM-tensorStore/host/msg/msg_block.c \
PIM-tensorStore/host/msg/msg_block.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_buffer.o: \
PIM-tensorStore/host/msg/msg_buffer.c \
PIM-tensorStore/host/msg/msg_buffer.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_comm.o: \
PIM-tensorStore/host/msg/msg_comm.c \
PIM-tensorStore/host/msg/msg_comm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/util/util.o : \
PIM-tensorStore/host/util/util.c \
PIM-tensorStore/host/util/util.h
$(CC) $(CFLAGS) -c $< -o $@

$(LIB_PIM_LLM): \
$(OBJ_PIM_LLM)
$(CC) $(CFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

$(LIB_PIM_LLM_S): \
$(OBJ_PIM_LLM)
ar rcs $(LIB_PIM_LLM_S) $^

clean:
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf PIM-tensorStore/host/mm/*.o
rm -rvf PIM-tensorStore/host/msg/*.o
rm -rvf PIM-tensorStore/host/util/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
@@ -1263,6 +1335,21 @@ llama-cli: examples/main/main.cpp \
@echo '==== Run ./llama-cli -h for help. ===='
@echo

llama-ts: examples/tensor/ts.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-ts-rebuild: examples/tensor/ts-rebuild.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-ts-multi-thread: examples/tensor/ts-multi-thread.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1324,7 +1411,7 @@ llama-save-load-state: examples/save-load-state/save-load-state.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf: examples/gguf/gguf.cpp \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1630,12 +1717,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
#

llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

1 change: 1 addition & 0 deletions PIM-tensorStore
Submodule PIM-tensorStore added at f72c77
79 changes: 79 additions & 0 deletions PIM_README.md
@@ -0,0 +1,79 @@
# llama.cpp (PIM branch)

## 1. Build llama.cpp for PIM
Make sure your PIM environment (e.g. UPMEM) is already set up correctly. Then try:
```
cd llama.cpp
make LLAMA_PIM=1
# make LLAMA_PIM=1 -j

# clean:
# make clean
```

## 2. Run llama.cpp with PIM
Prepare your model files as the original README.md describes. A 4-bit-quantized model in GGUF format is preferred.

```
./llama-cli -m /mnt/LLM-models/chinese-alpaca-2-7b/gguf/chinese-alpaca-7b_q4_0.gguf \
--temp 0 -t 1 --no-warmup -p "列举5个北京经典美食。只列举名字,不要介绍。"
```

This may output something like:
```shell
...
sampler seed: 4294967295
sampler params:
repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.000
mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> greedy
generate: n_ctx = 4096, n_batch = 2048, n_predict = -1, n_keep = 1

列举5个北京经典美食。只列举名字,不要介绍。1. 烤鸭 2. 炸酱面 3. 豆汁 4. 羊蝎子 5. 驴打滚 [end of text]


llama_perf_sampler_print: sampling time = 1.02 ms / 49 runs ( 0.02 ms per token, 47804.88 tokens per second)
llama_perf_context_print: load time = 4097.04 ms
llama_perf_context_print: prompt eval time = 2966.36 ms / 16 tokens ( 185.40 ms per token, 5.39 tokens per second)
llama_perf_context_print: eval time = 12105.60 ms / 32 runs ( 378.30 ms per token, 2.64 tokens per second)
llama_perf_context_print: total time = 16206.10 ms / 48 tokens

```

## 3. llama-ts for tensor test
A set of tensor utility functions has been implemented (as described in `include/trace_driver.h`), and `examples/tensor/ts.cpp` is a good starting point for learning how to import tensors from data files and operate on them.

Some snippets in `ggml/src/ggml.c` show how to export a tensor into a data file, for example:
```c
#include "trace_driver.h"

struct ggml_tensor * src0 = ...
...
const char* filenamea = "a.tensor";
tensor_export(src0, filenamea);
```

`examples/tensor/ts.cpp` is built into `llama-ts` by the `make` command above.
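
For orientation, here is a minimal sketch of the kind of round trip such a test performs. It assumes an import counterpart to `tensor_export`; the `tensor_import` name and signature below are hypothetical, so check `include/trace_driver.h` and `examples/tensor/ts.cpp` for the actual interface.

```c
#include "trace_driver.h"
#include "ggml.h"

int main(void) {
    // Hypothetical import counterpart to tensor_export();
    // the real name/signature is declared in include/trace_driver.h.
    struct ggml_tensor * a = tensor_import("a.tensor");
    struct ggml_tensor * b = tensor_import("b.tensor");

    // Operate on the imported tensors, e.g. run the CPU reference op
    // and compare the result against a dump produced by the PIM path.
    (void) a;
    (void) b;
    return 0;
}
```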


## 4. More details
### 4.1 How we control the model layers computed on PIM
There are several macros defined in `include/llama.h` that control the behavior of llama-cli:

```c++
#ifdef PIM_KERNEL
#define NR_DPUS 64 //Number of DPUs to execute the kernel
#define NR_LAYER 2 //Number of transformer layers to offload
#define DPU_BINARY "./dpu/gemv_dpu"
...
#endif // PIM_KERNEL
```
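
These macros only define the configuration; the dispatch itself lives in the PIM-modified sources (e.g. `src/llama.cpp`). Conceptually (a hedged sketch under that assumption, not the actual code), the first `NR_LAYER` transformer layers keep their weight matrices on the `NR_DPUS` DPUs, and their matrix-vector products are routed to the PIM path:

```c
// Hedged sketch only; the real routing lives in the PIM-modified llama.cpp code.
// Layers [0, NR_LAYER) are assumed to have their weights resident on the DPUs.
static int layer_runs_on_pim(int layer_id) {
    return layer_id < NR_LAYER;
}

// During decoding, a per-layer GEMV would then be dispatched roughly as:
//
//   if (layer_runs_on_pim(il)) {
//       // send the input vector to the NR_DPUS DPUs (loaded from DPU_BINARY)
//       // and gather their results
//   } else {
//       // fall back to the regular ggml CPU matmul
//   }
```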

### 4.2 The PIM function(s) implementation
The PIM binary `dpu/gemv_dpu` is built from `dpu/dpu_main.c` by typing:
```shell
cd dpu
./pim_build.sh
```
Check `dpu/dpu_main.c` to find out how the kernel is implemented.
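
For readers who just want the shape of the kernel: a UPMEM GEMV kernel typically gives each of the (up to 16) tasklets a contiguous segment of the weight rows stored in the DPU's MRAM, streams each row into WRAM, and accumulates a dot product. The following is only an illustrative sketch of that pattern; the buffer layout, sizes, and plain int8 arithmetic are assumptions for the example, not a transcription of `dpu/dpu_main.c`.

```c
// Illustrative tasklet-parallel GEMV sketch for a UPMEM DPU.
// Names, shapes and data types are assumptions; see dpu/dpu_main.c for the real kernel.
#include <stdint.h>
#include <defs.h>   // me(), __dma_aligned
#include <mram.h>   // mram_read(), mram_write()

// NR_TASKLETS is normally supplied at build time, e.g. -DNR_TASKLETS=16.
#define ROWS 64                               // weight rows held by this DPU
#define COLS 256                              // input length in bytes (multiple of 8)
#define ROWS_PER_TASKLET (ROWS / NR_TASKLETS)

__mram_noinit int8_t  weights[ROWS][COLS];    // per-DPU weight slice
__mram_noinit int8_t  input[COLS];            // broadcast input vector
__mram_noinit int32_t output[ROWS];           // one result per row

int main(void) {
    const unsigned tid  = me();                    // tasklet id, 0..NR_TASKLETS-1
    const unsigned row0 = tid * ROWS_PER_TASKLET;  // this tasklet's row segment

    __dma_aligned int8_t  in_buf[COLS];
    __dma_aligned int8_t  row_buf[COLS];
    __dma_aligned int32_t out_buf[ROWS_PER_TASKLET];

    mram_read(input, in_buf, COLS);                // cache the input vector in WRAM

    for (unsigned i = 0; i < ROWS_PER_TASKLET; i++) {
        mram_read(weights[row0 + i], row_buf, COLS);
        int32_t acc = 0;
        for (unsigned c = 0; c < COLS; c++) {
            acc += (int32_t) row_buf[c] * (int32_t) in_buf[c];
        }
        out_buf[i] = acc;
    }

    // One MRAM write per tasklet; the sizes chosen above keep it 8-byte aligned.
    mram_write(out_buf, &output[row0], ROWS_PER_TASKLET * sizeof(int32_t));
    return 0;
}
```

The host side loads this binary onto the DPUs (see `DPU_BINARY` in section 4.1) and collects the per-DPU results after the kernel finishes.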
4 changes: 4 additions & 0 deletions common/common.cpp
@@ -888,6 +888,10 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

#ifdef PIM_KERNEL
llama_load2dpu(lctx, model);
#endif // PIM_KERNEL

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);