33 commits
f472d76
Add parallelised gemv op
Nov 25, 2024
ef1c89b
Add graph building in QKV calculation and in FFN (up and gate).
Nov 26, 2024
2b4b28a
Add single (non-parallel) GEMV op for PIM.
Nov 26, 2024
f970a90
Fix build error
Nov 27, 2024
51e92c1
Utility for tensor export/import
Dec 4, 2024
a155d5e
Fix and add llama-ts for tensor import and dump test
Dec 4, 2024
3793749
Add mul_add for the 1st element of two quantized tensors
Dec 4, 2024
18bbe76
Build succeeds for the host app with PIM_KERNEL enabled.
Dec 17, 2024
a667bd6
Add dpu_main.c for gemv computing on DPU.
Dec 19, 2024
b52ebd1
UPMEM gemv op runs for a single call in each generate iteration. But …
Jan 11, 2025
093131b
Friendlier log printing; tensor diff function; bug fix for weight t…
Jan 13, 2025
9cd4190
Bug fix for DPU kernel weight offset and layer id for multiple layers.
Jan 15, 2025
822a534
Update PIM_README.md and polish some formatting.
Jan 23, 2025
fa98763
Update PIM_README.md
Jan 26, 2025
a7b3792
Refactor ts.cpp as a scaffold for the DPU kernel
Feb 8, 2025
bff5aae
Complete multi-threaded parallelism on the DPU & add a tensor error-comparison function
nonestack Feb 20, 2025
4541374
Added some comments
nonestack Mar 5, 2025
80572b3
Merge pull request #1 from lanhin/dev_pim_multithread
lanhin Mar 31, 2025
c9a3797
Add measurement of CPU execution time
nonestack Apr 10, 2025
3ec476b
Merge pull request #2 from lanhin/dev_pim_multithread
lanhin Apr 14, 2025
f835014
Turn off tensor_export by a temp macro TENSOR_EXPORT
lanhin Apr 7, 2025
867c4d0
Fix compilation when PIM_KERNEL is undefined
lanhin Apr 7, 2025
5c2859b
add PIM_DEBUG_PERF_PRINT
mryvae Apr 8, 2025
e41e802
Multi-threading for PIM calculation.
lanhin Apr 9, 2025
08cdc28
Fix ts compile when PIM_KERNEL is off
lanhin Apr 9, 2025
3fa5935
Enable offset printing for the first call to PIM functions.
lanhin Apr 9, 2025
7004727
Use row_segment to support 16 tasklets
mryvae Apr 11, 2025
46aaeba
use mul_table_int4_int8
mryvae Apr 14, 2025
70f9d88
Merge pull request #3 from lanhin/tensor_export_turnoff
lanhin Apr 16, 2025
c2b40d5
add pim-llm-framework
mryvae Apr 17, 2025
63db21b
Added PIM-tensorStore to the project.
mryvae Apr 18, 2025
5bda436
delete pim dir & rebuild ts.cpp
mryvae Apr 18, 2025
cbc8add
add ts-multi-thread
mryvae Apr 21, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -1,3 +1,6 @@
[submodule "kompute"]
path = ggml/src/kompute
url = https://github.com/nomic-ai/kompute.git
[submodule "PIM-tensorStore"]
path = PIM-tensorStore
url = https://github.com/mryvae/PIM-tensorStore.git
95 changes: 91 additions & 4 deletions Makefile
@@ -36,6 +36,9 @@ BUILD_TARGETS = \
llama-simple \
llama-speculative \
llama-tokenize \
llama-ts \
llama-ts-rebuild \
llama-ts-multi-thread \
llama-vdot \
llama-cvector-generator \
llama-gen-docs \
@@ -118,6 +121,11 @@ GGML_OPENBLAS := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_PIM
PIM_KERNEL := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_OPENBLAS64
GGML_OPENBLAS64 := 1
DEPRECATE_WARNING := 1
@@ -553,6 +561,12 @@ ifndef GGML_NO_OPENMP
endif # GGML_MUSA
endif # GGML_NO_OPENMP

ifdef PIM_KERNEL
MK_CPPFLAGS += -DPIM_KERNEL --std=c++11 `dpu-pkg-config --cflags --libs dpu`
MK_CFLAGS += -DPIM_KERNEL -Wall -Wextra `dpu-pkg-config --cflags --libs dpu`
MK_LDFLAGS += `dpu-pkg-config --libs dpu`
endif # PIM_KERNEL

ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -922,6 +936,7 @@ OBJ_GGML += \

OBJ_LLAMA = \
src/llama.o \
src/trace_driver.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
@@ -939,7 +954,15 @@ OBJ_COMMON = \
common/build-info.o \
common/json-schema-to-grammar.o

OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
OBJ_PIM_LLM = \
PIM-tensorStore/host/mm/pim_mm.o \
PIM-tensorStore/host/mm/pim_direct_comm.o \
PIM-tensorStore/host/msg/msg_block.o \
PIM-tensorStore/host/msg/msg_buffer.o \
PIM-tensorStore/host/msg/msg_comm.o \
PIM-tensorStore/host/util/util.o

OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) $(OBJ_PIM_LLM)

LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
LIB_GGML_S = $(LIB_PRE)ggml.a
@@ -1018,6 +1041,7 @@ $(info - LLAMA_RPC)
$(info - LLAMA_SYCL)
$(info - LLAMA_SYCL_F16)
$(info - LLAMA_OPENBLAS)
$(info - LLAMA_PIM)
$(info - LLAMA_OPENBLAS64)
$(info - LLAMA_BLIS)
$(info - LLAMA_NO_LLAMAFILE)
@@ -1129,6 +1153,11 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/trace_driver.o: \
src/trace_driver.c \
include/trace_driver.h
$(CC) $(CFLAGS) -c $< -o $@

src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
@@ -1219,12 +1248,55 @@ $(LIB_COMMON_S): \
$(OBJ_COMMON)
ar rcs $(LIB_COMMON_S) $^

# pim_llm

PIM-tensorStore/host/mm/pim_mm.o: \
PIM-tensorStore/host/mm/pim_mm.c \
PIM-tensorStore/host/mm/pim_mm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/mm/pim_direct_comm.o: \
PIM-tensorStore/host/mm/pim_direct_comm.c \
PIM-tensorStore/host/mm/pim_direct_comm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_block.o: \
PIM-tensorStore/host/msg/msg_block.c \
PIM-tensorStore/host/msg/msg_block.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_buffer.o: \
PIM-tensorStore/host/msg/msg_buffer.c \
PIM-tensorStore/host/msg/msg_buffer.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/msg/msg_comm.o: \
PIM-tensorStore/host/msg/msg_comm.c \
PIM-tensorStore/host/msg/msg_comm.h
$(CC) $(CFLAGS) -c $< -o $@

PIM-tensorStore/host/util/util.o : \
PIM-tensorStore/host/util/util.c \
PIM-tensorStore/host/util/util.h
$(CC) $(CFLAGS) -c $< -o $@

$(LIB_PIM_LLM): \
$(OBJ_PIM_LLM)
$(CC) $(CFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

$(LIB_PIM_LLM_S): \
$(OBJ_PIM_LLM)
ar rcs $(LIB_PIM_LLM_S) $^

clean:
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf PIM-tensorStore/host/mm/*.o
rm -rvf PIM-tensorStore/host/msg/*.o
rm -rvf PIM-tensorStore/host/util/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
@@ -1263,6 +1335,21 @@ llama-cli: examples/main/main.cpp \
@echo '==== Run ./llama-cli -h for help. ===='
@echo

llama-ts: examples/tensor/ts.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-ts-rebuild: examples/tensor/ts-rebuild.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-ts-multi-thread: examples/tensor/ts-multi-thread.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1324,7 +1411,7 @@ llama-save-load-state: examples/save-load-state/save-load-state.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf: examples/gguf/gguf.cpp \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1630,12 +1717,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
#

llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(OBJ_GGML) src/trace_driver.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

1 change: 1 addition & 0 deletions PIM-tensorStore
Submodule PIM-tensorStore added at f72c77
79 changes: 79 additions & 0 deletions PIM_README.md
@@ -0,0 +1,79 @@
# llama.cpp (PIM branch)

## 1. Build llama.cpp for PIM
Make sure your PIM environment (e.g. UPMEM) is already set up correctly. Then try:
```
cd llama.cpp
make LLAMA_PIM=1
# make LLAMA_PIM=1 -j

# clean:
# make clean
```

## 2. Run llama.cpp with PIM
Prepare your model files as the original README.md describes. A 4-bit-quantized model in GGUF format is preferred.

```
./llama-cli -m /mnt/LLM-models/chinese-alpaca-2-7b/gguf/chinese-alpaca-7b_q4_0.gguf \
--temp 0 -t 1 --no-warmup -p "列举5个北京经典美食。只列举名字,不要介绍。"
```

This may output something like:
```shell
...
sampler seed: 4294967295
sampler params:
repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.000
mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> greedy
generate: n_ctx = 4096, n_batch = 2048, n_predict = -1, n_keep = 1

列举5个北京经典美食。只列举名字,不要介绍。1. 烤鸭 2. 炸酱面 3. 豆汁 4. 羊蝎子 5. 驴打滚 [end of text]


llama_perf_sampler_print: sampling time = 1.02 ms / 49 runs ( 0.02 ms per token, 47804.88 tokens per second)
llama_perf_context_print: load time = 4097.04 ms
llama_perf_context_print: prompt eval time = 2966.36 ms / 16 tokens ( 185.40 ms per token, 5.39 tokens per second)
llama_perf_context_print: eval time = 12105.60 ms / 32 runs ( 378.30 ms per token, 2.64 tokens per second)
llama_perf_context_print: total time = 16206.10 ms / 48 tokens

```

## 3. llama-ts for tensor test
A set of tensor utility functions has been implemented (as described in `include/trace_driver.h`), and `examples/tensor/ts.cpp` is a good starting point for learning how to import tensors from data files and operate on them.

Some snippets in `ggml/src/ggml.c` show how to export a tensor into a data file, for example:
```c
#include "trace_driver.h"

struct ggml_tensor * src0 = ...
...
const char* filenamea = "a.tensor";
tensor_export(src0, filenamea);
```

`examples/tensor/ts.cpp` is built into `llama-ts` by the `make` command above.
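
For orientation, here is a minimal sketch of the kind of round trip such a test performs. It assumes an import counterpart to `tensor_export`; the `tensor_import` name and signature below are hypothetical, so check `include/trace_driver.h` and `examples/tensor/ts.cpp` for the actual interface.

```c
#include "trace_driver.h"
#include "ggml.h"

int main(void) {
    // Hypothetical import counterpart to tensor_export();
    // the real name/signature is declared in include/trace_driver.h.
    struct ggml_tensor * a = tensor_import("a.tensor");
    struct ggml_tensor * b = tensor_import("b.tensor");

    // Operate on the imported tensors, e.g. run the CPU reference op
    // and compare the result against a dump produced by the PIM path.
    (void) a;
    (void) b;
    return 0;
}
```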


## 4. More details
### 4.1 How we control the model layers computed on PIM
There are several macros defined in `include/llama.h` that control the behavior of llama-cli:

```c++
#ifdef PIM_KERNEL
#define NR_DPUS 64 //Number of DPUs to execute the kernel
#define NR_LAYER 2 //Number of transformer layers to offload
#define DPU_BINARY "./dpu/gemv_dpu"
...
#endif // PIM_KERNEL
```
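
These macros only define the configuration; the dispatch itself lives in the PIM-modified sources (e.g. `src/llama.cpp`). Conceptually (a hedged sketch under that assumption, not the actual code), the first `NR_LAYER` transformer layers keep their weight matrices on the `NR_DPUS` DPUs, and their matrix-vector products are routed to the PIM path:

```c
// Hedged sketch only; the real routing lives in the PIM-modified llama.cpp code.
// Layers [0, NR_LAYER) are assumed to have their weights resident on the DPUs.
static int layer_runs_on_pim(int layer_id) {
    return layer_id < NR_LAYER;
}

// During decoding, a per-layer GEMV would then be dispatched roughly as:
//
//   if (layer_runs_on_pim(il)) {
//       // send the input vector to the NR_DPUS DPUs (loaded from DPU_BINARY)
//       // and gather their results
//   } else {
//       // fall back to the regular ggml CPU matmul
//   }
```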

### 4.2 The PIM function(s) implementation
The PIM binary `dpu/gemv_dpu` is built from `dpu/dpu_main.c` by typing:
```shell
cd dpu
./pim_build.sh
```
Check `dpu/dpu_main.c` to find out how the kernel is implemented.
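
For readers who just want the shape of the kernel: a UPMEM GEMV kernel typically gives each of the (up to 16) tasklets a contiguous segment of the weight rows stored in the DPU's MRAM, streams each row into WRAM, and accumulates a dot product. The following is only an illustrative sketch of that pattern; the buffer layout, sizes, and plain int8 arithmetic are assumptions for the example, not a transcription of `dpu/dpu_main.c`.

```c
// Illustrative tasklet-parallel GEMV sketch for a UPMEM DPU.
// Names, shapes and data types are assumptions; see dpu/dpu_main.c for the real kernel.
#include <stdint.h>
#include <defs.h>   // me(), __dma_aligned
#include <mram.h>   // mram_read(), mram_write()

// NR_TASKLETS is normally supplied at build time, e.g. -DNR_TASKLETS=16.
#define ROWS 64                               // weight rows held by this DPU
#define COLS 256                              // input length in bytes (multiple of 8)
#define ROWS_PER_TASKLET (ROWS / NR_TASKLETS)

__mram_noinit int8_t  weights[ROWS][COLS];    // per-DPU weight slice
__mram_noinit int8_t  input[COLS];            // broadcast input vector
__mram_noinit int32_t output[ROWS];           // one result per row

int main(void) {
    const unsigned tid  = me();                    // tasklet id, 0..NR_TASKLETS-1
    const unsigned row0 = tid * ROWS_PER_TASKLET;  // this tasklet's row segment

    __dma_aligned int8_t  in_buf[COLS];
    __dma_aligned int8_t  row_buf[COLS];
    __dma_aligned int32_t out_buf[ROWS_PER_TASKLET];

    mram_read(input, in_buf, COLS);                // cache the input vector in WRAM

    for (unsigned i = 0; i < ROWS_PER_TASKLET; i++) {
        mram_read(weights[row0 + i], row_buf, COLS);
        int32_t acc = 0;
        for (unsigned c = 0; c < COLS; c++) {
            acc += (int32_t) row_buf[c] * (int32_t) in_buf[c];
        }
        out_buf[i] = acc;
    }

    // One MRAM write per tasklet; the sizes chosen above keep it 8-byte aligned.
    mram_write(out_buf, &output[row0], ROWS_PER_TASKLET * sizeof(int32_t));
    return 0;
}
```

The host side loads this binary onto the DPUs (see `DPU_BINARY` in section 4.1) and collects the per-DPU results after the kernel finishes.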
4 changes: 4 additions & 0 deletions common/common.cpp
@@ -888,6 +888,10 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

#ifdef PIM_KERNEL
llama_load2dpu(lctx, model);
#endif // PIM_KERNEL

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);