diff --git a/Makefile b/Makefile
index 1f9455eff0aec..b64728b4f8636 100644
--- a/Makefile
+++ b/Makefile
@@ -2,9 +2,12 @@ ifndef LLAMA_MAKEFILE
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
endif
+# Modified by Brad Hutchings to build llama.cpp targets correctly and to support building with cosmocc.
+
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
libllava.a \
+ llama-server \
llama-batched \
llama-batched-bench \
llama-bench \
@@ -36,7 +39,6 @@ BUILD_TARGETS = \
llama-quantize-stats \
llama-retrieval \
llama-save-load-state \
- llama-server \
llama-simple \
llama-simple-chat \
llama-run \
@@ -258,7 +260,7 @@ endif
#
# keep standard at C11 and C++17
-MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -I. -DGGML_USE_CPU
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++17 -fPIC
MK_NVCCFLAGS = -std=c++17
@@ -370,27 +372,66 @@ ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif
-# warnings
-WARN_FLAGS = \
- -Wall \
- -Wextra \
- -Wpedantic \
- -Wcast-qual \
- -Wno-unused-function
-
-MK_CFLAGS += \
- $(WARN_FLAGS) \
- -Wshadow \
- -Wstrict-prototypes \
- -Wpointer-arith \
- -Wmissing-prototypes \
- -Werror=implicit-int \
- -Werror=implicit-function-declaration
-
-MK_CXXFLAGS += \
- $(WARN_FLAGS) \
- -Wmissing-declarations \
- -Wmissing-noreturn
+ifeq ($(UNAME_S),cosmocc)
+$(info Setting MK_CFLAGS and MK_CXXFLAGS flags for cosmocc.)
+
+ WARN_FLAGS_ORIG = \
+ -Wall \
+ -Wextra \
+ -Wpedantic \
+ -Wcast-qual \
+ -Wno-unused-function
+
+ WARN_FLAGS = \
+ -Wcast-qual \
+ -Wno-unused-function
+
+ MK_CFLAGS += \
+ $(WARN_FLAGS) \
+ -Wshadow \
+ -Wstrict-prototypes \
+ -Wpointer-arith \
+ -Wmissing-prototypes \
+ -Werror=implicit-function-declaration \
+ -Wno-implicit-int \
+ -DCOSMOCC=1
+
+ MK_CXXFLAGS += \
+ $(WARN_FLAGS) \
+ -Wmissing-declarations \
+ -Wmissing-noreturn \
+ -Wno-literal-suffix \
+ -DCOSMOCC=1
+
+
+else
+$(info Using default MK_CFLAGS and MK_CXXFLAGS flags.)
+
+ # warnings
+ WARN_FLAGS = \
+ -Wall \
+ -Wextra \
+ -Wpedantic \
+ -Wcast-qual \
+ -Wno-unused-function
+
+ MK_CFLAGS += \
+ $(WARN_FLAGS) \
+ -Wshadow \
+ -Wstrict-prototypes \
+ -Wpointer-arith \
+ -Wmissing-prototypes \
+ -Werror=implicit-int \
+ -Werror=implicit-function-declaration
+
+ MK_CXXFLAGS += \
+ $(WARN_FLAGS) \
+ -Wmissing-declarations \
+ -Wmissing-noreturn
+
+endif
+
+
ifeq ($(LLAMA_FATAL_WARNINGS),1)
MK_CFLAGS += -Werror
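
Note: `uname -s` never reports `cosmocc`, so the branch above only takes effect when `UNAME_S` is overridden on the make command line together with the Cosmopolitan toolchain. A minimal invocation sketch, not part of the patch (the exact overrides and install paths are assumptions):

    # hypothetical cosmocc build; assumes cosmocc/cosmoc++ are on PATH
    make LLAMA_MAKEFILE=1 UNAME_S=cosmocc CC=cosmocc CXX=cosmoc++ llama-server

With `UNAME_S=cosmocc`, the reduced warning set and `-DCOSMOCC=1` above are applied to every C and C++ compile.
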
@@ -398,9 +439,11 @@ ifeq ($(LLAMA_FATAL_WARNINGS),1)
endif
# this version of Apple ld64 is buggy
+ifneq ($(UNAME_S),cosmocc)
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
endif
+endif
# OS specific
# TODO: support Windows
@@ -460,6 +503,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
#MK_CXXFLAGS += -mssse3
endif
+ifneq ($(UNAME_S),cosmocc)
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
@@ -470,6 +514,7 @@ ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# Target Windows 8 for PrefetchVirtualMemory
MK_CPPFLAGS += -D_WIN32_WINNT=0x602
endif
+endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
@@ -978,6 +1023,7 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
+ $(DIR_GGML)/src/gguf.o \
$(OBJ_GGML_EXT)
OBJ_LLAMA = \
@@ -985,9 +1031,25 @@ OBJ_LLAMA = \
$(DIR_LLAMA)/llama-vocab.o \
$(DIR_LLAMA)/llama-grammar.o \
$(DIR_LLAMA)/llama-sampling.o \
+ $(DIR_LLAMA)/llama-adapter.o \
+ $(DIR_LLAMA)/llama-arch.o \
+ $(DIR_LLAMA)/llama-batch.o \
+ $(DIR_LLAMA)/llama-chat.o \
+ $(DIR_LLAMA)/llama-context.o \
+ $(DIR_LLAMA)/llama-graph.o \
+ $(DIR_LLAMA)/llama-hparams.o \
+ $(DIR_LLAMA)/llama-impl.o \
+ $(DIR_LLAMA)/llama-io.o \
+ $(DIR_LLAMA)/llama-kv-cache.o \
+ $(DIR_LLAMA)/llama-mmap.o \
+ $(DIR_LLAMA)/llama-model.o \
+ $(DIR_LLAMA)/llama-model-loader.o \
+ $(DIR_LLAMA)/llama-quant.o \
$(DIR_LLAMA)/unicode.o \
$(DIR_LLAMA)/unicode-data.o
+# $(DIR_LLAMA)/llama-context.o \
+
OBJ_COMMON = \
$(DIR_COMMON)/common.o \
$(DIR_COMMON)/arg.o \
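
The expanded `OBJ_LLAMA` list mirrors the `llama-*.cpp` translation units that now live under `src/`. A quick sanity check that the list stays in sync with the tree (a shell sketch, not part of the patch):

    # derive the expected objects from the sources and compare with OBJ_LLAMA above
    ls src/llama*.cpp src/unicode*.cpp | sed 's/\.cpp$/.o/'
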
@@ -1049,8 +1111,10 @@ $(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
+ifneq ($(UNAME_S),cosmocc)
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
+endif
ifdef GGML_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
@@ -1196,7 +1260,14 @@ llama-infill: examples/infill/infill.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-run: examples/run/run.cpp \
+llama-run: \
+ examples/run/run.cpp \
+ examples/run/linenoise.cpp/linenoise.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-run-orig: examples/run/run.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
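
Both `llama-run` rules follow the Makefile's usual two-step pattern: compile the first prerequisite to its own object (so ccache can reuse it), then link everything except headers and that source. The object name comes from the `GET_OBJ_FILE` helper defined earlier in this Makefile and unchanged by this patch:

    GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
    # e.g. $(call GET_OBJ_FILE, examples/run/run.cpp) -> examples/run/run.o

So the new rule compiles run.cpp to run.o, then links linenoise.cpp/linenoise.cpp, $(OBJ_ALL), and run.o into llama-run; run.cpp itself and any %.h prerequisites are filtered out of the link line.
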
@@ -1370,6 +1441,20 @@ rpc-server: examples/rpc/rpc-server.cpp \
endif # GGML_RPC
llama-server: \
+ examples/server/server.cpp \
+ examples/server/httplib.h \
+ common/chat.h \
+ common/minja/chat-template.hpp \
+ common/json.hpp \
+	common/minja/minja.hpp \
+ $(OBJ_ALL)
+ cmake -DINPUT=examples/server/public/index.html.gz -DOUTPUT=examples/server/index.html.gz.hpp -P scripts/xxd.cmake
+ cmake -DINPUT=examples/server/public_legacy/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
+ cmake -DINPUT=examples/server/public_legacy/loading.html -DOUTPUT=examples/server/loading.html.hpp -P scripts/xxd.cmake
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+
+llama-server-orig: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
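
The rewritten `llama-server` rule regenerates the embedded web-UI headers with `scripts/xxd.cmake` before compiling, instead of depending on prebuilt `index.html.hpp`/`loading.html.hpp` files. Each generated header is just a byte array plus a length that `server.cpp` includes to serve the built-in web UI. A way to confirm the cmake step produced them (a sketch; the identifier names follow the `xxd -i` convention and are an assumption):

    # inspect a generated header after the cmake -P scripts/xxd.cmake commands run
    head -c 200 examples/server/index.html.gz.hpp
    # expected shape: unsigned char index_html_gz[] = {0x1f,0x8b,0x08, ...};
    #                 unsigned int  index_html_gz_len = <byte count>;
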
diff --git a/Makefile-llama-cpp-original b/Makefile-llama-cpp-original
new file mode 100644
index 0000000000000..1f9455eff0aec
--- /dev/null
+++ b/Makefile-llama-cpp-original
@@ -0,0 +1,1617 @@
+ifndef LLAMA_MAKEFILE
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
+endif
+
+# Define the default target now so that it is always the first target
+BUILD_TARGETS = \
+ libllava.a \
+ llama-batched \
+ llama-batched-bench \
+ llama-bench \
+ llama-cli \
+ llama-convert-llama2c-to-ggml \
+ llama-embedding \
+ llama-eval-callback \
+ llama-export-lora \
+ llama-gbnf-validator \
+ llama-gguf \
+ llama-gguf-hash \
+ llama-gguf-split \
+ llama-gritlm \
+ llama-imatrix \
+ llama-infill \
+ llama-llava-cli \
+ llama-minicpmv-cli\
+ llama-qwen2vl-cli\
+ llama-lookahead \
+ llama-lookup \
+ llama-lookup-create \
+ llama-lookup-merge \
+ llama-lookup-stats \
+ llama-parallel \
+ llama-passkey \
+ llama-perplexity \
+ llama-q8dot \
+ llama-quantize \
+ llama-quantize-stats \
+ llama-retrieval \
+ llama-save-load-state \
+ llama-server \
+ llama-simple \
+ llama-simple-chat \
+ llama-run \
+ llama-speculative \
+ llama-tokenize \
+ llama-vdot \
+ llama-cvector-generator \
+ llama-gen-docs \
+ tests/test-c.o
+
+# Binaries only useful for tests
+TEST_TARGETS = \
+ tests/test-arg-parser \
+ tests/test-autorelease \
+ tests/test-backend-ops \
+ tests/test-chat \
+ tests/test-chat-template \
+ tests/test-double-float \
+ tests/test-grammar-integration \
+ tests/test-grammar-parser \
+ tests/test-json-schema-to-grammar \
+ tests/test-llama-grammar \
+ tests/test-log \
+ tests/test-model-load-cancel \
+ tests/test-quantize-fns \
+ tests/test-quantize-perf \
+ tests/test-rope \
+ tests/test-sampling \
+ tests/test-tokenizer-0 \
+ tests/test-tokenizer-1-bpe \
+ tests/test-tokenizer-1-spm
+# tests/test-opt \
+
+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+ retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
+
+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
+
+# Deprecation aliases
+ifdef LLAMA_CUBLAS
+$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
+endif
+
+ifdef LLAMA_CUDA
+GGML_CUDA := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_KOMPUTE
+GGML_KOMPUTE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_METAL
+GGML_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_RPC
+GGML_RPC := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL
+GGML_SYCL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL_F16
+GGML_SYCL_F16 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS
+GGML_OPENBLAS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS64
+GGML_OPENBLAS64 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_BLIS
+GGML_BLIS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_LLAMAFILE
+GGML_NO_LLAMAFILE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_ACCELERATE
+GGML_NO_ACCELERATE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_OPENMP
+GGML_NO_OPENMP := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_METAL
+GGML_NO_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+ ifndef GGML_NO_METAL
+ GGML_METAL := 1
+ endif
+
+ GGML_NO_OPENMP := 1
+
+ ifneq ($(UNAME_P),arm)
+ SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+ ifeq ($(SYSCTL_M),1)
+ # UNAME_P := arm
+ # UNAME_M := arm64
+ warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+ endif
+ endif
+endif
+
+ifdef GGML_METAL
+ GGML_METAL_EMBED_LIBRARY := 1
+endif
+
+ifdef GGML_RPC
+ BUILD_TARGETS += rpc-server
+endif
+
+ifdef GGML_VULKAN
+ BUILD_TARGETS += vulkan-shaders-gen
+endif
+
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
+
+test: $(TEST_TARGETS)
+ @failures=0; \
+ for test_target in $(TEST_TARGETS); do \
+ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
+ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
+ continue; \
+ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+ continue; \
+ else \
+ echo "Running test $$test_target..."; \
+ ./$$test_target; \
+ fi; \
+ if [ $$? -ne 0 ]; then \
+ printf 'Test %s FAILED!\n\n' $$test_target; \
+ failures=$$(( failures + 1 )); \
+ else \
+ printf 'Test %s passed.\n\n' $$test_target; \
+ fi; \
+ done; \
+ if [ $$failures -gt 0 ]; then \
+ printf '\n%s tests failed.\n' $$failures; \
+ exit 1; \
+ fi
+ @echo 'All tests passed.'
+
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
+
+ifdef RISCV_CROSS_COMPILE
+CC := riscv64-unknown-linux-gnu-gcc
+CXX := riscv64-unknown-linux-gnu-g++
+endif
+
+#
+# Compile flags
+#
+
+# keep standard at C11 and C++17
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
+MK_CFLAGS = -std=c11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC
+MK_NVCCFLAGS = -std=c++17
+
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
+CCACHE := $(shell which ccache)
+ifdef CCACHE
+export CCACHE_SLOPPINESS = time_macros
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
+CC := $(CCACHE) $(CC)
+CXX := $(CCACHE) $(CXX)
+else
+$(info I ccache not found. Consider installing it for faster compilation.)
+endif # CCACHE
+endif # GGML_NO_CCACHE
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+MK_CPPFLAGS += -D_XOPEN_SOURCE=600
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+ifeq ($(UNAME_S),OpenBSD)
+ MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+endif
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+ifeq ($(UNAME_S),Linux)
+ MK_CPPFLAGS += -D_GNU_SOURCE
+ MK_LDFLAGS += -ldl
+endif
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+ifeq ($(UNAME_S),Darwin)
+ MK_CPPFLAGS += -D_DARWIN_C_SOURCE
+endif
+ifeq ($(UNAME_S),DragonFly)
+ MK_CPPFLAGS += -D__BSD_VISIBLE
+endif
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+ifeq ($(UNAME_S),FreeBSD)
+ MK_CPPFLAGS += -D__BSD_VISIBLE
+endif
+ifeq ($(UNAME_S),NetBSD)
+ MK_CPPFLAGS += -D_NETBSD_SOURCE
+endif
+ifeq ($(UNAME_S),OpenBSD)
+ MK_CPPFLAGS += -D_BSD_SOURCE
+endif
+
+ifdef GGML_SCHED_MAX_COPIES
+ MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
+endif
+
+ifdef LLAMA_DEBUG
+ MK_CFLAGS += -O0 -g
+ MK_CXXFLAGS += -O0 -g
+ MK_LDFLAGS += -g
+ MK_NVCCFLAGS += -O0 -g
+
+ ifeq ($(UNAME_S),Linux)
+ MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
+ endif
+else
+ MK_CPPFLAGS += -DNDEBUG
+ MK_CFLAGS += -O3 -g
+ MK_CXXFLAGS += -O3 -g
+ MK_NVCCFLAGS += -O3 -g
+endif
+
+ifdef LLAMA_SANITIZE_THREAD
+ MK_CFLAGS += -fsanitize=thread -g
+ MK_CXXFLAGS += -fsanitize=thread -g
+ MK_LDFLAGS += -fsanitize=thread -g
+endif
+
+ifdef LLAMA_SANITIZE_ADDRESS
+ MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+ MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+ MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+endif
+
+ifdef LLAMA_SANITIZE_UNDEFINED
+ MK_CFLAGS += -fsanitize=undefined -g
+ MK_CXXFLAGS += -fsanitize=undefined -g
+ MK_LDFLAGS += -fsanitize=undefined -g
+endif
+
+ifdef LLAMA_SERVER_SSL
+ MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+ MK_LDFLAGS += -lssl -lcrypto
+endif
+
+ifndef GGML_NO_CPU_AARCH64
+ MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
+# warnings
+WARN_FLAGS = \
+ -Wall \
+ -Wextra \
+ -Wpedantic \
+ -Wcast-qual \
+ -Wno-unused-function
+
+MK_CFLAGS += \
+ $(WARN_FLAGS) \
+ -Wshadow \
+ -Wstrict-prototypes \
+ -Wpointer-arith \
+ -Wmissing-prototypes \
+ -Werror=implicit-int \
+ -Werror=implicit-function-declaration
+
+MK_CXXFLAGS += \
+ $(WARN_FLAGS) \
+ -Wmissing-declarations \
+ -Wmissing-noreturn
+
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+ MK_CFLAGS += -Werror
+ MK_CXXFLAGS += -Werror
+endif
+
+# this version of Apple ld64 is buggy
+ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
+ MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
+endif
+
+# OS specific
+# TODO: support Windows
+ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
+ MK_CFLAGS += -pthread
+ MK_CXXFLAGS += -pthread
+endif
+
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+ _WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+ LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+ DSO_EXT := .so
+else
+ DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+ LWINSOCK2 := -lws2_32
+endif
+
+ifdef LLAMA_GPROF
+ MK_CFLAGS += -pg
+ MK_CXXFLAGS += -pg
+endif
+
+# Architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+# feel free to update the Makefile for your architecture and send a pull request or issue
+
+ifndef RISCV_CROSS_COMPILE
+
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
+ # Use all CPU extensions that are available:
+ MK_CFLAGS += -march=native -mtune=native
+ HOST_CXXFLAGS += -march=native -mtune=native
+
+ # Usage AMX build test
+ #MK_CFLAGS += -march=graniterapids -mtune=graniterapids
+ #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
+ # Usage AVX-only
+ #MK_CFLAGS += -mfma -mf16c -mavx
+ #MK_CXXFLAGS += -mfma -mf16c -mavx
+
+ # Usage SSSE3-only (Not is SSE3!)
+ #MK_CFLAGS += -mssse3
+ #MK_CXXFLAGS += -mssse3
+endif
+
+ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+ # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+ # https://github.com/ggml-org/llama.cpp/issues/2922
+ MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
+ MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+
+ # Target Windows 8 for PrefetchVirtualMemory
+ MK_CPPFLAGS += -D_WIN32_WINNT=0x602
+endif
+
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+ # Apple M1, M2, etc.
+ # Raspberry Pi 3, 4, Zero 2 (64-bit)
+ # Nvidia Jetson
+ MK_CFLAGS += -mcpu=native
+ MK_CXXFLAGS += -mcpu=native
+ JETSON_RELEASE_INFO = $(shell jetson_release)
+ ifdef JETSON_RELEASE_INFO
+ ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
+ JETSON_EOL_MODULE_DETECT = 1
+ CC = aarch64-unknown-linux-gnu-gcc
+ cxx = aarch64-unknown-linux-gnu-g++
+ endif
+ endif
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+ # Raspberry Pi 1, Zero
+ MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+ MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+ # Raspberry Pi 2
+ MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+ MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+ # Raspberry Pi 3, 4, Zero 2 (32-bit)
+ MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+ MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter ppc64%,$(UNAME_M)),)
+ POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+ ifneq (,$(findstring POWER9,$(POWER9_M)))
+ MK_CFLAGS += -mcpu=power9
+ MK_CXXFLAGS += -mcpu=power9
+ endif
+endif
+
+ifneq ($(filter ppc64le%,$(UNAME_M)),)
+ MK_CFLAGS += -mcpu=powerpc64le
+ MK_CXXFLAGS += -mcpu=powerpc64le
+ CUDA_POWER_ARCH = 1
+endif
+
+ifneq ($(filter loongarch64%,$(UNAME_M)),)
+ MK_CFLAGS += -mlasx
+ MK_CXXFLAGS += -mlasx
+endif
+
+ifneq ($(filter riscv64%,$(UNAME_M)),)
+ MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+ MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+else # RISC-V CROSS COMPILATION
+ MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+ MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+ifndef GGML_NO_ACCELERATE
+ # Mac OS - include Accelerate framework.
+ # `-framework Accelerate` works both with Apple Silicon and Mac Intel
+ ifeq ($(UNAME_S),Darwin)
+ MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+ MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+ MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
+ MK_LDFLAGS += -framework Accelerate
+ OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+ endif
+endif # GGML_NO_ACCELERATE
+
+ifndef GGML_NO_OPENMP
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
+ MK_CFLAGS += -fopenmp
+ MK_CXXFLAGS += -fopenmp
+endif # GGML_NO_OPENMP
+
+ifdef GGML_OPENBLAS
+ MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
+ MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
+ MK_LDFLAGS += $(shell pkg-config --libs openblas)
+ OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_OPENBLAS
+
+ifdef GGML_OPENBLAS64
+ MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+ MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
+ MK_LDFLAGS += $(shell pkg-config --libs openblas64)
+ OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_OPENBLAS64
+
+ifdef GGML_BLIS
+ MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
+ MK_LDFLAGS += -lblis -L/usr/local/lib
+ OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_BLIS
+
+ifdef GGML_NVPL
+ MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+ MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+ OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_NVPL
+
+ifndef GGML_NO_LLAMAFILE
+ MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+ OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
+endif
+
+ifndef GGML_NO_AMX
+ MK_CPPFLAGS += -DGGML_USE_AMX
+ OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
+endif
+
+# only necessary for the CPU backend files
+MK_CPPFLAGS += -Iggml/src/ggml-cpu
+
+ifdef GGML_RPC
+ MK_CPPFLAGS += -DGGML_USE_RPC
+ OBJ_GGML_EXT += ggml/src/ggml-rpc.o
+endif # GGML_RPC
+
+OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
+OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+ OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu))
+else
+ OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+ OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+ OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ifdef GGML_CUDA
+ ifneq ('', '$(wildcard /opt/cuda)')
+ CUDA_PATH ?= /opt/cuda
+ else
+ CUDA_PATH ?= /usr/local/cuda
+ endif
+
+ MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+ MK_NVCCFLAGS += -use_fast_math
+
+ OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+ OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+ OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ifdef LLAMA_FATAL_WARNINGS
+ MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
+
+ifndef JETSON_EOL_MODULE_DETECT
+ MK_NVCCFLAGS += --forward-unknown-to-host-compiler
+endif # JETSON_EOL_MODULE_DETECT
+
+ifdef LLAMA_DEBUG
+ MK_NVCCFLAGS += -lineinfo
+endif # LLAMA_DEBUG
+
+ifdef GGML_CUDA_DEBUG
+ MK_NVCCFLAGS += --device-debug
+endif # GGML_CUDA_DEBUG
+
+ifdef GGML_CUDA_NVCC
+ NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
+else
+ NVCC = $(CCACHE) nvcc
+endif # GGML_CUDA_NVCC
+
+ifdef CUDA_DOCKER_ARCH
+ MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifndef CUDA_POWER_ARCH
+ MK_NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
+
+ifdef GGML_CUDA_FORCE_MMQ
+ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
+ MK_NVCCFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+ MK_NVCCFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+ MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_CCBIN
+ MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
+endif # GGML_CUDA_CCBIN
+
+ifdef GGML_CUDA_NO_FA
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+ MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ifdef JETSON_EOL_MODULE_DETECT
+define NVCC_COMPILE
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
+else
+define NVCC_COMPILE
+ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
+endif # JETSON_EOL_MODULE_DETECT
+
+ggml/src/ggml-cuda/%.o: \
+ ggml/src/ggml-cuda/%.cu \
+ ggml/include/ggml.h \
+ ggml/src/ggml-common.h \
+ ggml/src/ggml-cuda/common.cuh
+ $(NVCC_COMPILE)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ ggml/src/ggml-cuda/ggml-cuda.cu \
+ ggml/include/ggml-cuda.h \
+ ggml/include/ggml.h \
+ ggml/include/ggml-backend.h \
+ ggml/src/ggml-backend-impl.h \
+ ggml/src/ggml-common.h \
+ $(wildcard ggml/src/ggml-cuda/*.cuh)
+ $(NVCC_COMPILE)
+endif # GGML_CUDA
+
+ifdef GGML_VULKAN
+ MK_CPPFLAGS += -DGGML_USE_VULKAN
+ MK_LDFLAGS += $(shell pkg-config --libs vulkan)
+ OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+
+ifdef GGML_VULKAN_CHECK_RESULTS
+ MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ifdef GGML_VULKAN_DEBUG
+ MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
+endif
+
+ifdef GGML_VULKAN_MEMORY_DEBUG
+ MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
+ifdef GGML_VULKAN_PERF
+ MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
+ifdef GGML_VULKAN_VALIDATE
+ MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
+endif
+
+ifdef GGML_VULKAN_RUN_TESTS
+ MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
+endif
+
+GLSLC_CMD = glslc
+_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
+_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
+_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
+_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
+
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+ $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
+
+$(_ggml_vk_header): $(_ggml_vk_source)
+
+$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
+ $(_ggml_vk_genshaders_cmd) \
+ --glslc $(GLSLC_CMD) \
+ --input-dir $(_ggml_vk_input_dir) \
+ --target-hpp $(_ggml_vk_header) \
+ --target-cpp $(_ggml_vk_source)
+
+vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+ $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+
+endif # GGML_VULKAN
+
+ifdef GGML_HIP
+ ifeq ($(wildcard /opt/rocm),)
+ ROCM_PATH ?= /usr
+ AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+ else
+ ROCM_PATH ?= /opt/rocm
+ AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+ endif
+
+ MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
+
+ifdef GGML_HIP_UMA
+ MK_CPPFLAGS += -DGGML_HIP_UMA
+endif # GGML_HIP_UMA
+
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
+ MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+
+ HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
+
+ HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
+
+ifdef GGML_CUDA_FORCE_MMQ
+ HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+ HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_NO_FA
+ HIPFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
+ OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+ OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+ OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ ggml/src/ggml-cuda/ggml-cuda.cu \
+ ggml/include/ggml-cuda.h \
+ ggml/include/ggml.h \
+ ggml/include/ggml-backend.h \
+ ggml/src/ggml-backend-impl.h \
+ ggml/src/ggml-common.h \
+ $(wildcard ggml/src/ggml-cuda/*.cuh)
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+ ggml/src/ggml-cuda/%.cu \
+ ggml/include/ggml.h \
+ ggml/src/ggml-common.h \
+ ggml/src/ggml-cuda/common.cuh
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # GGML_HIP
+
+ifdef GGML_MUSA
+ ifeq ($(wildcard /opt/musa),)
+ MUSA_PATH ?= /usr/local/musa
+ else
+ MUSA_PATH ?= /opt/musa
+ endif
+ MUSA_ARCHITECTURES ?= 21;22;31
+
+ MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
+ MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
+ MK_LDFLAGS += -lmusa -lmusart -lmublas
+
+ ifndef GGML_NO_OPENMP
+ # For Ubuntu Focal
+ MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+ MK_LDFLAGS += -L/usr/lib/llvm-10/lib
+ # For Ubuntu Jammy
+ MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
+ MK_LDFLAGS += -L/usr/lib/llvm-14/lib
+ endif # GGML_NO_OPENMP
+
+ CC := $(MUSA_PATH)/bin/clang
+ CXX := $(MUSA_PATH)/bin/clang++
+ MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
+
+ MUSAFLAGS = -fsigned-char -x musa -mtgpu
+ MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
+
+ifdef GGML_CUDA_FORCE_MMQ
+ MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+ MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
+ MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+ MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+ MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+ MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_NO_FA
+ MUSAFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+ MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+ OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+ OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ ggml/src/ggml-cuda/ggml-cuda.cu \
+ ggml/include/ggml-cuda.h \
+ ggml/include/ggml.h \
+ ggml/include/ggml-backend.h \
+ ggml/src/ggml-backend-impl.h \
+ ggml/src/ggml-common.h \
+ $(wildcard ggml/src/ggml-cuda/*.cuh)
+ $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+ ggml/src/ggml-cuda/%.cu \
+ ggml/include/ggml.h \
+ ggml/src/ggml-common.h \
+ ggml/src/ggml-cuda/common.cuh
+ $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+endif # GGML_MUSA
+
+ifdef GGML_METAL
+ MK_CPPFLAGS += -DGGML_USE_METAL
+ MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
+ OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+ MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
+ifdef GGML_METAL_NDEBUG
+ MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
+ifdef GGML_METAL_EMBED_LIBRARY
+ MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+ OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
+endif
+endif # GGML_METAL
+
+ifdef GGML_METAL
+ggml/src/ggml-metal/ggml-metal.o: \
+ ggml/src/ggml-metal/ggml-metal.m \
+ ggml/src/ggml-metal/ggml-metal-impl.h \
+ ggml/include/ggml-metal.h \
+ ggml/include/ggml.h
+ $(CC) $(CFLAGS) -c $< -o $@
+
+ifdef GGML_METAL_EMBED_LIBRARY
+ggml/src/ggml-metal-embed.o: \
+ ggml/src/ggml-metal/ggml-metal.metal \
+ ggml/src/ggml-metal/ggml-metal-impl.h \
+ ggml/src/ggml-common.h
+ @echo "Embedding Metal library"
+ @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
+ @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
+ $(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+ @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+ $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+ @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+ @rmdir ${TEMP_ASSEMBLY}
+endif
+endif # GGML_METAL
+
+DIR_GGML = ggml
+DIR_LLAMA = src
+DIR_COMMON = common
+
+OBJ_GGML = \
+ $(DIR_GGML)/src/ggml.o \
+ $(DIR_GGML)/src/ggml-alloc.o \
+ $(DIR_GGML)/src/ggml-backend.o \
+ $(DIR_GGML)/src/ggml-backend-reg.o \
+ $(DIR_GGML)/src/ggml-opt.o \
+ $(DIR_GGML)/src/ggml-quants.o \
+ $(DIR_GGML)/src/ggml-threading.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
+ $(OBJ_GGML_EXT)
+
+OBJ_LLAMA = \
+ $(DIR_LLAMA)/llama.o \
+ $(DIR_LLAMA)/llama-vocab.o \
+ $(DIR_LLAMA)/llama-grammar.o \
+ $(DIR_LLAMA)/llama-sampling.o \
+ $(DIR_LLAMA)/unicode.o \
+ $(DIR_LLAMA)/unicode-data.o
+
+OBJ_COMMON = \
+ $(DIR_COMMON)/common.o \
+ $(DIR_COMMON)/arg.o \
+ $(DIR_COMMON)/log.o \
+ $(DIR_COMMON)/console.o \
+ $(DIR_COMMON)/ngram-cache.o \
+ $(DIR_COMMON)/sampling.o \
+ $(DIR_COMMON)/speculative.o \
+ $(DIR_COMMON)/chat.o \
+ $(DIR_COMMON)/build-info.o \
+ $(DIR_COMMON)/json-schema-to-grammar.o
+
+OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
+
+LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
+LIB_GGML_S = $(LIB_PRE)ggml.a
+
+LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT)
+LIB_LLAMA_S = $(LIB_PRE)llama.a
+
+LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
+LIB_COMMON_S = $(LIB_PRE)common.a
+
+LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON)
+LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
+
+GF_CC := $(CC)
+include scripts/get-flags.mk
+
+# combine build flags with cmdline overrides
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
+override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
+override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
+
+# identify CUDA host compiler
+ifdef GGML_CUDA
+GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
+include scripts/get-flags.mk
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
+endif
+
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
+#
+# Print build information
+#
+
+$(info I llama.cpp build info: )
+$(info I UNAME_S: $(UNAME_S))
+$(info I UNAME_P: $(UNAME_P))
+$(info I UNAME_M: $(UNAME_M))
+$(info I CFLAGS: $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
+$(info I LDFLAGS: $(LDFLAGS))
+$(info I CC: $(shell $(CC) --version | head -n 1))
+$(info I CXX: $(shell $(CXX) --version | head -n 1))
+ifdef GGML_CUDA
+$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_CUDA
+$(info )
+
+ifdef DEPRECATE_WARNING
+$(info !!! DEPRECATION WARNING !!!)
+$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
+$(info - LLAMA_CUDA)
+$(info - LLAMA_METAL)
+$(info - LLAMA_METAL_EMBED_LIBRARY)
+$(info - LLAMA_OPENMP)
+$(info - LLAMA_RPC)
+$(info - LLAMA_SYCL)
+$(info - LLAMA_SYCL_F16)
+$(info - LLAMA_OPENBLAS)
+$(info - LLAMA_OPENBLAS64)
+$(info - LLAMA_BLIS)
+$(info - LLAMA_NO_LLAMAFILE)
+$(info - LLAMA_NO_ACCELERATE)
+$(info - LLAMA_NO_OPENMP)
+$(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
+$(info )
+endif
+
+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418))
+$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
+$(info )
+endif
+
+#
+# Build libraries
+#
+
+# Libraries
+LIB_GGML = libggml.so
+LIB_GGML_S = libggml.a
+
+LIB_LLAMA = libllama.so
+LIB_LLAMA_S = libllama.a
+
+LIB_COMMON = libcommon.so
+LIB_COMMON_S = libcommon.a
+
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
+
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
+
+# Default target
+all: $(BUILD_TARGETS)
+
+# force c++ build for source file that have same name as c file
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+ $(CC) $(CFLAGS) -MMD -c $< -o $@
+
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_GGML_S): $(OBJ_GGML)
+ ar rcs $(LIB_GGML_S) $^
+
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+ $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
+ ar rcs $(LIB_LLAMA_S) $^
+
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
+ $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_COMMON_S): $(OBJ_COMMON)
+ ar rcs $(LIB_COMMON_S) $^
+
+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean generated server assets
+clean-server-assets:
+ find examples/server -type f -name "*.js.hpp" -delete
+ find examples/server -type f -name "*.mjs.hpp" -delete
+ find examples/server -type f -name "*.css.hpp" -delete
+ find examples/server -type f -name "*.html.hpp" -delete
+
+# Clean rule
+clean: clean-server-assets
+ rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -rvf *.a *.dll *.so *.dot
+ find ggml src common tests examples pocs -type f -name "*.o" -delete
+ find ggml src common tests examples pocs -type f -name "*.d" -delete
+
+#
+# Examples
+#
+
+# $< is the first prerequisite, i.e. the source file.
+# Explicitly compile this to an object file so that it can be cached with ccache.
+# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead.
+
+# Helper function that replaces .c, .cpp, and .cu file endings with .o:
+GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
+
+llama-cli: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ @echo
+ @echo '==== Run ./llama-cli -h for help. ===='
+ @echo
+
+llama-infill: examples/infill/infill.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-run: examples/run/run.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-simple: examples/simple/simple.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-simple-chat: examples/simple-chat/simple-chat.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-tokenize: examples/tokenize/tokenize.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-batched: examples/batched/batched.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-quantize: examples/quantize/quantize.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-perplexity: examples/perplexity/perplexity.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-imatrix: examples/imatrix/imatrix.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-embedding: examples/embedding/embedding.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gritlm: examples/gritlm/gritlm.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-save-load-state: examples/save-load-state/save-load-state.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gguf: examples/gguf/gguf.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+examples/gguf-hash/deps/sha1/sha1.o: \
+ examples/gguf-hash/deps/sha1/sha1.c
+ $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/xxhash/xxhash.o: \
+ examples/gguf-hash/deps/xxhash/xxhash.c
+ $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/sha256/sha256.o: \
+ examples/gguf-hash/deps/sha256/sha256.c
+ $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-eval-callback: examples/eval-callback/eval-callback.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-export-lora: examples/export-lora/export-lora.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-retrieval: examples/retrieval/retrieval.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-speculative: examples/speculative/speculative.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-parallel: examples/parallel/parallel.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookahead: examples/lookahead/lookahead.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup: examples/lookup/lookup.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-create: examples/lookup/lookup-create.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-merge: examples/lookup/lookup-merge.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-stats: examples/lookup/lookup-stats.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-passkey: examples/passkey/passkey.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+ifdef GGML_RPC
+rpc-server: examples/rpc/rpc-server.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # GGML_RPC
+
+llama-server: \
+ examples/server/server.cpp \
+ examples/server/utils.hpp \
+ examples/server/httplib.h \
+ examples/server/index.html.hpp \
+ examples/server/loading.html.hpp \
+ common/chat.cpp \
+ common/chat.h \
+ common/chat-template.hpp \
+ common/json.hpp \
+ common/minja.hpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
+llama-gen-docs: examples/gen-docs/gen-docs.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+libllava.a: examples/llava/llava.cpp \
+ examples/llava/llava.h \
+ examples/llava/clip.cpp \
+ examples/llava/clip.h \
+ common/stb_image.h \
+ common/base64.hpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
+
+llama-llava-cli: examples/llava/llava-cli.cpp \
+ examples/llava/llava.cpp \
+ examples/llava/llava.h \
+ examples/llava/clip.cpp \
+ examples/llava/clip.h \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+ examples/llava/llava.cpp \
+ examples/llava/llava.h \
+ examples/llava/clip.cpp \
+ examples/llava/clip.h \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+ examples/llava/llava.cpp \
+ examples/llava/llava.h \
+ examples/llava/clip.cpp \
+ examples/llava/clip.h \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+ifeq ($(UNAME_S),Darwin)
+swift: examples/batched.swift
+ (cd examples/batched.swift; make build)
+endif
+
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
+ @sh scripts/build-info.sh "$(CC)" > $@.tmp
+ @if ! cmp -s $@.tmp $@; then \
+ mv $@.tmp $@; \
+ else \
+ rm $@.tmp; \
+ fi
+
+common/build-info.o: common/build-info.cpp
+ $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
+#
+# Tests
+#
+
+tests: $(TEST_TARGETS)
+
+tests/test-arg-parser: tests/test-arg-parser.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-llama-grammar: tests/test-llama-grammar.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-log: tests/test-log.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-grammar-parser: tests/test-grammar-parser.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-grammar-integration: tests/test-grammar-integration.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-double-float: tests/test-double-float.cpp
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat: tests/test-chat.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-quantize-fns: tests/test-quantize-fns.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-quantize-perf: tests/test-quantize-perf.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-sampling: tests/test-sampling.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-c.o: tests/test-c.c include/llama.h
+ $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+
+tests/test-backend-ops: tests/test-backend-ops.cpp \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# PoCs
+#
+
+llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
+ $(OBJ_GGML)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: FORCE main quantize perplexity embedding server
+
+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+# Eventually we will want to remove these targets from building all the time.
+main: examples/deprecation-warning/deprecation-warning.o
+ $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+ @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.o
+ $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+ @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard quantize))
+ $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+ @echo "#########"
+ @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+ @echo " Remove the 'quantize' binary to remove this warning."
+ @echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard perplexity))
+ $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+ @echo "#########"
+ @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+ @echo " Remove the 'perplexity' binary to remove this warning."
+ @echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard embedding))
+ $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+ @echo "#########"
+ @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+ @echo " Remove the 'embedding' binary to remove this warning."
+ @echo "#########"
+endif
diff --git a/README-llama.cpp.md b/README-llama.cpp.md
new file mode 100644
index 0000000000000..1eec944f273a8
--- /dev/null
+++ b/README-llama.cpp.md
@@ -0,0 +1,545 @@
+# llama.cpp
+
+
+
+[License: MIT](https://opensource.org/licenses/MIT)
+[Server build status](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
+
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+
+Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+
+> [!IMPORTANT]
+> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
+>
+> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
+>
+> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
+
+## Recent API changes
+
+- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
+
+## Hot topics
+
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
+- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
+- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
+- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
+- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+
+----
+
+## Description
+
+The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
+range of hardware - locally and in the cloud.
+
+- Plain C/C++ implementation without any dependencies
+- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
+- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Vulkan and SYCL backend support
+- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
+
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
+
+
+Models
+
+Typically finetunes of the base models below are supported as well.
+
+Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
+
+#### Text-only
+
+- [X] LLaMA π¦
+- [x] LLaMA 2 π¦π¦
+- [x] LLaMA 3 π¦π¦π¦
+- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
+- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
+- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
+- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
+- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
+- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
+- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
+- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
+- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
+- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
+- [X] [StableLM models](https://huggingface.co/stabilityai)
+- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
+- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
+- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
+- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
+- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
+- [x] [GPT-2](https://huggingface.co/gpt2)
+- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
+- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
+- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
+- [x] [Gemma](https://ai.google.dev/gemma)
+- [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
+- [x] [Xverse](https://huggingface.co/models?search=xverse)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
+- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
+- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMo 2](https://allenai.org/olmo)
+- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
+- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [Smaug](https://huggingface.co/models?search=Smaug)
+- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
+- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
+- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
+- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
+- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
+- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
+- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
+- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
+
+#### Multimodal
+
+- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
+- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
+- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
+- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
+- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
+- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
+
+
+
+
+Bindings
+
+- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
+- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
+- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
+- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
+- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
+- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
+- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
+- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
+- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
+- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
+- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
+- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
+- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
+- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
+
+
+
+
+UIs
+
+*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
+
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
+- [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
+- [LARS](https://github.com/abgulati/LARS) (AGPL)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
+- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
+- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
+- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [semperai/amica](https://github.com/semperai/amica) (MIT)
+- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+- [Autopen](https://github.com/blackhole89/autopen) (GPL)
+
+
+
+
+Tools
+
+- [akx/ggify](https://github.com/akx/ggify) β download PyTorch models from HuggingFace Hub and convert them to GGML
+- [akx/ollama-dl](https://github.com/akx/ollama-dl) β download models from the Ollama library to be used directly with llama.cpp
+- [crashr/gppm](https://github.com/crashr/gppm) β launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+
+
+
+
+Infrastructure
+
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
+- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
+- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
+- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
+- [llmaz](https://github.com/InftyAI/llmaz) - βΈοΈ Easy, advanced inference platform for large language models on Kubernetes.
+
+
+
+Games
+
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
+
+
+## Supported backends
+
+| Backend | Target devices |
+| --- | --- |
+| [Metal](docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](docs/build.md#blas-build) | All |
+| [BLIS](docs/backend/BLIS.md) | All |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [CUDA](docs/build.md#cuda) | Nvidia GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
+| [Vulkan](docs/build.md#vulkan) | GPU |
+| [CANN](docs/build.md#cann) | Ascend NPU |
+| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+
+## Building the project
+
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
+
+- Clone this repository and build locally, see [how to build](docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
+- Use a Docker image, see [documentation for Docker](docs/docker.md)
+- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
+
+## Obtaining and quantizing models
+
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
+
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
+
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
+
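+For example (the repository name below is only an illustration; substitute any GGUF repository):
+
+```bash
+# download (and cache) a GGUF directly from Hugging Face, then start chatting
+llama-cli -hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M
+```
+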
+After downloading a model, use the CLI tools to run it locally - see below.
+
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+
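+For example, a minimal sketch of converting a local Hugging Face model directory to GGUF (paths are placeholders):
+
+```bash
+# produce an F16 GGUF from a downloaded Hugging Face model directory
+python3 convert_hf_to_gguf.py /path/to/hf-model --outfile model-f16.gguf --outtype f16
+```
+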
+The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+
+- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
+
+To learn more about model quantization, [read this documentation](examples/quantize/README.md)
+
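+For example, a typical invocation of the bundled quantization tool (Q4_K_M is just one of the many types listed there):
+
+```bash
+# quantize an F16 GGUF down to Q4_K_M
+./llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
+```
+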
+## [`llama-cli`](examples/main)
+
+#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
+
+-
+ Run in conversation mode
+
+ Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
+
+ ```bash
+ llama-cli -m model.gguf
+
+ # > hi, who are you?
+ # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+ #
+ # > what is 1+1?
+ # Easy peasy! The answer to 1+1 is... 2!
+ ```
+
+
+
+-
+ Run in conversation mode with custom chat template
+
+ ```bash
+ # use the "chatml" template (use -h to see the list of supported templates)
+ llama-cli -m model.gguf -cnv --chat-template chatml
+
+ # use a custom template
+ llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+ ```
+
+
+
+-
+ Run simple text completion
+
+ To disable conversation mode explicitly, use `-no-cnv`
+
+ ```bash
+ llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+
+ # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga β it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+ ```
+
+
+
+-
+ Constrain the output with a custom grammar
+
+ ```bash
+ llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+
+ # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
+ ```
+
+ The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
+
+ For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
+
+
+
+
+## [`llama-server`](examples/server)
+
+#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
+
+-
+ Start a local HTTP server with default configuration on port 8080
+
+ ```bash
+ llama-server -m model.gguf --port 8080
+
+ # Basic web UI can be accessed via browser: http://localhost:8080
+ # Chat completion endpoint: http://localhost:8080/v1/chat/completions
+ ```
+
+
+
+-
+ Support multiple-users and parallel decoding
+
+ ```bash
+ # up to 4 concurrent requests, each with 4096 max context
+ llama-server -m model.gguf -c 16384 -np 4
+ ```
+
+
+
+-
+ Enable speculative decoding
+
+ ```bash
+ # the draft.gguf model should be a small variant of the target model.gguf
+ llama-server -m model.gguf -md draft.gguf
+ ```
+
+
+
+-
+ Serve an embedding model
+
+ ```bash
+ # use the /embedding endpoint
+ llama-server -m model.gguf --embedding --pooling cls -ub 8192
+ ```
+
+
+
+-
+ Serve a reranking model
+
+ ```bash
+ # use the /reranking endpoint
+ llama-server -m model.gguf --reranking
+ ```
+
+
+
+-
+ Constrain all outputs with a grammar
+
+ ```bash
+ # custom grammar
+ llama-server -m model.gguf --grammar-file grammar.gbnf
+
+ # JSON
+ llama-server -m model.gguf --grammar-file grammars/json.gbnf
+ ```
+
+
+
+
+## [`llama-perplexity`](examples/perplexity)
+
+#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
+
+-
+ Measure the perplexity over a text file
+
+ ```bash
+ llama-perplexity -m model.gguf -f file.txt
+
+ # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
+ # Final estimate: PPL = 5.4007 +/- 0.67339
+ ```
+
+
+
+-
+ Measure KL divergence
+
+ ```bash
+ # TODO
+ ```
+
+
+
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
+[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
+
+## [`llama-bench`](examples/llama-bench)
+
+#### Benchmark the performance of the inference for various parameters.
+
+-
+ Run default benchmark
+
+ ```bash
+ llama-bench -m model.gguf
+
+ # Output:
+ # | model | size | params | backend | threads | test | t/s |
+ # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
+ # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 Β± 20.55 |
+ # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 Β± 0.81 |
+ #
+ # build: 3e0ba0e60 (4229)
+ ```
+
+
+
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+-
+ Run a model with a specific prompt (by default it's pulled from Ollama registry)
+
+ ```bash
+ llama-run granite-code
+ ```
+
+
+
+[^3]: [RamaLama](https://github.com/containers/ramalama)
+
+## [`llama-simple`](examples/simple)
+
+#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
+
+-
+ Basic text completion
+
+ ```bash
+ llama-simple -m model.gguf
+
+ # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
+ ```
+
+
+
+
+## Contributing
+
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
+- Collaborators will be invited based on contributions
+- Any help with managing issues, PRs and projects is very appreciated!
+- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
+- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
+- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
+
+## Other documentation
+
+- [main (cli)](examples/main/README.md)
+- [server](examples/server/README.md)
+- [GBNF grammars](grammars/README.md)
+
+#### Development documentation
+
+- [How to build](docs/build.md)
+- [Running on Docker](docs/docker.md)
+- [Build on Android](docs/android.md)
+- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
+
+#### Seminal papers and background on the models
+
+If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
+- LLaMA:
+ - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
+ - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+- GPT-3
+ - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
+- GPT-3.5 / InstructGPT / ChatGPT:
+ - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
+ - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+## Completions
+Command-line completion is available for some environments.
+
+#### Bash Completion
+```bash
+$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ source ~/.llama-completion.bash
+```
+Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
+automatically. For example:
+```console
+$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
+```
+
+## References
diff --git a/README.md b/README.md
index 1eec944f273a8..65c081cf3153e 100644
--- a/README.md
+++ b/README.md
@@ -1,545 +1,73 @@
-# llama.cpp
+## llama-server-one
+Based on [llama.cpp](https://github.com/ggml-org/llama.cpp).
-
+Brad Hutchings
+brad@bradhutchings.com
-[](https://opensource.org/licenses/MIT)
-[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
+
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+---
+### Project Goals
-Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+The goal of this project is to build a single `llama-server-one` executable file that can run "anywhere":
+- x86_64 Windows
+- x86_64 Linux
+- ARM Windows
+- ARM Linux
+- ARM MacOS
-> [!IMPORTANT]
-> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
->
-> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
->
-> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
+I am inspired by the [llamafile project](https://github.com/Mozilla-Ocho/llamafile). The main drawback of that project is that it has not kept up to date with llama.cpp and therefore does not always support the latest models as soon as llama.cpp does. Support for new models in llamafile takes work and time.
-## Recent API changes
+I want to use the MIT license as used by llama.cpp.
-- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
-- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
+GPU support is not important to me and can be handled by platform-specific builds of llama.cpp. CPU inference is quite adequate for many private end-user applications.
-## Hot topics
+The ability to package support files, such as a custom web UI, into the executable file is important to me. This is implemented.
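+
+As a rough sketch of the idea only (the file names and internal path here are assumptions; see [Packaging-ls1.md](docs/Packaging-ls1.md) for the actual procedure), an APE executable doubles as a zip archive, so support files can be appended with a stock `zip` tool:
+
+```bash
+# hypothetical example: append a custom web UI to the packaged executable
+cp llama-server llama-server-one.exe   # the ".exe" name here is just an assumption
+zip -r llama-server-one.exe website/   # append ./website/** into the executable
+unzip -l llama-server-one.exe          # verify the embedded files
+```
+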
-- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
-- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
-- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
-- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
-- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
-- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
-- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+The ability to package default arguments, in an "args" file, into the executable file is important to me. This is implemented.
-----
+The ability to read arguments from a file adjacent to the executable file is important to me. This is implemented.
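+
+For illustration only (the filename and format below are assumptions; see [Deploying-ls1.md](docs/Deploying-ls1.md) for the actual details), an adjacent "args" file is just a plain-text list of default arguments placed next to the executable:
+
+```bash
+# hypothetical example of creating an adjacent args file
+cat > llama-server-one.args << 'EOF'
+-m model.gguf
+--port 8080
+EOF
+```
+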
-## Description
+The ability to package a gguf model into the executable file is important to me. This is not implemented yet.
-The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-range of hardware - locally and in the cloud.
+I welcome any of my changes being implemented in the official llama.cpp.
-- Plain C/C++ implementation without any dependencies
-- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2, AVX512 and AMX support for x86 architectures
-- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
-- Vulkan and SYCL backend support
-- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
+---
+### Documentation
+- My start-to-finish guide for building `llama-server` with Cosmo is in the [Building-ls1.md](docs/Building-ls1.md) file.
+- My guide for packaging a `llama-server-one` executable is in the [Packaging-ls1.md](docs/Packaging-ls1.md) file.
+- My guide for deploying a `llama-server-one` executable is in the [Deploying-ls1.md](docs/Deploying-ls1.md) file.
-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
+---
+### Modifications to llama.cpp
-
-Models
+To get this from the llama.cpp source base, there are a few files that need to be modified:
-Typically finetunes of the base models below are supported as well.
+1. [Makefile](Makefile) -- Extensive modifications to bring it up to date, as it is deprecated in favor of the CMake build system, and to support COSMOCC.
-Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
+2. [src/llama-context.cpp](src/llama-context.cpp) -- COSMOCC doesn't have `std::fill` in its Standard Template Library.
-#### Text-only
+3. [examples/server/server.cpp](examples/server/server.cpp) -- Support an embedded or adjacent "args" file, fix a Cosmo name conflict with the "defer" task member, and add additional metadata to `model_meta`.
-- [X] LLaMA π¦
-- [x] LLaMA 2 π¦π¦
-- [x] LLaMA 3 π¦π¦π¦
-- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
-- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
-- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
-- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
-- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
-- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
-- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
-- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
-- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
-- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
-- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
-- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
-- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
-- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
-- [X] [StableLM models](https://huggingface.co/stabilityai)
-- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
-- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
-- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
-- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
-- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
-- [x] [GPT-2](https://huggingface.co/gpt2)
-- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
-- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
-- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
-- [x] [Gemma](https://ai.google.dev/gemma)
-- [x] [Mamba](https://github.com/state-spaces/mamba)
-- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
-- [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
-- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
-- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
-- [x] [OLMo](https://allenai.org/olmo)
-- [x] [OLMo 2](https://allenai.org/olmo)
-- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
-- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
-- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
-- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
-- [x] [Smaug](https://huggingface.co/models?search=Smaug)
-- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
-- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
-- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
-- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
-- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
-- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
-- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
-- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
-- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
-- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
-- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
-- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
-- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
+---
+### Reference
-#### Multimodal
+Here are some projects and pages you should be familiar with if you want to get the most out of `llama-server-one`:
+- [llama.cpp](https://github.com/ggml-org/llama.cpp) - Georgi Gerganov and his team are the rock stars who are building the plumbing that makes LLMs available to developers of all kinds. The `llama.cpp` project is the industry standard for inference. I only fork it here because I want to make it a little better for my applications while preserving all its goodness.
+- [llamafile](https://github.com/Mozilla-Ocho/llamafile) - `Llamafile` lets you distribute and run LLMs with a single file. It is a Mozilla Foundation project that brought the Cosmopolitan C Library and llama.cpp together. It has support for some popular GPUs. It is based on an older version of llama.cpp and does not support all of the latest models that llama.cpp supports. Llamafile is an inspiration for this project.
+- [Cosmopolitan Libc](https://github.com/jart/cosmopolitan) - `Cosmopolitan` is a project for building cross-platform binaries that run on x86_64 and ARM architectures, supporting Linux, Windows, macOS, and other operating systems. Like `llamafile`, I use Cosmo to compile cross-platform executables of `llama.cpp` targets, including `llama-server`.
+- [Actually Portable Executable (APE) Specification](https://github.com/jart/cosmopolitan/blob/master/ape/specification.md) - The Cosmopolitan Libc repo includes documentation on how the cross-CPU, cross-platform executable format works.
+- [Brad's LLMs](https://huggingface.co/bradhutchings/Brads-LLMs) - I share private local LLMs built with `llamafile` in a Hugging Face repo.
-- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
-- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
-- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
-- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
-- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
-- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
-- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
-- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
-- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
-- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
-- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
+---
+### To Do List
-
-
-
-Bindings
-
-- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
-- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
-- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
-- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
-- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
-- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
-- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
-- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
-- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
-- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
-- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
-- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
-- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
-- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
-- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
-- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
-- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
-- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
-- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
-- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
-- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
-- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
-- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
-- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
-- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
-- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
-
-
-
-
-UIs
-
-*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
-
-- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [Dot](https://github.com/alexpinel/Dot) (GPL)
-- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
-- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
-- [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
-- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
-- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
-- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
-- [LARS](https://github.com/abgulati/LARS) (AGPL)
-- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
-- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
-- [LMStudio](https://lmstudio.ai/) (proprietary)
-- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
-- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [MindMac](https://mindmac.app) (proprietary)
-- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
-- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
-- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
-- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [ramalama](https://github.com/containers/ramalama) (MIT)
-- [semperai/amica](https://github.com/semperai/amica) (MIT)
-- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
-- [Autopen](https://github.com/blackhole89/autopen) (GPL)
-
-
-
-
-Tools
-
-- [akx/ggify](https://github.com/akx/ggify) β download PyTorch models from HuggingFace Hub and convert them to GGML
-- [akx/ollama-dl](https://github.com/akx/ollama-dl) β download models from the Ollama library to be used directly with llama.cpp
-- [crashr/gppm](https://github.com/crashr/gppm) β launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
-- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
-- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
-
-
-
-
-Infrastructure
-
-- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
-- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
-- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
-- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
-- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
-- [llmaz](https://github.com/InftyAI/llmaz) - βΈοΈ Easy, advanced inference platform for large language models on Kubernetes.
-
-
-
-Games
-
-- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
-
-
-
-## Supported backends
-
-| Backend | Target devices |
-| --- | --- |
-| [Metal](docs/build.md#metal-build) | Apple Silicon |
-| [BLAS](docs/build.md#blas-build) | All |
-| [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
-| [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [HIP](docs/build.md#hip) | AMD GPU |
-| [Vulkan](docs/build.md#vulkan) | GPU |
-| [CANN](docs/build.md#cann) | Ascend NPU |
-| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
-
-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
-## Obtaining and quantizing models
-
-The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
-
-- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
-- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
-
-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf /[:quant]`
-
-After downloading a model, use the CLI tools to run it locally - see below.
-
-`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
-
-The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
-
-- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
-- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
-- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
-- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
-
-To learn more about model quantization, [read this documentation](examples/quantize/README.md)
-
-## [`llama-cli`](examples/main)
-
-#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
-
--
- Run in conversation mode
-
- Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
-
- ```bash
- llama-cli -m model.gguf
-
- # > hi, who are you?
- # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
- #
- # > what is 1+1?
- # Easy peasy! The answer to 1+1 is... 2!
- ```
-
-
-
--
- Run in conversation mode with custom chat template
-
- ```bash
- # use the "chatml" template (use -h to see the list of supported templates)
- llama-cli -m model.gguf -cnv --chat-template chatml
-
- # use a custom template
- llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
- ```
-
-
-
--
- Run simple text completion
-
- To disable conversation mode explicitly, use `-no-cnv`
-
- ```bash
- llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
- # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga β it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
- ```
-
-
-
--
- Constrain the output with a custom grammar
-
- ```bash
- llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-
- # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
- ```
-
- The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
-
- For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
-
-
-
-
-## [`llama-server`](examples/server)
-
-#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
-
--
- Start a local HTTP server with default configuration on port 8080
-
- ```bash
- llama-server -m model.gguf --port 8080
-
- # Basic web UI can be accessed via browser: http://localhost:8080
- # Chat completion endpoint: http://localhost:8080/v1/chat/completions
- ```
-
-
-
--
- Support multiple-users and parallel decoding
-
- ```bash
- # up to 4 concurrent requests, each with 4096 max context
- llama-server -m model.gguf -c 16384 -np 4
- ```
-
-
-
--
- Enable speculative decoding
-
- ```bash
- # the draft.gguf model should be a small variant of the target model.gguf
- llama-server -m model.gguf -md draft.gguf
- ```
-
-
-
--
- Serve an embedding model
-
- ```bash
- # use the /embedding endpoint
- llama-server -m model.gguf --embedding --pooling cls -ub 8192
- ```
-
-
-
--
- Serve a reranking model
-
- ```bash
- # use the /reranking endpoint
- llama-server -m model.gguf --reranking
- ```
-
-
-
--
- Constrain all outputs with a grammar
-
- ```bash
- # custom grammar
- llama-server -m model.gguf --grammar-file grammar.gbnf
-
- # JSON
- llama-server -m model.gguf --grammar-file grammars/json.gbnf
- ```
-
-
-
-
-## [`llama-perplexity`](examples/perplexity)
-
-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
-
--
- Measure the perplexity over a text file
-
- ```bash
- llama-perplexity -m model.gguf -f file.txt
-
- # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
- # Final estimate: PPL = 5.4007 +/- 0.67339
- ```
-
-
-
--
- Measure KL divergence
-
- ```bash
- # TODO
- ```
-
-
-
-[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
-
-## [`llama-bench`](examples/llama-bench)
-
-#### Benchmark the performance of the inference for various parameters.
-
--
- Run default benchmark
-
- ```bash
- llama-bench -m model.gguf
-
- # Output:
- # | model | size | params | backend | threads | test | t/s |
- # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
- # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 Β± 20.55 |
- # | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 Β± 0.81 |
- #
- # build: 3e0ba0e60 (4229)
- ```
-
-
-
-## [`llama-run`](examples/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
--
- Run a model with a specific prompt (by default it's pulled from Ollama registry)
-
- ```bash
- llama-run granite-code
- ```
-
-
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
-## [`llama-simple`](examples/simple)
-
-#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
-
--
- Basic text completion
-
- ```bash
- llama-simple -m model.gguf
-
- # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
- ```
-
-
-
-
-## Contributing
-
-- Contributors can open PRs
-- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
-- Collaborators will be invited based on contributions
-- Any help with managing issues, PRs and projects is very appreciated!
-- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
-- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
-- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
-- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
-
-## Other documentation
-
-- [main (cli)](examples/main/README.md)
-- [server](examples/server/README.md)
-- [GBNF grammars](grammars/README.md)
-
-#### Development documentation
-
-- [How to build](docs/build.md)
-- [Running on Docker](docs/docker.md)
-- [Build on Android](docs/android.md)
-- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
-- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
-
-#### Seminal papers and background on the models
-
-If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
-- LLaMA:
- - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
-- GPT-3
- - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
-- GPT-3.5 / InstructGPT / ChatGPT:
- - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-
-## Completions
-Command-line completion is available for some environments.
-
-#### Bash Completion
-```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
-$ source ~/.llama-completion.bash
-```
-Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
-automatically. For example:
-```console
-$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
-```
-
-## References
+In no particular order of importance, these are the things that bother me:
+- Package the gguf model file into the executable. The zip item needs to be aligned for mmap. There is a `zipalign.c` tool source in llamafile that seems loosely inspired by the Android zipalign tool. I feel like there should be a more generic solution for this problem.
+- GPU support without a complicated kludge, covering all supported platform / CPU / GPU triads. Perhaps a plugin system with shared library dispatch? Invoking dev tools on Apple Metal the way llamafile does is "complicated".
+- Code signing instructions. Might have to sign executables within the zip package, plus the package itself.
+- Clean up remaining build warnings, either by fixing the source (i.e. Cosmo) or by finding the magical compiler flags.
+- Copy the `cosmo_args` function into `server.cpp` so it could potentially be incorporated upstream for non-Cosmo builds. `common/arg2.cpp` might be a good landing spot. The license in the [Cosmo source code](https://github.com/jart/cosmopolitan/blob/master/tool/args/args2.c) appears to be MIT compatible with attribution.
+- The `--ctx-size` parameter doesn't seem quite right, given that newer models carry their training (or maximum) context size in their metadata. That size should be used, subject to a maximum passed as a parameter, so that e.g. a 128K model can still run comfortably on a smaller device. A minimal sketch of this clamping idea follows below.
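+
+The sketch below illustrates the `--ctx-size` clamping behavior I have in mind. It is a minimal, hypothetical sketch rather than code from this repo; the function name `effective_ctx_size`, its parameters, and the fallback behavior are assumptions about how such a feature could work.
+
+```cpp
+// Hypothetical sketch: treat --ctx-size as an upper bound and clamp it to the
+// model's training context size read from GGUF metadata.
+#include <algorithm>
+#include <cstdint>
+
+int32_t effective_ctx_size(int32_t requested_max, int32_t n_ctx_train) {
+    // No limit requested: fall back to the model's training context.
+    if (requested_max <= 0) {
+        return n_ctx_train;
+    }
+    // Otherwise use the smaller of the requested limit and the training context.
+    return std::min(requested_max, n_ctx_train);
+}
+
+// Example: a 128K-context model with --ctx-size 8192 would get
+// effective_ctx_size(8192, 131072) == 8192 on a small device.
+```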
diff --git a/docs/Building-ls1.md b/docs/Building-ls1.md
new file mode 100644
index 0000000000000..ba10227dac6af
--- /dev/null
+++ b/docs/Building-ls1.md
@@ -0,0 +1,88 @@
+## Building llama-server
+
+Brad Hutchings
+brad@bradhutchings.com
+
+This file contains instructions for building `llama.cpp` with `cosmocc` to yield a `llama-server` executable that will run on multiple platforms.
+
+---
+### Build Dependencies
+I build on a freshly installed Ubuntu 24.04 VM. Here are some packages that are helpful in creating a working build system. You may need to install more.
+```
+sudo apt install -y git python3-pip build-essential zlib1g-dev \
+ libffi-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev \
+ liblzma-dev tk-dev python3-tk cmake zip
+```
+
+### Clone this Repo Locally
+Clone this repo into a `~/llama.cpp` directory.
+```
+cd ~
+git clone https://github.com/BradHutchings/llama-server-one.git llama.cpp
+```
+
+**Optional:** Use the `work-in-progress` branch where I implement and test my own changes and where I test upstream changes from `llama.cpp`.
+```
+cd ~/llama.cpp
+git checkout work-in-progress
+```
+
+### Make llama.cpp
+We use the old `Makefile` rather than CMake. We've updated the `Makefile` in this repo to build llama.cpp correctly.
+```
+cd ~/llama.cpp
+export LLAMA_MAKEFILE=1
+make
+```
+
+If the build is successful, it will end with this message:
+
+ **NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead.**
+
+### Install Cosmo
+```
+mkdir -p cosmocc
+cd cosmocc
+wget https://cosmo.zip/pub/cosmocc/cosmocc.zip
+unzip cosmocc.zip
+rm cosmocc.zip
+cd ..
+```
+
+### Prepare to make llama.cpp with Cosmo
+```
+export PATH="$(pwd)/cosmocc/bin:$PATH"
+export CC="cosmocc -I$(pwd)/cosmocc/include -L$(pwd)/cosmocc/lib"
+export CXX="cosmocc -I$(pwd)/cosmocc/include \
+ -I$(pwd)/cosmocc/include/third_party/libcxx \
+ -L$(pwd)/cosmocc/lib"
+export UNAME_S="cosmocc"
+export UNAME_P="cosmocc"
+export UNAME_M="cosmocc"
+```
+
+### Make llama.cpp with Cosmo
+```
+make clean
+make
+```
+
+If the build is successful, it will end with this message:
+
+ **NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead.**
+
+At this point, you should see `llama-server` and other built binaries in the directory listing.
+```
+ls -al
+```
+
+`llama-server` is actually a zip archive with an "Actually Portable Executable" (APE) loader prefix. Let's verify the zip archive part:
+```
+unzip -l llama-server
+```
+
+---
+### Packaging llama-server-one
+
+Now that you've built `llama-server`, you're ready to package it as `llama-server-one`. Follow instructions in [Packaging-ls1.md](Packaging-ls1.md).
+
diff --git a/docs/Deploying-ls1.md b/docs/Deploying-ls1.md
new file mode 100644
index 0000000000000..be251b0ad24c0
--- /dev/null
+++ b/docs/Deploying-ls1.md
@@ -0,0 +1,95 @@
+## Deploying llama-server-one
+
+Brad Hutchings
+brad@bradhutchings.com
+
+This file contains instructions for deploying the `llama-server-one` executable. I'm using Ubuntu 24.04.
+
+---
+### Deployment Folder
+Assuming you packaged `llama-server-one` as described in [Packaging-ls1.md](Packaging-ls1.md), let's create a folder with everything you need to deploy. You can zip this folder to distribute your `llama-server-one`, model, and arguments file for use on any platform.
+
+Let's define some environment variables:
+```
+LLAMA_CPP_DIR="llama.cpp"
+LLAMA_SERVER_ONE_DIR="llama-server-one"
+DEPLOY_DIR="llama-server-one-deploy"
+DEPLOY_ZIP="llama-server-one-deploy.zip"
+
+LLAMA_SERVER="llama-server"
+LLAMA_SERVER_ONE="llama-server-one"
+LLAMA_SERVER_ONE_EXE="llama-server-one.exe"
+LLAMA_SERVER_ONE_ARGS="llama-server-one-args"
+```
+
+Create a folder and copy `llama-server-one` into the new folder.
+```
+# This should use variables for paths and filenames. So should the packaging instructions.
+cd ~
+rm -r -f $DEPLOY_DIR $DEPLOY_ZIP
+mkdir -p $DEPLOY_DIR
+cd $DEPLOY_DIR
+cp ~/$LLAMA_SERVER_ONE_DIR/$LLAMA_SERVER_ONE .
+```
+
+On Windows, this executable will need to be renamed to a `.exe` file. Since our executable is small, let's just make a copy of `llama-server-one` with the `.exe` extension.
+
+```
+cp $LLAMA_SERVER_ONE $LLAMA_SERVER_ONE_EXE
+```
+
+We have already downloaded a model in the [Packaging steps](Packaging-ls1.md). Let's copy that into our deploy directory. We'll use the model's original filename and make that work with the `llama-server-one-args` file (below).
+```
+MODEL_FILE="Google-Gemma-1B-Instruct-v3-q8_0.gguf"
+cp ~/$LLAMA_SERVER_ONE_DIR/model.gguf $MODEL_FILE
+```
+
+**Optional:** If you would rather download it again and save it under its original name, here are the commands:
+```
+MODEL_FILE="Google-Gemma-1B-Instruct-v3-q8_0.gguf"
+wget https://huggingface.co/bradhutchings/Brads-LLMs/resolve/main/models/$MODEL_FILE?download=true \
+ --show-progress --quiet -O $MODEL_FILE
+```
+
+Let's create a `llama-server-one-args` file. These parameters can override or augment the parameters you previously embedded in your `llama-server-one` archive. This file could be edited by the end user to configure `llama-server-one` without having to construct and type a long command line. Notice that we've overridden the `-m`, `--host`, and `--port` parameters.
+```
+cat << EOF > $LLAMA_SERVER_ONE_ARGS
+-m
+$MODEL_FILE
+--host
+0.0.0.0
+--port
+8888
+...
+EOF
+```
+
+Now we can test run `llama-server-one`, listening on all network interfaces, port 8888. Note that these are different from the default args you built into `llama-server-one`. You can connect to it from a web browser on another computer.
+```
+./$LLAMA_SERVER_ONE
+```
+
+Hit `ctrl-C` to stop it.
+
+Let's zip up the files into a `.zip` file you can share and move it to your home directory. The model won't compress much, so we're turning compression off with the `-0` parameter.
+
+```
+zip -0 $DEPLOY_ZIP *
+mv $DEPLOY_ZIP ~
+cd ~
+```
+
+Finally, let's review what you created in building, packaging, and deploying `llama-server-one`:
+```
+ls -aldh llama*
+```
+
+You should see three directories and a `.zip` file. The `llama-server-one-deploy.zip` file is ready to upload and share.
+
+---
+Congratulations! You did it. You built a `llama-server-one` executable that runs on two different CPU architectures and several popular operating systems. If you had any trouble in this process, please post a question in the [Discussions section](https://github.com/BradHutchings/llama-server-one/discussions). I'm happy to help!
+
+-Brad
+
+
+
diff --git a/docs/Packaging-ls1.md b/docs/Packaging-ls1.md
new file mode 100644
index 0000000000000..d2c619713c58b
--- /dev/null
+++ b/docs/Packaging-ls1.md
@@ -0,0 +1,130 @@
+## Packaging llama-server-one
+
+Brad Hutchings
+brad@bradhutchings.com
+
+This file contains instructions for packaging the `llama-server-one` executable to make it ready to deploy on multiple platforms.
+
+---
+### Package llama-server-one Executable
+
+Let's define some environment variables:
+```
+LLAMA_CPP_DIR="llama.cpp"
+LLAMA_SERVER_ONE_DIR="llama-server-one"
+
+LLAMA_SERVER="llama-server"
+LLAMA_SERVER_ONE="llama-server-one"
+LLAMA_SERVER_ONE_ZIP="llama-server-one.zip"
+DEFAULT_ARGS="default-args"
+```
+
+Next, let's create a directory where we'll package up `llama-server-one`:
+```
+cd ~
+rm -r -f ~/$LLAMA_SERVER_ONE_DIR
+mkdir -p $LLAMA_SERVER_ONE_DIR
+cp ~/$LLAMA_CPP_DIR/$LLAMA_SERVER \
+ ~/$LLAMA_SERVER_ONE_DIR/$LLAMA_SERVER_ONE_ZIP
+
+cd ~/$LLAMA_SERVER_ONE_DIR
+```
+
+Look at the contents of the `llama-server-one` zip archive:
+```
+unzip -l $LLAMA_SERVER_ONE_ZIP
+```
+
+You should notice a bunch of extraneous timezone-related files under `/usr/*`. Let's get rid of those:
+```
+zip -d $LLAMA_SERVER_ONE_ZIP "/usr/*"
+```
+
+Verify that these files are no longer in the archive:
+```
+unzip -l $LLAMA_SERVER_ONE_ZIP
+```
+
+**Optional:** `llama.cpp` has a built-in chat UI. If you'd like to provide a custom UI, you should add a `website` directory to the `llama-server-one` archive. The chat UI that ships inside the project's source is stored in an optimized form, but we can copy the unoptimized legacy source:
+```
+mkdir -p website
+cp -r ~/$LLAMA_CPP_DIR/examples/server/public_legacy/* website
+zip -0 -r $LLAMA_SERVER_ONE_ZIP website/*
+```
+
+**Optional:** Verify that the archive has your website:
+```
+unzip -l $LLAMA_SERVER_ONE_ZIP
+```
+
+A `default-args` file in the archive can specify sane default parameters. The format of the file is a parameter name on one line, its value on the next line, and so on. End the file with a `...` line to include user-specified parameters.
+
+We don't support including the model inside the zip archive yet. That approach has a 4GB size limitation on Windows anyway, as `.exe` files cannot exceed 4GB. So let's use an adjacent file called `model.gguf`.
+
+We will serve on localhost, port 8080 by default for safety. The `--ctx-size` parameter is the size of the context window. It's a bit screwy to have this as a fixed size rather than a maximum, since `.gguf` files now carry the training context size in their metadata. We set it to 8192 to be sensible.
+```
+cat << EOF > $DEFAULT_ARGS
+-m
+model.gguf
+--host
+127.0.0.1
+--port
+8080
+--ctx-size
+8192
+...
+EOF
+```
+
+**Optional:** If you added a website to the archive, use this instead:
+```
+cat << EOF > $DEFAULT_ARGS
+-m
+model.gguf
+--host
+127.0.0.1
+--port
+8080
+--ctx-size
+8192
+--path
+/zip/website
+...
+EOF
+```
+
+Add the `default-args` file to the archive:
+```
+zip -0 -r $LLAMA_SERVER_ONE_ZIP $DEFAULT_ARGS
+```
+
+Verify that the archive contains the `default-args` file:
+```
+unzip -l $LLAMA_SERVER_ONE_ZIP
+```
+
+Remove the `.zip` from our working file:
+```
+mv $LLAMA_SERVER_ONE_ZIP $LLAMA_SERVER_ONE
+```
+
+Let's download a small model. We'll use Google Gemma 1B Instruct v3, a surprisingly capable tiny model.
+```
+MODEL_FILE="Google-Gemma-1B-Instruct-v3-q8_0.gguf"
+wget https://huggingface.co/bradhutchings/Brads-LLMs/resolve/main/models/$MODEL_FILE?download=true \
+ --show-progress --quiet -O model.gguf
+```
+
+Now we can test run `llama-server-one`, listening on localhost:8080.
+```
+./$LLAMA_SERVER_ONE
+```
+
+Hit `ctrl-C` to stop it.
+
+If you'd like it to listen on all available interfaces, so you can connect from a browser on another computer:
+```
+./$LLAMA_SERVER_ONE --host 0.0.0.0
+```
+---
+Congratulations! You are ready to deploy your `llama-server-one` executable. Follow the instructions in [Deploying-ls1.md](Deploying-ls1.md).
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 18caa9127662d..ea48e23704614 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
#include
#include
+#ifdef COSMOCC
+#include
+#endif
+
using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
@@ -1561,7 +1565,7 @@ struct server_queue {
}
// Add a new task, but defer until one slot is available
- void defer(server_task task) {
+ void defer_task(server_task task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
@@ -2603,13 +2607,13 @@ struct server_context {
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer_task(task);
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer_task(task);
break;
}
@@ -2692,7 +2696,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer_task(task);
break;
}
@@ -2728,7 +2732,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer_task(task);
break;
}
@@ -2771,7 +2775,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer_task(task);
break;
}
@@ -3347,13 +3351,48 @@ struct server_context {
}
json model_meta() const {
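+        // Fixed-size buffers for selected GGUF "general.*" metadata values,
+        // pre-cleared so missing keys end up as empty strings in the JSON below.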
+ char general_architecture[64];
+ char general_type[64];
+ char general_name[64];
+ char general_version[64];
+ char general_finetune[64];
+ char general_basename[64];
+ char general_size_label[64];
+ char general_license[64];
+
+ general_architecture[0] = 0;
+ general_type[0] = 0;
+ general_name[0] = 0;
+ general_version[0] = 0;
+ general_finetune[0] = 0;
+ general_basename[0] = 0;
+ general_size_label[0] = 0;
+ general_license[0] = 0;
+
+ llama_model_meta_val_str(model, "general.architecture", general_architecture, 64);
+ llama_model_meta_val_str(model, "general.type", general_type, 64);
+ llama_model_meta_val_str(model, "general.name", general_name, 64);
+ llama_model_meta_val_str(model, "general.version", general_version, 64);
+ llama_model_meta_val_str(model, "general.finetune", general_finetune, 64);
+ llama_model_meta_val_str(model, "general.basename", general_basename, 64);
+ llama_model_meta_val_str(model, "general.size_label", general_size_label, 64);
+ llama_model_meta_val_str(model, "general.license", general_license, 64);
+
return json {
- {"vocab_type", llama_vocab_type (vocab)},
- {"n_vocab", llama_vocab_n_tokens (vocab)},
- {"n_ctx_train", llama_model_n_ctx_train(model)},
- {"n_embd", llama_model_n_embd (model)},
- {"n_params", llama_model_n_params (model)},
- {"size", llama_model_size (model)},
+ {"vocab_type", llama_vocab_type (vocab)},
+ {"n_vocab", llama_vocab_n_tokens (vocab)},
+ {"n_ctx_train", llama_n_ctx_train (model)},
+ {"n_embd", llama_n_embd (model)},
+ {"n_params", llama_model_n_params (model)},
+ {"size", llama_model_size (model)},
+ {"general.architecture", general_architecture },
+ {"general.type", general_type },
+ {"general.name", general_name },
+ {"general.version", general_version },
+ {"general.finetune", general_finetune },
+ {"general.basename", general_basename },
+ {"general.size_label", general_size_label },
+ {"general.license", general_license },
};
}
};
@@ -3387,6 +3426,35 @@ inline void signal_handler(int signal) {
}
int main(int argc, char ** argv) {
+ // This implements an args file feature inspired by llamafile's.
+ #ifdef COSMOCC
+    // Args files, if present. The names differ to avoid confusion during packaging.
+ const std::string& argsFilename = "llama-server-one-args";
+ const std::string& zipArgsFilename = "/zip/default-args";
+ struct stat buffer;
+
+ // At this point, argc, argv represent:
+ // command (User supplied args)
+
+ if (stat (argsFilename.c_str(), &buffer) == 0) {
+ argc = cosmo_args(argsFilename.c_str(), &argv);
+ }
+
+ // At this point, argc, argv represent:
+ // command (argsFilename args) (User supplied args)
+
+ if (stat (zipArgsFilename.c_str(), &buffer) == 0) {
+ argc = cosmo_args(zipArgsFilename.c_str(), &argv);
+ }
+
+ // At this point, argc, argv represent:
+ // command (zipArgsFilename args) (argsFilename args) (User supplied args)
+
+    // Yep, this is counterintuitive, but it is how the cosmo_args function works.
+ // argsFilename args override zipArgsFilename file args.
+ // User supplied args override argsFilename and zipArgsFilename args.
+ #endif
+
// own arguments required by this example
common_params params;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 5bec63e2e79ff..99df5810cc3a5 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1566,7 +1566,13 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
embd = has_embd ? output_base + logits_size : nullptr;
// set all ids as invalid (negative)
+ #ifndef COSMOCC
std::fill(output_ids.begin(), output_ids.end(), -1);
+ #else
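+    // Element-wise loop equivalent to std::fill; presumably works around a
+    // std::fill build issue under the cosmocc toolchain.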
+ for (auto iii = output_ids.begin(); iii != output_ids.end(); iii++) {
+ *iii = -1;
+ }
+ #endif
ggml_backend_buffer_clear(buf_output.get(), 0);
@@ -1606,7 +1612,13 @@ void llama_context::output_reorder() {
}
}
}
+ #ifndef COSMOCC
std::fill(output_ids.begin(), output_ids.end(), -1);
+ #else
+ for (auto iii = output_ids.begin(); iii != output_ids.end(); iii++) {
+ *iii = -1;
+ }
+ #endif
for (int32_t i = 0; i < n_outputs; ++i) {
output_ids[out_ids[i]] = i;
}