Skip to content

Commit 5679a3b

Browse files
committed
Merge branch 'master' into compilade/batch-splits
2 parents 952ed35 + 0d6fb52 commit 5679a3b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+967
-527
lines changed

.devops/llama-server.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
33
FROM ubuntu:$UBUNTU_VERSION AS build
44

55
RUN apt-get update && \
6-
apt-get install -y build-essential git libcurl4-openssl-dev curl
6+
apt-get install -y build-essential git libcurl4-openssl-dev
77

88
WORKDIR /app
99

@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
1616
FROM ubuntu:$UBUNTU_VERSION AS runtime
1717

1818
RUN apt-get update && \
19-
apt-get install -y libcurl4-openssl-dev libgomp1
19+
apt-get install -y libcurl4-openssl-dev libgomp1 curl
2020

2121
COPY --from=build /app/llama-server /llama-server
2222

.devops/nix/package.nix

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -126,16 +126,9 @@ let
126126
++ optionals useMetalKit [ MetalKit ];
127127

128128
cudaBuildInputs = with cudaPackages; [
129-
cuda_cccl.dev # <nv/target>
130-
131-
# A temporary hack for reducing the closure size, remove once cudaPackages
132-
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
133-
cuda_cudart.dev
134-
cuda_cudart.lib
135-
cuda_cudart.static
136-
libcublas.dev
137-
libcublas.lib
138-
libcublas.static
129+
cuda_cudart
130+
cuda_cccl # <nv/target>
131+
libcublas
139132
];
140133

141134
rocmBuildInputs = with rocmPackages; [

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,8 @@ jobs:
860860
mkdir build
861861
cd build
862862
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
863-
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
863+
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
864+
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
864865
865866
- name: Determine tag name
866867
id: tag

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ build*
5050
!docs/build.md
5151
/libllama.so
5252
/llama-*
53+
/vulkan-shaders-gen
5354
android-ndk-*
5455
arm_neon.h
5556
cmake-build-*

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
139139
# determining _precisely_ which defines are necessary for the llama-config
140140
# package.
141141
#
142-
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
142+
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
143+
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
143144
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
144145
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
145146
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

Makefile

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
16051605
# Mark legacy binary targets as .PHONY so that they are always checked.
16061606
.PHONY: main quantize perplexity embedding server
16071607

1608+
# Define the object file target
1609+
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
1610+
$(CXX) $(CXXFLAGS) -c $< -o $@
1611+
16081612
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
16091613
# Eventually we will want to remove these target from building all the time.
1610-
main: examples/deprecation-warning/deprecation-warning.cpp
1611-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1612-
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1614+
main: examples/deprecation-warning/deprecation-warning.o
1615+
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
16131616
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
16141617

1615-
server: examples/deprecation-warning/deprecation-warning.cpp
1616-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1617-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1618+
server: examples/deprecation-warning/deprecation-warning.o
1619+
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
16181620
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
16191621

1620-
quantize: examples/deprecation-warning/deprecation-warning.cpp
1622+
quantize: examples/deprecation-warning/deprecation-warning.o
16211623
ifneq (,$(wildcard quantize))
1622-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1623-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1624+
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
16241625
@echo "#########"
16251626
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
16261627
@echo " Remove the 'quantize' binary to remove this warning."
16271628
@echo "#########"
16281629
endif
16291630

1630-
perplexity: examples/deprecation-warning/deprecation-warning.cpp
1631+
perplexity: examples/deprecation-warning/deprecation-warning.o
16311632
ifneq (,$(wildcard perplexity))
1632-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1633-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1633+
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
16341634
@echo "#########"
16351635
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
16361636
@echo " Remove the 'perplexity' binary to remove this warning."
16371637
@echo "#########"
16381638
endif
16391639

1640-
embedding: examples/deprecation-warning/deprecation-warning.cpp
1640+
embedding: examples/deprecation-warning/deprecation-warning.o
16411641
ifneq (,$(wildcard embedding))
1642-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1643-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1642+
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
16441643
@echo "#########"
16451644
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
16461645
@echo " Remove the 'embedding' binary to remove this warning."

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
16341634
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
16351635
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
16361636
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
1637-
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
1637+
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
16381638
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
16391639
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
16401640
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def prepare_tensors(self):
316316
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
317317
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
318318
data = gguf.quantize_bf16(data)
319-
assert data.dtype == np.int16
319+
assert data.dtype == np.uint16
320320
data_qtype = gguf.GGMLQuantizationType.BF16
321321

322322
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):

docs/build.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,11 @@ For Jetson users, if you have Jetson Orin, you can try this: [Official Support](ht
178178
cmake --build build --config Release
179179
```
180180
181-
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
181+
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
182+
183+
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
184+
185+
The following compilation options are also available to tweak performance:
182186
183187
| Option | Legal values | Default | Description |
184188
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

examples/baby-llama/baby-llama.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include "ggml.h"
22
#include "train.h"
33

4-
#include <vector>
54
#include <cassert>
65
#include <cstdlib>
76
#include <cstring>

0 commit comments

Comments
 (0)