
Commit 06cc123

Merge branch 'concedo_experimental' into croco_nex_0

2 parents: 0238fe5 + 3e6ef8e
9 files changed: +34 / -89 lines

.github/workflows/kcpp-build-release-win-cuda.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/workflows/kcpp-build-release-win-cuda12.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/workflows/kcpp-build-release-win-full-cu12.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS}
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit

.github/workflows/kcpp-build-release-win-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS}
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit

.github/workflows/kcpp-build-release-win-oldcpu-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS} LLAMA_NOAVX2=1
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS} LLAMA_NOAVX2=1
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit
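Taken together, the three retained Windows workflows now request the CLBlast, Vulkan, and portable variants explicitly in the "Build Non-CUDA" step. As a rough local reproduction (not part of the commit), the same step could be run as follows; the first form reuses the workflow's PowerShell-style `${env:NUMBER_OF_PROCESSORS}` expansion, and the `nproc` form is an assumed POSIX-shell equivalent:

    # as run by the updated CI step (PowerShell / pwsh)
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}

    # assumed equivalent on a POSIX shell
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j "$(nproc)"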

Makefile

Lines changed: 22 additions & 15 deletions
@@ -5,9 +5,6 @@
 
 default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
 tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
-dev: koboldcpp_default
-dev2: koboldcpp_clblast
-dev3: koboldcpp_vulkan finishedmsg
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -153,6 +150,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
 # old library NEEDS mf16c to work. so we must build with it. new one doesnt
 ifeq ($(OS),Windows_NT)
+ifdef LLAMA_PORTABLE
 CFLAGS +=
 NONECFLAGS +=
 SIMPLECFLAGS += -mavx -msse3
@@ -161,8 +159,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 else
 FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
 endif
+else
+CFLAGS += -march=native -mtune=native
+endif
 else
-# if not on windows, they are clearly building it themselves, so lets just use whatever is supported
 ifdef LLAMA_PORTABLE
 CFLAGS +=
 NONECFLAGS +=
@@ -423,10 +423,17 @@ NOTIFY_MSG =
 
 ifeq ($(OS),Windows_NT)
 DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
+ifdef LLAMA_PORTABLE
 FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
+endif
+
+ifdef LLAMA_CLBLAST
 CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
+endif
+ifdef LLAMA_VULKAN
 VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ lib/vulkan-1.lib -shared -o $@.dll $(LDFLAGS)
+endif
 
 ifdef LLAMA_CUBLAS
 CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
@@ -459,18 +466,18 @@ else
 ifdef LLAMA_VULKAN
 VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ -lvulkan -shared -o $@.so $(LDFLAGS)
 endif
+endif
 
-ifndef LLAMA_CLBLAST
-ifndef LLAMA_CUBLAS
-ifndef LLAMA_HIPBLAS
-ifndef LLAMA_VULKAN
-ifndef LLAMA_METAL
-NOTIFY_MSG = @echo -e '\n***\nYou did a basic CPU build. For faster speeds, consider installing and linking a GPU BLAS library. For example, set LLAMA_VULKAN=1 to compile with Vulkan support. Read the KoboldCpp Wiki for more information. This is just a reminder, not an error.\n***\n'
-endif
-endif
-endif
-endif
-endif
+ifndef LLAMA_CLBLAST
+ifndef LLAMA_CUBLAS
+ifndef LLAMA_HIPBLAS
+ifndef LLAMA_VULKAN
+ifndef LLAMA_METAL
+NOTIFY_MSG = @echo -e '\n***\nYou did a basic CPU build. For faster speeds, consider installing and linking a GPU BLAS library. For example, set LLAMA_CLBLAST=1 LLAMA_VULKAN=1 to compile with Vulkan and CLBlast support. Add LLAMA_PORTABLE=1 to make a sharable build that other devices can use. Read the KoboldCpp Wiki for more information. This is just a reminder, not an error.\n***\n'
+endif
+endif
+endif
+endif
 endif
 
 
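The net effect of these Makefile hunks is that a bare `make` now produces a native, machine-tuned CPU build (on Windows it gains `-march=native -mtune=native` unless `LLAMA_PORTABLE=1` is set), while the CLBlast and Vulkan link rules and the failsafe/noavx2 portable variants are only defined when their respective flags are passed. A minimal sketch of the two invocations, with an illustrative `-j 8`:

    # new default: CPU-only build tuned for the local machine
    make -j 8

    # opt back in to CLBlast/Vulkan and the sharable portable variants
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j 8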
README.md

Lines changed: 6 additions & 1 deletion
@@ -83,13 +83,16 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - For Debian: Install `libclblast-dev`.
 - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1`, (or `LLAMA_HIPBLAS=1` for AMD). You will need CUDA Toolkit installed. Some have also reported success with the CMake file, though that is more for windows.
 - For a full featured build (all backends), do `make LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio)
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.gguf] [port]`
 
 ### Compiling on Windows
 - You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is:
 - Get the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs!
 - Clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git`
-- Make sure you are using the w64devkit integrated terminal, then run `make` at the KoboldCpp source folder. This will create the .dll files.
+- Make sure you are using the w64devkit integrated terminal, then run `make` at the KoboldCpp source folder. This will create the .dll files for a pure CPU native build.
+- For a full featured build (all backends), do `make LLAMA_CLBLAST=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio)
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip (`pip install PyInstaller`). Then run the script `make_pyinstaller.bat`
 - The koboldcpp.exe file will be at your dist folder.
 - **Building with CUDA**: Visual Studio, CMake and CUDA Toolkit is required. Clone the repo, then open the CMake file and compile it in Visual Studio. Copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
@@ -104,6 +107,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - You can compile your binaries from source. You can clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git`
 - A makefile is provided, simply run `make`.
 - If you want Metal GPU support, instead run `make LLAMA_METAL=1`, note that MacOS metal libraries need to be installed.
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - After all binaries are built, you can run the python script with the command `koboldcpp.py --model [ggml_model.gguf]` (and add `--gpulayers (number of layer)` if you wish to offload layers to GPU).
 
 ### Compiling on Android (Termux Installation)
@@ -114,6 +118,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - Clone the repo `git clone https://github.com/LostRuins/koboldcpp.git`
 - Navigate to the koboldcpp folder `cd koboldcpp`
 - Build the project `make`
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`, this disables usage of ARM instrinsics.
 - Grab a small GGUF model, such as `wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf`
 - Start the python server `python koboldcpp.py --model KobbleTiny-Q4_K.gguf`
 - Connect to `http://localhost:5001` on your mobile browser
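For completeness, the end-to-end flow the updated README describes (portable build, fetch a small model, launch the server) could look roughly like this on Linux or Termux; the `nproc` job count is an assumption, while the model URL and run command are the README's own examples:

    make LLAMA_PORTABLE=1 -j "$(nproc)"
    wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf
    python koboldcpp.py --model KobbleTiny-Q4_K.gguf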

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 0 deletions
@@ -748,6 +748,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             if (!backend_prealloc_warn) {
                 backend_prealloc_warn = true;
                 printf("\nCaution: pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)\n", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
+                printf("\nNote that if you are using Quantized KV, not all backends support it!\n");
             }
         }
 
gpttype_adapter.cpp

Lines changed: 2 additions & 2 deletions
@@ -630,14 +630,14 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch
 };
 
 //loads a model for speculative decoding.
-static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draftgpulayers)
+static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
 {
     llama_model_params draft_model_params = llama_model_default_params();
     llama_context_params draft_ctx_params = llama_context_default_params();
 
     draft_model_params.use_mmap = base_model_params.use_mmap;
     draft_model_params.use_mlock = base_model_params.use_mlock;
-    draft_model_params.n_gpu_layers = draftgpulayers; //layers offload the speculative model.
+    draft_model_params.n_gpu_layers = draft_gpulayers; //layers offload the speculative model.
     draft_ctx_params.n_ctx = base_ctx_params.n_ctx;
     draft_ctx_params.logits_all = false;
     draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
