
Commit 06cc123

Merge branch 'concedo_experimental' into croco_nex_0

2 parents: 0238fe5 + 3e6ef8e
9 files changed: +34 / -89 lines

.github/workflows/kcpp-build-release-win-cuda.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/workflows/kcpp-build-release-win-cuda12.yaml

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/workflows/kcpp-build-release-win-full-cu12.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS}
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit

.github/workflows/kcpp-build-release-win-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS}
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit

.github/workflows/kcpp-build-release-win-oldcpu-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ jobs:
       - name: Build Non-CUDA
         id: make_build
         run: |
-          make -j ${env:NUMBER_OF_PROCESSORS} LLAMA_NOAVX2=1
+          make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS} LLAMA_NOAVX2=1
 
       - uses: Jimver/[email protected]
         id: cuda-toolkit
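Taken together, the three retained Windows workflows now request the CLBlast, Vulkan, and portable variants explicitly in the "Build Non-CUDA" step. As a rough local reproduction (not part of the commit), the same step could be run as follows; the first form reuses the workflow's PowerShell-style `${env:NUMBER_OF_PROCESSORS}` expansion, and the `nproc` form is an assumed POSIX-shell equivalent:

    # as run by the updated CI step (PowerShell / pwsh)
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}

    # assumed equivalent on a POSIX shell
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j "$(nproc)"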

Makefile

Lines changed: 22 additions & 15 deletions
@@ -5,9 +5,6 @@
 
 default: koboldcpp_default koboldcpp_failsafe koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2 finishedmsg
 tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
-dev: koboldcpp_default
-dev2: koboldcpp_clblast
-dev3: koboldcpp_vulkan finishedmsg
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -153,6 +150,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
 # old library NEEDS mf16c to work. so we must build with it. new one doesnt
 ifeq ($(OS),Windows_NT)
+ifdef LLAMA_PORTABLE
 CFLAGS +=
 NONECFLAGS +=
 SIMPLECFLAGS += -mavx -msse3
@@ -161,8 +159,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 else
 FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
 endif
+else
+CFLAGS += -march=native -mtune=native
+endif
 else
-# if not on windows, they are clearly building it themselves, so lets just use whatever is supported
 ifdef LLAMA_PORTABLE
 CFLAGS +=
 NONECFLAGS +=
@@ -423,10 +423,17 @@ NOTIFY_MSG =
 
 ifeq ($(OS),Windows_NT)
 DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
+ifdef LLAMA_PORTABLE
 FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
+endif
+
+ifdef LLAMA_CLBLAST
 CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
+endif
+ifdef LLAMA_VULKAN
 VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ lib/vulkan-1.lib -shared -o $@.dll $(LDFLAGS)
+endif
 
 ifdef LLAMA_CUBLAS
 CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
@@ -459,18 +466,18 @@ else
 ifdef LLAMA_VULKAN
 VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ -lvulkan -shared -o $@.so $(LDFLAGS)
 endif
+endif
 
-ifndef LLAMA_CLBLAST
-ifndef LLAMA_CUBLAS
-ifndef LLAMA_HIPBLAS
-ifndef LLAMA_VULKAN
-ifndef LLAMA_METAL
-NOTIFY_MSG = @echo -e '\n***\nYou did a basic CPU build. For faster speeds, consider installing and linking a GPU BLAS library. For example, set LLAMA_VULKAN=1 to compile with Vulkan support. Read the KoboldCpp Wiki for more information. This is just a reminder, not an error.\n***\n'
-endif
-endif
-endif
-endif
-endif
+ifndef LLAMA_CLBLAST
+ifndef LLAMA_CUBLAS
+ifndef LLAMA_HIPBLAS
+ifndef LLAMA_VULKAN
+ifndef LLAMA_METAL
+NOTIFY_MSG = @echo -e '\n***\nYou did a basic CPU build. For faster speeds, consider installing and linking a GPU BLAS library. For example, set LLAMA_CLBLAST=1 LLAMA_VULKAN=1 to compile with Vulkan and CLBlast support. Add LLAMA_PORTABLE=1 to make a sharable build that other devices can use. Read the KoboldCpp Wiki for more information. This is just a reminder, not an error.\n***\n'
+endif
+endif
+endif
+endif
 endif
 
 
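The net effect of these Makefile hunks is that a bare `make` now produces a native, machine-tuned CPU build (on Windows it gains `-march=native -mtune=native` unless `LLAMA_PORTABLE=1` is set), while the CLBlast and Vulkan link rules and the failsafe/noavx2 portable variants are only defined when their respective flags are passed. A minimal sketch of the two invocations, with an illustrative `-j 8`:

    # new default: CPU-only build tuned for the local machine
    make -j 8

    # opt back in to CLBlast/Vulkan and the sharable portable variants
    make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j 8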
README.md

Lines changed: 6 additions & 1 deletion
@@ -83,13 +83,16 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - For Debian: Install `libclblast-dev`.
 - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1`, (or `LLAMA_HIPBLAS=1` for AMD). You will need CUDA Toolkit installed. Some have also reported success with the CMake file, though that is more for windows.
 - For a full featured build (all backends), do `make LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio)
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.gguf] [port]`
 
 ### Compiling on Windows
 - You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is:
 - Get the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs!
 - Clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git`
-- Make sure you are using the w64devkit integrated terminal, then run `make` at the KoboldCpp source folder. This will create the .dll files.
+- Make sure you are using the w64devkit integrated terminal, then run `make` at the KoboldCpp source folder. This will create the .dll files for a pure CPU native build.
+- For a full featured build (all backends), do `make LLAMA_CLBLAST=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio)
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip (`pip install PyInstaller`). Then run the script `make_pyinstaller.bat`
 - The koboldcpp.exe file will be at your dist folder.
 - **Building with CUDA**: Visual Studio, CMake and CUDA Toolkit is required. Clone the repo, then open the CMake file and compile it in Visual Studio. Copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
@@ -104,6 +107,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - You can compile your binaries from source. You can clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git`
 - A makefile is provided, simply run `make`.
 - If you want Metal GPU support, instead run `make LLAMA_METAL=1`, note that MacOS metal libraries need to be installed.
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`
 - After all binaries are built, you can run the python script with the command `koboldcpp.py --model [ggml_model.gguf]` (and add `--gpulayers (number of layer)` if you wish to offload layers to GPU).
 
 ### Compiling on Android (Termux Installation)
@@ -114,6 +118,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - Clone the repo `git clone https://github.com/LostRuins/koboldcpp.git`
 - Navigate to the koboldcpp folder `cd koboldcpp`
 - Build the project `make`
+- To make your build sharable and capable of working on other devices, you must use `LLAMA_PORTABLE=1`, this disables usage of ARM instrinsics.
 - Grab a small GGUF model, such as `wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf`
 - Start the python server `python koboldcpp.py --model KobbleTiny-Q4_K.gguf`
 - Connect to `http://localhost:5001` on your mobile browser
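For completeness, the end-to-end flow the updated README describes (portable build, fetch a small model, launch the server) could look roughly like this on Linux or Termux; the `nproc` job count is an assumption, while the model URL and run command are the README's own examples:

    make LLAMA_PORTABLE=1 -j "$(nproc)"
    wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf
    python koboldcpp.py --model KobbleTiny-Q4_K.gguf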

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 0 deletions
@@ -748,6 +748,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             if (!backend_prealloc_warn) {
                 backend_prealloc_warn = true;
                 printf("\nCaution: pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)\n", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
+                printf("\nNote that if you are using Quantized KV, not all backends support it!\n");
             }
         }
 
gpttype_adapter.cpp

Lines changed: 2 additions & 2 deletions
@@ -630,14 +630,14 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch
 };
 
 //loads a model for speculative decoding.
-static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draftgpulayers)
+static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draft_gpulayers)
 {
     llama_model_params draft_model_params = llama_model_default_params();
     llama_context_params draft_ctx_params = llama_context_default_params();
 
     draft_model_params.use_mmap = base_model_params.use_mmap;
     draft_model_params.use_mlock = base_model_params.use_mlock;
-    draft_model_params.n_gpu_layers = draftgpulayers; //layers offload the speculative model.
+    draft_model_params.n_gpu_layers = draft_gpulayers; //layers offload the speculative model.
     draft_ctx_params.n_ctx = base_ctx_params.n_ctx;
     draft_ctx_params.logits_all = false;
     draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
