Commit c038eaa

Merge branch 'concedo_experimental' into croco_nex_0
2 parents: 0f39d8d + dca7ab5


56 files changed: +25468 −25245 lines

.github/workflows/kcpp-build-release-arm64.yaml

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ jobs:
         "

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_linux_arm64_binary
           path: dist/

.github/workflows/kcpp-build-release-linux-cuda12.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ jobs:
         ./koboldcpp.sh dist

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_linux_binary
           path: dist/

.github/workflows/kcpp-build-release-linux.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ jobs:
         ./koboldcpp.sh dist

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_linux_binary
           path: dist/

.github/workflows/kcpp-build-release-osx.yaml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ jobs:
         dist/koboldcpp-mac-arm64 --model baby_llama.gguf --gpulayers 99 --benchmark --prompt 'Hi, my name is'

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_mac_binary
           path: dist/

.github/workflows/kcpp-build-release-win-full-cu12.yaml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ jobs:
         ./make_pyinstaller_cuda12.bat

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_windows_pyinstallers
           path: dist/

.github/workflows/kcpp-build-release-win-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ jobs:
         ./make_pyinstaller_cuda.bat

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_windows_pyinstallers
           path: dist/

.github/workflows/kcpp-build-release-win-oldcpu-full.yaml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ jobs:
         ./make_pyinstaller_cuda_oldcpu.bat

       - name: Save artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: kcpp_windows_pyinstallers
           path: dist/

Makefile

Lines changed: 1 addition & 1 deletion
@@ -676,7 +676,7 @@ whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@

 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-vocab.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1514,15 +1514,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0 });
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale) });
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
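
Note on the extra nullptr: both initializer lists gain a third member, reflecting a new raw-pointer field on the adapter-info struct that is filled in only once the adapter is actually loaded. A minimal sketch of the shape these initializers imply, with the caveat that the authoritative definition lives in common/common.h and the field names here are inferred from the usage in common/common.cpp below:

#include <string>

struct llama_lora_adapter; // opaque llama.cpp handle

struct common_lora_adapter_info {
    std::string path;                // adapter file path (FNAME)
    float scale;                     // 1.0 for --lora, user value for --lora-scaled
    struct llama_lora_adapter * ptr; // nullptr until llama_lora_adapter_init() runs
};

The argument parser records only path and scale; ptr stays nullptr until common_init_from_params() loads the adapter and points it at the live handle.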

common/common.cpp

Lines changed: 16 additions & 16 deletions
@@ -891,9 +891,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }

     if (!params.control_vectors.empty()) {
@@ -924,20 +923,21 @@ struct common_init_result common_init_from_params(common_params & params) {

     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }

     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -998,17 +998,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }

-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);

     return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1365,7 +1365,7 @@ struct llama_model * common_load_model_from_url(
         return NULL;
     }

-    auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT_STR);
+    auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
     if (key_n_split >= 0) {
         n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
     }
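
The reset() and emplace_back(std::move(...)) calls above indicate that common_init_result now owns the model, context, and adapters through unique_ptr aliases with custom deleters rather than raw pointers, so loaded adapters are released together with the result. A hedged sketch of that pattern for the adapter handle only (the deleter and alias names are assumptions modeled on the calls in this diff; llama_lora_adapter_free is the matching free function declared in llama.h):

#include <memory>
#include <vector>

struct llama_lora_adapter;                                 // opaque llama.cpp handle
void llama_lora_adapter_free(struct llama_lora_adapter *); // declared in llama.h

// Assumed deleter: routes unique_ptr destruction through the C API.
struct llama_lora_adapter_deleter {
    void operator()(llama_lora_adapter * a) const { llama_lora_adapter_free(a); }
};

using llama_lora_adapter_ptr = std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter>;

// common_init_result would then hold, among other members:
//     std::vector<llama_lora_adapter_ptr> lora;
// so an early "return iparams;" frees adapters without explicit cleanup calls.

Under this shape, la.ptr = lora.get() keeps a non-owning view in params.lora_adapters for common_lora_adapters_apply() to use, while ownership itself moves into iparams.lora.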
