
Commit 0dea426

Merge branch 'master' into compilade/batch-splits
2 parents: 9c0a61f + 4c676c8


85 files changed: +2766 −6402 lines

.devops/llama-cli-intel.Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     echo "GGML_SYCL_F16 is set" && \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
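This hunk switches the Intel SYCL image to a static llama-cli build (-DBUILD_SHARED_LIBS=OFF). A minimal sketch of building the image, assuming GGML_SYCL_F16 is exposed as a Docker build argument as the RUN step above implies; the image tag is illustrative and not part of this commit:

    # sketch only: tag name chosen for the example
    docker build -f .devops/llama-cli-intel.Dockerfile \
        --build-arg GGML_SYCL_F16=ON \
        -t llama-cli-intel .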

.devops/llama-server-intel.Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     echo "GGML_SYCL_F16 is set" && \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
+    echo "Building with dynamic libs" && \
     cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
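The server image keeps dynamic libraries; the change only adds a log line before the configure step. A hedged sketch of building and running the resulting image, assuming its entrypoint forwards arguments to llama-server; tag, port, and model path are illustrative:

    # sketch only
    docker build -f .devops/llama-server-intel.Dockerfile -t llama-server-intel .
    docker run --rm -p 8080:8080 -v /path/to/models:/models llama-server-intel \
        -m /models/model.gguf --host 0.0.0.0 --port 8080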

.devops/nix/apps.nix

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
       "llama-embedding"
       "llama-server"
       "llama-quantize"
-      "llama-train-text-from-scratch"
     ];
     mkApp = name: {
       type = "app";
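With llama-train-text-from-scratch removed, the flake's app list only names binaries that are still built. A hedged sketch of launching one of the remaining apps, assuming the flake exposes them under these names:

    # sketch only: model path is a placeholder
    nix run .#llama-server -- -m /path/to/model.gguf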

.devops/tools.sh

Lines changed: 0 additions & 4 deletions
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
     echo " ex: --outtype f16 \"/models/7B/\" "
     echo " --quantize (-q): Optimize with quantization process ggml"
     echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo " See documentation for finetune for command-line parameters"
     echo " --all-in-one (-a): Execute --convert & --quantize"
     echo " ex: \"/models/\" 7B"
     echo " --server (-s): Run a model on the server"

Makefile

Lines changed: 54 additions & 39 deletions
@@ -11,7 +11,6 @@ BUILD_TARGETS = \
     llama-embedding \
     llama-eval-callback \
     llama-export-lora \
-    llama-finetune \
     llama-gbnf-validator \
     llama-gguf \
     llama-gguf-hash \
@@ -37,7 +36,6 @@ BUILD_TARGETS = \
     llama-simple \
     llama-speculative \
     llama-tokenize \
-    llama-train-text-from-scratch \
     llama-vdot \
     llama-cvector-generator \
     tests/test-c.o
@@ -64,13 +62,13 @@ TEST_TARGETS = \
     tests/test-tokenizer-1-spm

 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
     simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-    retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+    retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm

 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server

 # Deprecation aliases
 ifdef LLAMA_CUBLAS
@@ -327,9 +325,9 @@ ifdef LLAMA_DEBUG
 endif
 else
     MK_CPPFLAGS += -DNDEBUG
-    MK_CFLAGS += -O3
-    MK_CXXFLAGS += -O3
-    MK_NVCCFLAGS += -O3
+    MK_CFLAGS += -O3 -g
+    MK_CXXFLAGS += -O3 -g
+    MK_NVCCFLAGS += -O3 -g
 endif

 ifdef LLAMA_SANITIZE_THREAD
@@ -530,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
 endif
 endif # GGML_NO_ACCELERATE

+ifdef GGML_MUSA
+    CC := clang
+    CXX := clang++
+    GGML_CUDA := 1
+    MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
 ifndef GGML_NO_OPENMP
     MK_CPPFLAGS += -DGGML_USE_OPENMP
     MK_CFLAGS += -fopenmp
     MK_CXXFLAGS += -fopenmp
+    ifdef GGML_MUSA
+        MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+        MK_LDFLAGS += -L/usr/lib/llvm-10/lib
+    endif # GGML_MUSA
 endif # GGML_NO_OPENMP

 ifdef GGML_OPENBLAS
@@ -584,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA
-    ifneq ('', '$(wildcard /opt/cuda)')
-        CUDA_PATH ?= /opt/cuda
+    ifdef GGML_MUSA
+        ifneq ('', '$(wildcard /opt/musa)')
+            CUDA_PATH ?= /opt/musa
+        else
+            CUDA_PATH ?= /usr/local/musa
+        endif
+
+        MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+        MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+        MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
     else
-        CUDA_PATH ?= /usr/local/cuda
-    endif
+        ifneq ('', '$(wildcard /opt/cuda)')
+            CUDA_PATH ?= /opt/cuda
+        else
+            CUDA_PATH ?= /usr/local/cuda
+        endif

-    MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-    MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-    MK_NVCCFLAGS += -use_fast_math
+        MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+        MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+        MK_NVCCFLAGS += -use_fast_math
+    endif # GGML_MUSA

     OBJ_GGML += ggml/src/ggml-cuda.o
     OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -602,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
     MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS

+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
     MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA

 ifdef LLAMA_DEBUG
     MK_NVCCFLAGS += -lineinfo
@@ -617,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
     NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-    NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+    ifdef GGML_MUSA
+        NVCC = $(CCACHE) mcc
+    else
+        NVCC = $(CCACHE) nvcc
+    endif # GGML_MUSA
+endif # GGML_CUDA_NVCC

 ifdef CUDA_DOCKER_ARCH
     MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -689,9 +716,15 @@ define NVCC_COMPILE
     $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+ifdef GGML_MUSA
+define NVCC_COMPILE
+    $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+else
 define NVCC_COMPILE
     $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT

 ggml/src/ggml-cuda/%.o: \
@@ -946,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)

 ifndef CUDA_DOCKER_ARCH
@@ -955,6 +989,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH

 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )

@@ -1296,11 +1331,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
-    $(OBJ_ALL)
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
     $(OBJ_GGML) $(OBJ_LLAMA)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1316,13 +1346,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-finetune: examples/finetune/finetune.cpp \
-    $(OBJ_ALL)
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
-    $(OBJ_GGML) common/log.h
+    $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1578,7 +1603,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server finetune
+.PHONY: main quantize perplexity embedding server

 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 # Eventually we will want to remove these target from building all the time.
@@ -1621,13 +1646,3 @@ ifneq (,$(wildcard embedding))
     @echo " Remove the 'embedding' binary to remove this warning."
     @echo "#########"
 endif
-
-finetune: examples/deprecation-warning/deprecation-warning.cpp
-ifneq (,$(wildcard finetune))
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-    @echo "#########"
-    @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
-    @echo " Remove the 'finetune' binary to remove this warning."
-    @echo "#########"
-endif
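The new GGML_MUSA switch reuses the CUDA path for Moore Threads GPUs: it forces clang/clang++, implies GGML_CUDA, points CUDA_PATH at the MUSA SDK (/opt/musa or /usr/local/musa), and compiles the CUDA sources with mcc instead of nvcc. A hedged sketch of driving it from this Makefile, assuming the MUSA SDK sits in one of the probed default locations; the target list and job count are illustrative:

    # sketch only
    make GGML_MUSA=1 -j8 llama-cli llama-server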

README.md

Lines changed: 5 additions & 0 deletions
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.

 Unless otherwise noted these projects are open-source with permissive licensing:

+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
@@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
 ## Demo

 <details>
@@ -405,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |

common/common.cpp

Lines changed: 16 additions & 10 deletions
@@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
         return true;
     }
-    if (arg == "--lora-base") {
-        CHECK_ARG
-        params.lora_base = argv[i];
-        return true;
-    }
     if (arg == "--control-vector") {
         CHECK_ARG
         params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         CHECK_ARG
         params.out_file = argv[i];
         params.cvector_outfile = argv[i];
+        params.lora_outfile = argv[i];
         return true;
     }
     if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1328,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1450,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
     options.push_back({ "server infill",
                        " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

@@ -1583,9 +1584,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
         "advanced option to override model metadata by key. may be specified multiple times.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
-    options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
-    options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
+    options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
     options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
         "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@@ -1676,6 +1676,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
     options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });

+    options.push_back({ "export-lora" });
+    options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
+    options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+    options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
+    options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
+
     printf("usage: %s [options]\n", argv[0]);

     for (const auto & o : options) {
@@ -2721,7 +2728,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3166,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
         }
         fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

common/common.h

Lines changed: 2 additions & 1 deletion
@@ -128,7 +128,6 @@ struct gpt_params {

     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base = ""; // base model path for the lora adapter

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -255,6 +254,8 @@ struct gpt_params {
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

 void gpt_params_handle_hf_token(gpt_params & params);
