
Commit a841eb1

Author: prima
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement

2 parents: 7213c8c + 45f589b


79 files changed: +5679 / -3158 lines

.github/workflows/kcpp-build-release-win.yaml

Lines changed: 32 additions & 0 deletions
@@ -172,4 +172,36 @@ jobs:
           ggml/src/ggml-vulkan-shaders.hpp
           vulkan-readme.txt
 
+    - name: Build Tools
+      id: make_tools
+      run: |
+        make tools -j 4
+        New-Item -ItemType Directory -Path legacy
+        Copy-Item quantize_gpt2.exe -Destination legacy
+        Copy-Item quantize_gptj.exe -Destination legacy
+        Copy-Item quantize_mpt.exe -Destination legacy
+        Copy-Item quantize_neox.exe -Destination legacy
+        Copy-Item otherarch/tools/convert_hf_gpt2.py -Destination legacy
+        Copy-Item otherarch/tools/convert_hf_gptj.py -Destination legacy
+        Copy-Item otherarch/tools/convert_hf_mpt.py -Destination legacy
+        Copy-Item otherarch/tools/convert_hf_neox.py -Destination legacy
+        Copy-Item otherarch/tools/convert_llama_ggml_to_gguf.py -Destination legacy
+        Copy-Item otherarch/tools/convert_pt_rwkv.py -Destination legacy
+      shell: pwsh
 
+    - name: Upload Tools
+      uses: actions/upload-artifact@v4
+      with:
+        name: koboldcpp_tools
+        path: |
+          gguf-split.exe
+          quantize_clip.exe
+          quantize_gguf.exe
+          whispermain.exe
+          sdmain.exe
+          ttsmain.exe
+          whispermain.exe
+          convert_hf_to_gguf.py
+          convert_hf_to_gguf_update.py
+          gguf-py
+          legacy

Makefile

Lines changed: 3 additions & 3 deletions
@@ -244,7 +244,7 @@ ifdef LLAMA_HIPBLAS
   ifeq ($(wildcard /opt/rocm),)
     ROCM_PATH ?= /usr
     ifdef LLAMA_PORTABLE
-      GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx1010 gfx1030 gfx1031 gfx1032 gfx1100 gfx1101 gfx1102 $(shell $(shell which amdgpu-arch))
+      GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx1010 gfx1030 gfx1031 gfx1032 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201 $(shell $(shell which amdgpu-arch))
     else
       GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
     endif
@@ -253,7 +253,7 @@ endif
   else
     ROCM_PATH ?= /opt/rocm
     ifdef LLAMA_PORTABLE
-      GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx1010 gfx1030 gfx1031 gfx1032 gfx1100 gfx1101 gfx1102 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+      GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx1010 gfx1030 gfx1031 gfx1032 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
     else
       GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
     endif
@@ -675,7 +675,7 @@ embeddings_default.o: otherarch/embeddings_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-kv-cache-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-memory-hybrid.cpp src/llama-memory-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)

common/arg.cpp

Lines changed: 33 additions & 0 deletions
@@ -2708,6 +2708,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -3212,6 +3219,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -714,11 +714,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
@@ -1292,6 +1298,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
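The new guard in common_tokenize() protects the usual two-pass pattern: tokenize into a guessed buffer, and if the call reports a negative value, resize to the required count and tokenize again. INT32_MIN is treated as an overflow sentinel because it cannot be safely negated. A self-contained sketch of that control flow, using a toy tokenize_stub() in place of the real llama_tokenize():

    #include <cstdint>
    #include <limits>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Toy stand-in for llama_tokenize(): returns the token count on success, the
    // negated required count if the buffer is too small, and INT32_MIN when even
    // the required count would not fit in int32_t.
    static int32_t tokenize_stub(const char * text, size_t len, int32_t * out, int32_t cap) {
        if (len > (size_t) std::numeric_limits<int32_t>::max()) {
            return std::numeric_limits<int32_t>::min();
        }
        const int32_t needed = (int32_t) len;   // pretend: one token per byte
        if (needed > cap) {
            return -needed;
        }
        for (int32_t i = 0; i < needed; ++i) {
            out[i] = (int32_t) (unsigned char) text[i];
        }
        return needed;
    }

    static std::vector<int32_t> tokenize_two_pass(const std::string & text) {
        std::vector<int32_t> result(text.size() / 4 + 2);   // heuristic first guess
        int32_t n = tokenize_stub(text.data(), text.size(), result.data(), (int32_t) result.size());
        if (n == std::numeric_limits<int32_t>::min()) {
            // negating the sentinel would overflow, so fail loudly instead
            throw std::runtime_error("tokenization result exceeds int32_t limit");
        }
        if (n < 0) {
            result.resize(-n);                              // second pass with the exact size
            n = tokenize_stub(text.data(), text.size(), result.data(), (int32_t) result.size());
        }
        result.resize(n);
        return result;
    }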

common/common.h

Lines changed: 4 additions & 0 deletions
@@ -195,6 +195,9 @@ struct common_params_speculative {
     float p_split = 0.1f;  // speculative decoding split probability
     float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -351,6 +354,7 @@ struct common_params {
     int32_t embd_normalize = 2;    // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = "";     // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n";   // separator of embeddings
+    std::string cls_sep = "\t";    // separator of classification sequences
 
     // server params
     int32_t port = 8080;           // server listens on this network port
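The new cls_sep field defaults to a tab, and the --cls-separator help text above gives "<#seq#>" as an alternative separator. How the embedding example consumes it is not shown in this diff; assuming it simply splits one input line into multiple classification sequences, a minimal sketch of that split:

    #include <string>
    #include <vector>

    // Split `input` on `sep` (e.g. "\t" or "<#seq#>"); a hypothetical sketch of
    // how a classification-sequence separator like cls_sep might be applied.
    static std::vector<std::string> split_sequences(const std::string & input, const std::string & sep) {
        std::vector<std::string> parts;
        if (sep.empty()) {
            parts.push_back(input);
            return parts;
        }
        size_t start = 0;
        for (size_t pos = input.find(sep); pos != std::string::npos; pos = input.find(sep, start)) {
            parts.push_back(input.substr(start, pos - start));
            start = pos + sep.size();
        }
        parts.push_back(input.substr(start));
        return parts;
    }

    // Example: split_sequences("premise\thypothesis", "\t") yields {"premise", "hypothesis"}.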

convert_hf_to_gguf.py

Lines changed: 11 additions & 23 deletions
@@ -2145,7 +2145,6 @@ def __init__(self, *args, **kwargs):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -3918,9 +3917,6 @@ def _xlmroberta_set_vocab(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ def set_vocab(self):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
@@ -4848,8 +4842,6 @@ def set_vocab(self):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5443,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5580,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -6389,8 +6375,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6493,18 +6479,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6524,9 +6512,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
@@ -6555,7 +6543,7 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=str(args.model) if args.remote else None)
+            remote_hf_model_id=hf_repo_id)
 
         if args.vocab_only:
             logger.info("Exporting model vocab...")
