
Commit 650f129

Merge branch 'concedo_experimental' into crokeso
2 parents: 7a6084b + ce58d12


70 files changed: +5056 / -2825 lines

.github/workflows/kcpp-build-release-win.yaml
Lines changed: 32 additions & 0 deletions

@@ -172,4 +172,36 @@ jobs:
             ggml/src/ggml-vulkan-shaders.hpp
             vulkan-readme.txt
 
+      - name: Build Tools
+        id: make_tools
+        run: |
+          make tools -j 4
+          New-Item -ItemType Directory -Path legacy
+          Copy-Item quantize_gpt2.exe -Destination legacy
+          Copy-Item quantize_gptj.exe -Destination legacy
+          Copy-Item quantize_mpt.exe -Destination legacy
+          Copy-Item quantize_neox.exe -Destination legacy
+          Copy-Item otherarch/tools/convert_hf_gpt2.py -Destination legacy
+          Copy-Item otherarch/tools/convert_hf_gptj.py -Destination legacy
+          Copy-Item otherarch/tools/convert_hf_mpt.py -Destination legacy
+          Copy-Item otherarch/tools/convert_hf_neox.py -Destination legacy
+          Copy-Item otherarch/tools/convert_llama_ggml_to_gguf.py -Destination legacy
+          Copy-Item otherarch/tools/convert_pt_rwkv.py -Destination legacy
+        shell: pwsh
 
+      - name: Upload Tools
+        uses: actions/upload-artifact@v4
+        with:
+          name: koboldcpp_tools
+          path: |
+            gguf-split.exe
+            quantize_clip.exe
+            quantize_gguf.exe
+            whispermain.exe
+            sdmain.exe
+            ttsmain.exe
+            whispermain.exe
+            convert_hf_to_gguf.py
+            convert_hf_to_gguf_update.py
+            gguf-py
+            legacy

Makefile
Lines changed: 1 addition & 1 deletion

@@ -738,7 +738,7 @@ embeddings_default.o: otherarch/embeddings_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-kv-cache-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-memory-hybrid.cpp src/llama-memory-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)

common/arg.cpp
Lines changed: 33 additions & 0 deletions

@@ -2709,6 +2709,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -3213,6 +3220,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",

common/common.cpp
Lines changed: 9 additions & 0 deletions

@@ -714,11 +714,17 @@ bool fs_validate_filename(const std::string & filename) {
         // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
         filename_utf32 = converter.from_bytes(filename);
@@ -1292,6 +1298,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
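The extra check in common_tokenize guards the size-negotiation convention of llama_tokenize: a negative return value normally means "buffer too small, retry with -n_tokens slots", so INT32_MIN is reserved as an unambiguous "the token count itself does not fit in int32_t" sentinel. The self-contained sketch below mimics that two-pass pattern with a stand-in fake_tokenize so it can run without a model; the helper names are hypothetical, not the library's API.

// Illustrative stand-in for the two-pass pattern used by common_tokenize().
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

static int32_t fake_tokenize(const std::string & text, int32_t * buf, int32_t buf_size) {
    if (text.size() > (size_t) std::numeric_limits<int32_t>::max()) {
        return std::numeric_limits<int32_t>::min();   // overflow sentinel, like the new guard
    }
    const int32_t needed = (int32_t) text.size();     // pretend: one token per byte
    if (needed > buf_size) {
        return -needed;                               // report the required size as a negative count
    }
    for (int32_t i = 0; i < needed; ++i) {
        buf[i] = (int32_t) (unsigned char) text[i];
    }
    return needed;
}

static std::vector<int32_t> tokenize_two_pass(const std::string & text) {
    std::vector<int32_t> result(text.size() / 4 + 1);                       // optimistic first guess
    int32_t n = fake_tokenize(text, result.data(), (int32_t) result.size());
    if (n == std::numeric_limits<int32_t>::min()) {
        throw std::runtime_error("tokenization result exceeds int32_t limit");
    }
    if (n < 0) {
        result.resize(-n);                                                  // second pass with the exact size
        n = fake_tokenize(text, result.data(), (int32_t) result.size());
    }
    result.resize(n);
    return result;
}

int main() {
    std::vector<int32_t> toks = tokenize_two_pass("hello world");
    return toks.size() == 11 ? 0 : 1;   // exit 0 when the round trip works
}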

common/common.h
Lines changed: 4 additions & 0 deletions

@@ -195,6 +195,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -351,6 +354,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
+    std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port

convert_hf_to_gguf.py
File mode changed: 100755 → 100644
Lines changed: 11 additions & 23 deletions

@@ -2474,7 +2474,6 @@ def __init__(self, *args, **kwargs):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -4247,9 +4246,6 @@ def _xlmroberta_set_vocab(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -4291,8 +4287,6 @@ def set_vocab(self):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
             # we need this to validate the size of the token_type embeddings
             # though currently we are passing all zeros to the token_type embeddings
@@ -5177,8 +5171,6 @@ def set_vocab(self):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5780,9 +5772,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5920,9 +5909,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -6718,8 +6704,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6826,18 +6812,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
             allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6910,9 +6898,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
@@ -6942,7 +6930,7 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=str(args.model) if args.remote else None,
+            remote_hf_model_id=hf_repo_id,
             thread_count=args.threads)
 
         if args.vocab_only:
