
Commit d29ba6e

Merge branch 'nocuda_since_mmvq_evo' into crokeso

2 parents: 2f17eeb + f50b136
35 files changed: +787 additions, -197 deletions

.github/workflows/kcpp-build-release-win-full-cu12.yaml (1 addition, 0 deletions)

@@ -63,6 +63,7 @@ jobs:
         id: cuda-toolkit
         with:
           cuda: '12.1.0'
+          use-github-cache: false
 
       - name: Build CUDA
         id: cmake_build

.github/workflows/kcpp-build-release-win-full.yaml (1 addition, 0 deletions)

@@ -62,6 +62,7 @@ jobs:
         id: cuda-toolkit
         with:
           cuda: '11.4.4'
+          use-github-cache: false
 
       - name: Build CUDA
         id: cmake_build

.github/workflows/kcpp-build-release-win-oldcpu-full.yaml (1 addition, 0 deletions)

@@ -62,6 +62,7 @@ jobs:
         id: cuda-toolkit
         with:
           cuda: '11.4.4'
+          use-github-cache: false
 
       - name: Build CUDA
         id: cmake_build

common/arg.cpp (27 additions, 16 deletions)

@@ -218,13 +218,11 @@ struct curl_slist_ptr {
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
     int remaining_attempts = max_attempts;
-    char * method = nullptr;
-    curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
 
     while (remaining_attempts > 0) {
-        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
 
         CURLcode res = curl_easy_perform(curl);
         if (res == CURLE_OK) {

@@ -288,24 +286,17 @@ static bool common_download_file_single(const std::string & url, const std::stri
             try {
                 metadata_in >> metadata;
                 LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
                 if (metadata.contains("etag") && metadata.at("etag").is_string()) {
                     etag = metadata.at("etag");
                 }
                 if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
                     last_modified = metadata.at("lastModified");
                 }
             } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
             }
         }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
     } else {
         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }

@@ -351,7 +342,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
     // we only allow retrying once for HEAD requests
     // this is for the use case of using running offline (no internet), retrying can be annoying
-    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
    if (!was_perform_successful) {
        head_request_ok = false;
    }

@@ -433,7 +424,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
     // start the download
     LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
         llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
     if (!was_perform_successful) {
         return false;
     }

@@ -1950,6 +1941,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"-jf", "--json-schema-file"}, "FILE",
+        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string schema;
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(schema)
+            );
+            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
@@ -2777,7 +2785,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
-        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        string_format(
+            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+        ),
         [](common_params & params, int value) {
             params.n_cache_reuse = value;
         }
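
One usage note on the `-jf/--json-schema-file` flag added above: it behaves like the existing `-j/--json-schema`, except that the schema is read from a file before being converted to a GBNF grammar. A minimal sketch of preparing such a file (the schema content, file name, and binary name are illustrative, not taken from the commit):

import json

# constrain generations to a JSON object with a single required string field
schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}
with open("schema.json", "w", encoding="utf-8") as f:
    json.dump(schema, f)

# the file can then be passed on the command line, e.g.:
#   llama-cli -jf schema.json -p "Reply in JSON."
# (binary name and prompt are illustrative)
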

convert_hf_to_gguf.py (87 additions, 45 deletions)

@@ -16,6 +16,7 @@
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
+from transformers import AutoConfig
 
 import math
 import numpy as np

@@ -66,8 +67,6 @@ class ModelBase:
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
-    block_count: int
-    tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
     gguf_writer: gguf.GGUFWriter
     model_name: str | None

@@ -78,6 +77,10 @@ class ModelBase:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
+    # subclasses should initialize this!
+    block_count: int
+    tensor_map: gguf.TensorNameMap
+
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,

@@ -113,8 +116,6 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
         if not self.is_safetensors:
             self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
-        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name

@@ -417,15 +418,15 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
 
     @staticmethod
     def load_hparams(dir_model: Path):
-        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            hparams = json.load(f)
-        architectures = hparams.get("architectures")
-        if "text_config" in hparams:
-            hparams = {**hparams, **hparams["text_config"]}
-        if architectures is not None:
-            # preserve "architectures" from root level config
-            hparams["architectures"] = architectures
-        return hparams
+        try:
+            # for security reason, we don't allow loading remote code by default
+            # if a model need remote code, we will fallback to config.json
+            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+        except Exception as e:
+            logger.warning(f"Failed to load model config from {dir_model}: {e}")
+            logger.warning("Trying to load config.json instead")
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                return json.load(f)
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
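
The switch to AutoConfig is what lets several hard-coded fallbacks disappear further down in this diff: transformers config classes carry their defaults in code, so to_dict() returns keys even when the checkpoint's config.json omits them. A hedged sketch of that effect (requires a transformers version that ships PixtralVisionConfig; the printed values match the constants the removed fixups used to hard-code):

from transformers import PixtralVisionConfig

# fields absent from a checkpoint's config.json are filled from class defaults,
# so the converter no longer has to patch them in by hand
cfg = PixtralVisionConfig()
print(cfg.num_attention_heads)  # 16
print(cfg.num_hidden_layers)    # 24
print(cfg.intermediate_size)    # 4096
print(cfg.hidden_size)          # 1024
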
@@ -454,6 +455,23 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type
 
 
 class TextModel(ModelBase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "text_config" in self.hparams:
+            # move the text_config to the root level
+            self.hparams = {**self.hparams, **self.hparams["text_config"]}
+
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    @classmethod
+    def __init_subclass__(cls):
+        # can't use an abstract property, because overriding it without type errors
+        # would require using decorated functions instead of simply defining the property
+        if "model_arch" not in cls.__dict__:
+            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+
     def set_vocab(self):
         self._set_vocab_gpt2()

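Two consequences of the new TextModel.__init__/__init_subclass__ pair, shown as a small sketch (class name and dict contents are invented for illustration):

# 1) sub-config flattening: "text_config" keys are promoted to the root level
hparams = {"vocab_size": 32000, "text_config": {"num_hidden_layers": 32}}
hparams = {**hparams, **hparams["text_config"]}
assert hparams["num_hidden_layers"] == 32 and hparams["vocab_size"] == 32000

# 2) subclass enforcement: forgetting model_arch now fails at class-definition time:
#
#     class MyModel(TextModel):   # hypothetical subclass without model_arch
#         pass
#     # -> TypeError: Missing property 'model_arch' for 'MyModel'

Because __init_subclass__ checks cls.__dict__ rather than inherited attributes, every subclass must declare model_arch itself; that is why NomicBertModel explicitly gains model_arch = gguf.MODEL_ARCH.BERT further down in this diff.
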
@@ -1070,9 +1088,9 @@ def __init__(self, *args, **kwargs):
         if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
             raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
 
-        # small hack to correct the number of layers
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
-        self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"])
+        # get n_embd of the text model
+        text_config = {**self.hparams, **self.hparams["text_config"]}
+        self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
         assert self.n_embd_text > 0, "n_embd not found in hparams"
 
         if "vision_config" not in self.hparams:

@@ -1081,6 +1099,9 @@ def __init__(self, *args, **kwargs):
         self.global_config = self.hparams
         self.hparams = self.hparams["vision_config"]
 
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
+
         # load preprocessor config
         with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
             self.preprocessor_config = json.load(f)
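
VisionModel now derives block_count from the vision sub-config instead of the old hard-coded 128-layer hack; note the extra "depth" key, which some vision configs use for the layer count. Roughly, find_hparam returns the value of the first key that is present, approximated here with an invented config:

vision_hparams = {"depth": 32, "hidden_size": 1280}  # e.g. some Qwen-style vision configs use "depth"
keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
block_count = next(vision_hparams[k] for k in keys if k in vision_hparams)
assert block_count == 32
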
@@ -1098,12 +1119,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
         self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_vision_block_count(self.block_count)
         self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
 
         # preprocessor config
         self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
-        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")

@@ -1719,23 +1740,12 @@ def prepare_tensors(self):
     "LlamaForCausalLM",
     "MistralForCausalLM",
     "MixtralForCausalLM",
-    "Idefics3ForConditionalGeneration",
-    "SmolVLMForConditionalGeneration",
+    "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # fix for SmolVLM2, missing `num_attention_heads` in config.json
-        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
-            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
-        # fix for Pixtral, missing `num_attention_heads` in config.json
-        if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
-            and self.hparams.get("model_type") == "mistral":
-            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
-
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1891,31 +1901,50 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("LlavaForConditionalGeneration")
+@ModelBase.register(
+    "LlavaForConditionalGeneration", # pixtral
+    "Mistral3ForConditionalGeneration", # mistral small 3.1
+)
 class LlavaVisionModel(VisionModel):
     img_break_tok_id = -1
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.hparams["model_type"] == "pixtral":
-            # fix missing config.json values
-            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
-            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
-            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
-            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
+            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
             self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
-            self.img_break_tok_id = 12 # see tokenizer_config.json
+            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
+            logger.info(f"Image break token id: {self.img_break_tok_id}")
         else:
             raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
 
+    def get_token_id(self, token: str) -> int:
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+            added_tokens_decoder = json.load(f)['added_tokens_decoder']
+            for id_, token_data in added_tokens_decoder.items():
+                if token_data["content"] == token:
+                    return int(id_)
+        raise ValueError(f"Token '{token}' not found in tokenizer config.")
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         if hparams["model_type"] == "pixtral":
             self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
-            # default values below are taken from HF tranformers code
             self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
-            self.gguf_writer.add_vision_use_silu(True)
+
+            # hidden_act
+            if hparams["hidden_act"] == "silu":
+                self.gguf_writer.add_vision_use_silu(True)
+            elif hparams["hidden_act"] == "gelu":
+                self.gguf_writer.add_vision_use_gelu(True)
+            else:
+                raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+
+            # spatial_merge_size
+            if "spatial_merge_size" in self.global_config:
+                self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
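
get_token_id replaces the previously hard-coded img_break_tok_id = 12 with a lookup in the model's own tokenizer_config.json. The added_tokens_decoder section it scans maps token-id strings to token metadata; a sketch of the expected layout with invented entries:

added_tokens_decoder = {
    "10": {"content": "[IMG]", "special": True},        # invented entry
    "12": {"content": "[IMG_BREAK]", "special": True},  # invented entry
}
img_break_tok_id = next(int(i) for i, t in added_tokens_decoder.items()
                        if t["content"] == "[IMG_BREAK]")
assert img_break_tok_id == 12
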
@@ -1944,13 +1973,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class SmolVLMModel(VisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # fix for SmolVLM2, missing some keys in config.json
-        # default values are taken from transformers code
         if self.hparams["model_type"] == "smolvlm_vision":
+            # fix for SmolVLM2, missing some keys in config.json
+            # default values are taken from transformers code
             self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
             self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
-            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()

@@ -3505,6 +3533,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("NomicBertModel")
 class NomicBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
         hparams = kwargs.pop("hparams", None)
         if hparams is None:
@@ -5849,6 +5879,19 @@ def split_str_to_n_bytes(split_str: str) -> int:
     return n
 
 
+def get_model_architecture(dir_model: Path, model_type: ModelType, hparams: Any = None) -> str:
+    hparams = ModelBase.load_hparams(dir_model) if hparams is None else hparams
+    text_config = hparams.get("text_config", {})
+    vision_config = hparams.get("vision_config", {})
+    arch = hparams["architectures"][0]
+    # if "architectures" is found in the sub-config, use that instead
+    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
+        arch = text_config["architectures"][0]
+    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
+        arch = vision_config["architectures"][0]
+    return arch
+
+
 def main() -> None:
     args = parse_args()

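The selection logic above can be exercised directly; given a composite config, the same hparams resolve to different architectures per ModelType. A sketch (dict contents and architecture names are invented for illustration):

from pathlib import Path

hparams = {
    "architectures": ["SomeForConditionalGeneration"],      # invented names
    "text_config": {"architectures": ["SomeForCausalLM"]},
    "vision_config": {},                                    # no sub-"architectures"
}
# passing hparams directly skips the on-disk load; Path(".") is a dummy
assert get_model_architecture(Path("."), ModelType.TEXT, hparams) == "SomeForCausalLM"
assert get_model_architecture(Path("."), ModelType.VISION, hparams) == "SomeForConditionalGeneration"
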
@@ -5901,16 +5944,15 @@ def main() -> None:
 
     logger.info(f"Loading model: {dir_model.name}")
 
-    hparams = ModelBase.load_hparams(dir_model)
-
     if args.mmproj:
         if "mmproj" not in fname_out.name:
             fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
 
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
-        model_architecture = hparams["architectures"][0]
         model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
+        model_architecture = get_model_architecture(dir_model, model_type)
+        logger.info(f"Model architecture: {model_architecture}")
         try:
             model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
         except NotImplementedError:
