
Commit 4adce86

Merge branch 'master' into xsn/server_mtmd
2 parents: 88461f2 + 32916a4

23 files changed: +1403 additions, -1310 deletions

CMakePresets.json

Lines changed: 0 additions & 13 deletions
@@ -38,15 +38,6 @@
         }
     },
 
-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64", "strategy": "external" },
-        "toolset": { "value": "host=x64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },
-
     {
         "name": "arm64-windows-llvm", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },
@@ -73,10 +64,6 @@
     { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
     { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
 
-    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
-
     { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
     { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
     { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -161,6 +162,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
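
Note the position of the new entry: in the default chain, top-n-sigma runs after the penalty and DRY samplers but before the truncation samplers (top-k, typical-p, top-p, ...). Since a negative `top_n_sigma` means disabled (that is what the `>= 0` check in the deleted sampling.cpp branch below gated on), the default chain behaves as before unless the user opts in.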

common/sampling.cpp

Lines changed: 46 additions & 44 deletions
@@ -229,51 +229,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp     (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry         (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
             }
         }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -475,6 +472,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@@ -490,6 +488,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@@ -504,6 +503,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -517,6 +517,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -552,6 +553,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
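
With the dedicated `if (params.top_n_sigma >= 0)` branch gone, top-n-sigma is an ordinary chain member: it can be reordered and combined with the other samplers via `--samplers`, by its name `top_n_sigma` or its chain character 's', instead of forcing the fixed top-k / temp / top-n-sigma pipeline the deleted code hard-coded. For context, the rule it applies keeps only candidates whose logit lies within n standard deviations of the maximum logit. A minimal sketch of that rule, not the llama.cpp implementation (`top_n_sigma_filter` is a hypothetical helper):

    // Sketch of the top-n-sigma rule over raw logits.
    #include <algorithm>
    #include <cmath>
    #include <vector>

    static void top_n_sigma_filter(std::vector<float> & logits, float n) {
        const float max_logit = *std::max_element(logits.begin(), logits.end());
        // mean and standard deviation of the logits
        double mean = 0.0;
        for (float l : logits) mean += l;
        mean /= logits.size();
        double var = 0.0;
        for (float l : logits) var += (l - mean) * (l - mean);
        const float sigma = (float) std::sqrt(var / logits.size());
        // drop everything more than n standard deviations below the best logit
        for (float & l : logits) {
            if (l < max_logit - n * sigma) l = -INFINITY;
        }
    }

Tokens masked to -INFINITY get zero probability after the softmax, so samplers later in the chain only ever see the surviving candidates.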

convert_hf_to_gguf.py

Lines changed: 20 additions & 0 deletions
@@ -2761,6 +2761,13 @@ def set_gguf_parameters(self):
         if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
             self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
             logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -3908,6 +3915,16 @@ def set_gguf_parameters(self):
         # default values below are taken from HF tranformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims # unused
@@ -3921,6 +3938,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
 
+        if "vision_model.head." in name:
+            return [] # skip redundant tensors for tinygemma3
+
         if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
             or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
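
Two notes on these hunks. The YaRN block (in the Qwen3 converter) only writes rope-scaling metadata when the HF config explicitly opts in with `"type": "yarn"` and a `factor` present, matching the linked guidance that YaRN is off by default. The `proj_scale_factor` arithmetic in the Gemma 3 vision converter is easiest to see with concrete numbers: assuming the standard Gemma 3 SigLIP settings (image_size 896, patch_size 14, image_seq_length 256), n_per_side = 16 and (896 // 14) // 16 = 4, which is the default, so nothing is written; only a downscaled model such as the tinygemma3 test model yields a different factor.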

ggml/include/ggml-backend.h

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
     GGML_API enum ggml_status      ggml_backend_buffer_init_tensor    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t                ggml_backend_buffer_get_alignment  (ggml_backend_buffer_t buffer);
     GGML_API size_t                ggml_backend_buffer_get_max_size   (ggml_backend_buffer_t buffer);
-    GGML_API size_t                ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
     GGML_API void                  ggml_backend_buffer_clear          (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool                  ggml_backend_buffer_is_host        (ggml_backend_buffer_t buffer);
     GGML_API void                  ggml_backend_buffer_set_usage      (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
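
A const-correctness fix: `get_alloc_size` only reads the tensor's metadata to compute a size, so accepting `const struct ggml_tensor *` lets callers holding const tensor pointers query allocation sizes without a cast. The implementations in ggml-backend.cpp below change to match.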

ggml/include/ggml.h

Lines changed: 4 additions & 0 deletions
@@ -673,11 +673,15 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
     GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
+    GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
+
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
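
The new comments spell out the distinction: `ggml_is_contiguous()` guarantees that flat-index iteration visits elements in logical order, while the new `ggml_is_contiguously_allocated()` only guarantees the bytes form one gap-free block, so a permuted view still qualifies. A minimal sketch, assuming a valid `ggml_context * ctx`:

    // t is created dense: both contiguous and contiguously allocated
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    // p swaps the first two axes: same bytes, permuted strides
    struct ggml_tensor * p = ggml_permute(ctx, t, 1, 0, 2, 3);
    // ggml_is_contiguous(t)             -> true
    // ggml_is_contiguous(p)             -> false (memory order != logical order)
    // ggml_is_contiguously_allocated(p) -> true  (still one solid block)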

ggml/src/ggml-backend.cpp

Lines changed: 2 additions & 2 deletions
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
     return SIZE_MAX;
 }
 
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
 }
 
-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 2 additions & 0 deletions
@@ -719,6 +719,7 @@ void launch_fattn(
     size_t nb23 = V->nb[3];
 
     if (need_f16_K && K->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(K));
         K_f16.alloc(ggml_nelements(K));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
         to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -733,6 +734,7 @@ void launch_fattn(
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(V));
         V_f16.alloc(ggml_nelements(V));
        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
        to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
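
These asserts pin down the precondition behind the on-the-fly F16 conversion: `K_f16.alloc(ggml_nelements(K))` followed by a single `to_fp16()` call treats K's backing storage as one dense array of `ggml_nelements(K)` values, which is only a faithful copy when the data occupies one gap-free block. `ggml_is_contiguously_allocated()` is exactly that check: a permuted K or V is still acceptable, while padded or strided storage is not.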
