
Commit 06cb595

Merge branch 'master' into xsn/mtmd_c_api
2 parents: 4d842eb + 8ae5ebc

File tree

12 files changed: +215, -110 lines


convert_hf_to_gguf.py

Lines changed: 7 additions & 1 deletion
@@ -2123,6 +2123,9 @@ def __init__(self, *args, **kwargs):
             # if n_heads_in_group is not None, then
             # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
             # _num_heads[il] is num_attention_head
+            # ***dummy layer*** for nemotron 253B
+            # if n_heads_in_group is None and ffn_mult is None
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
             for il in range(len(_block_configs)):
                 if _block_configs[il]["attention"]["n_heads_in_group"] is None:
                     if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -2134,7 +2137,10 @@ def __init__(self, *args, **kwargs):
                 else:
                     self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
                     self._num_heads.append(self.hparams["num_attention_heads"])
-                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+                if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
+                    _ffn_multipliers.append(0.0)
+                else:
+                    _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
             assert self.block_count == len(self._num_kv_heads)
             assert self.block_count == len(self._num_heads)
             assert self.block_count == len(_ffn_multipliers)

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 115 additions & 73 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/vulkan-shaders/relu.comp

Lines changed: 1 addition & 1 deletion
@@ -17,5 +17,5 @@ void main() {
         return;
     }
 
-    data_d[i] = max(float(data_a[i]), 0);
+    data_d[i] = D_TYPE(max(float(data_a[i]), 0));
 }

ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp

Lines changed: 1 addition & 1 deletion
@@ -16,5 +16,5 @@ void main() {
     if (i >= p.KX) {
        return;
    }
-    data_d[i] = D_TYPE(1. / (1 + exp(-1. *data_a[i])));
+    data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i]))));
 }

ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp

Lines changed: 1 addition & 1 deletion
@@ -16,5 +16,5 @@ void main() {
     if (i >= p.KX) {
        return;
    }
-    data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.));
+    data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.));
 }

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 43 additions & 11 deletions
@@ -485,10 +485,12 @@ void process_shaders() {
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
     string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
     string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
     string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
     string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
     string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
 
     for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
@@ -497,8 +499,26 @@ void process_shaders() {
         string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }
 
-    string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
+    auto get_type_str = [](bool f16) {
+        return f16 ? "float16_t" : "float";
+    };
+    auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) {
+        std::string s;
+        s += std::string(src0_f16 ? "_f16" : "_f32");
+        s += std::string(src1_f16 ? "_f16" : "_f32");
+        s += std::string(dst_f16 ? "_f16" : "_f32");
+        return s;
+    };
+    for (std::string op : {"add", "sub", "mul", "div"}) {
+        for (auto src0_f16 : {false, true}) {
+            for (auto src1_f16 : {false, true}) {
+                for (auto dst_f16 : {false, true}) {
+                    auto name = op + get_suffix(src0_f16, src1_f16, dst_f16);
+                    string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}});
+                }
+            }
+        }
+    }
 
     string_to_spv("sub_f32", "sub.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 
@@ -533,14 +553,21 @@ void process_shaders() {
 
     string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
-    string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("relu_f16", "relu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("tanh_f16", "tanh.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
@@ -641,7 +668,12 @@ void write_output_files() {
             std::remove(path.c_str());
         }
     }
-
+    for (const char *op : {"add", "sub", "mul", "div"}) {
+        fprintf(hdr, "extern unsigned char *%s_data[2][2][2];\n", op);
+        fprintf(hdr, "extern uint64_t %s_len[2][2][2];\n", op);
+        fprintf(src, "unsigned char *%s_data[2][2][2] = {{{%s_f32_f32_f32_data, %s_f32_f32_f16_data}, {%s_f32_f16_f32_data, %s_f32_f16_f16_data}}, {{%s_f16_f32_f32_data, %s_f16_f32_f16_data}, {%s_f16_f16_f32_data, %s_f16_f16_f16_data}}};\n", op, op, op, op, op, op, op, op, op);
+        fprintf(src, "uint64_t %s_len[2][2][2] = {{{%s_f32_f32_f32_len, %s_f32_f32_f16_len}, {%s_f32_f16_f32_len, %s_f32_f16_f16_len}}, {{%s_f16_f32_f32_len, %s_f16_f32_f16_len}, {%s_f16_f16_f32_len, %s_f16_f16_f16_len}}};\n", op, op, op, op, op, op, op, op, op);
+    }
     fclose(hdr);
     fclose(src);
 }
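With the generator changes above, process_shaders() emits all eight f32/f16 combinations of src0, src1 and dst for each of add, sub, mul and div, and write_output_files() additionally emits a [2][2][2] lookup table per op over the generated blobs. A minimal sketch of how such a table could be selected from, assuming the index order implied by the generated initializer ([src0 is f16][src1 is f16][dst is f16], 0 = f32, 1 = f16); pick_add_shader is an illustrative helper, not code from this commit:

#include <cstdint>

// Declarations mirroring the emitted header; definitions come from the
// generated source file at link time.
extern unsigned char *add_data[2][2][2];
extern uint64_t       add_len[2][2][2];

// Hypothetical helper: pick the SPIR-V blob for an add with the given
// element types, indexing as [src0 f16][src1 f16][dst f16].
static inline void pick_add_shader(bool src0_f16, bool src1_f16, bool dst_f16,
                                   const unsigned char ** spv, uint64_t * spv_len) {
    *spv     = add_data[src0_f16][src1_f16][dst_f16];
    *spv_len = add_len [src0_f16][src1_f16][dst_f16];
}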

src/llama-model.cpp

Lines changed: 16 additions & 4 deletions
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -582,6 +583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1848,7 +1850,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }
 
                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1858,9 +1862,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
 
                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -4705,6 +4711,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4780,6 +4787,11 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_head == 0 && n_ff == 0) {
+                continue;
+            }
+
             // For Granite architecture
             if (hparams.f_residual_scale) {
                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
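The llm_build_deci changes read n_ff per layer alongside n_head: n_head == 0 marks the attention-free layers of Llama-3_1-Nemotron-51B, and n_head == 0 together with n_ff == 0 marks the dummy layers of Llama-3_1-Nemotron-Ultra-253B, which are skipped entirely; load_tensors correspondingly only creates the FFN tensors when n_ff > 0. A simplified sketch of that gating (layer_cfg, build_attn_block and build_ffn_block are illustrative names, not the actual llama.cpp graph API):

#include <cstdint>

struct layer_cfg {
    int64_t n_head; // 0 => attention-free layer (Nemotron-51B style)
    int64_t n_ff;   // 0 => FFN-free "dummy" layer (Nemotron-Ultra-253B)
};

void build_layer(const layer_cfg & cfg) {
    if (cfg.n_head > 0) {
        // build_attn_block(...);   // regular self-attention block
    }
    if (cfg.n_head == 0 && cfg.n_ff == 0) {
        return;                     // dummy layer: nothing to add to the graph
    }
    if (cfg.n_ff > 0) {
        // build_ffn_block(...);    // gate/up/down MLP
    }
}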

src/llama-model.h

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,

tools/imatrix/imatrix.cpp

Lines changed: 8 additions & 6 deletions
@@ -46,7 +46,7 @@ class IMatrixCollector {
     common_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
-    std::vector<float> m_src1_data;
+    std::vector<char> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
 };
 
@@ -93,11 +93,13 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
 
     if (!is_host) {
-        m_src1_data.resize(ggml_nelements(src1));
-        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
+        const size_t src1_nbytes = ggml_nbytes(src1);
+        m_src1_data.resize(src1_nbytes);
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
     }
 
-    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
+    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
 
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
     // ref: https://github.com/ggml-org/llama.cpp/pull/6387
@@ -144,7 +146,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
                 const int64_t i11 = idx % src1->ne[1];
                 const int64_t i12 = row;
-                const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+                const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
 
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
                     e.values[e_start + j] += x[j]*x[j];
@@ -180,7 +182,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         ++e.ncall;
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
-            const float * x = data + row * src1->ne[0];
+            const float * x = (const float *) (data + row * src1->nb[1]);
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
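The imatrix change switches the staging buffer to raw bytes and addresses rows through the tensor's byte strides (src1->nb[1], src1->nb[2]) instead of assuming tightly packed floats, with a GGML_ASSERT guarding that elements within a row are contiguous. A small sketch of the stride-based row addressing (tensor_view and row_ptr are illustrative stand-ins, not the ggml API):

#include <cstddef>
#include <cstdint>

// Illustrative stand-in for the ggml_tensor fields used above.
struct tensor_view {
    const char * data; // raw bytes (host pointer or a copied buffer)
    int64_t      ne0;  // elements per row
    size_t       nb1;  // byte stride between rows; may exceed ne0 * sizeof(float)
};

// Locate a row by byte stride, as the patched code does, instead of assuming
// rows are packed back-to-back as floats.
static inline const float * row_ptr(const tensor_view & t, int64_t row) {
    return (const float *)(t.data + row * t.nb1);
}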

tools/llava/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -75,6 +75,8 @@
 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
 #define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
 #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
+#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
+#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
 
 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
