Commit b6f6338

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#   .github/workflows/build-linux-cross.yml
#   .github/workflows/build.yml
#   CODEOWNERS
#   ggml/CMakeLists.txt
#   ggml/src/ggml-cuda/fattn.cu
#   ggml/src/ggml-webgpu/CMakeLists.txt
#   ggml/src/ggml-webgpu/ggml-webgpu.cpp
#   ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
#   tests/test-backend-ops.cpp
#   tests/test-chat-template.cpp
#   tools/llama-bench/llama-bench.cpp
#   tools/rpc/README.md
#   tools/server/README.md

2 parents: 224800b + d2ee056

32 files changed: +1549 additions, −629 deletions

common/arg.cpp

Lines changed: 7 additions & 3 deletions
@@ -2607,6 +2607,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_extra_bufts = true;
         }
     ).set_env("LLAMA_ARG_NO_REPACK"));
+    add_opt(common_arg(
+        {"--no-host"},
+        "bypass host buffer allowing extra buffers to be used",
+        [](common_params & params) {
+            params.no_host = true;
+        }
+    ).set_env("LLAMA_ARG_NO_HOST"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -3875,7 +3882,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
             params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
             params.verbose_prompt = true;
@@ -3889,7 +3895,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
             params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
             params.verbose_prompt = true;
@@ -3903,7 +3908,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
             params.model.hf_file = "gte-small-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
             params.embd_normalize = 2;
             params.n_ctx = 512;
             params.verbose_prompt = true;

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -1141,6 +1141,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host = params.no_host;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -388,6 +388,7 @@ struct common_params {
     bool check_tensors  = false; // validate tensor data
     bool no_op_offload  = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host        = false; // bypass host buffer allowing extra buffers to be used

     bool single_turn = false; // single turn chat conversation

convert_hf_to_gguf.py

Lines changed: 69 additions & 0 deletions
@@ -8841,6 +8841,75 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Lfm2MoeForCausalLM")
+class LFM2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2MOE
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+
+    # cache for experts weights for merging
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        assert not self._experts_cache
+
+
 @ModelBase.register("Lfm2VlForConditionalGeneration")
 class LFM2VLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
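For readers unfamiliar with the layout produced by torch.stack(datas, dim=0) above: the per-expert 2D projection weights are merged into a single contiguous tensor of shape [n_expert, rows, cols] per layer and per projection (w1/w2/w3). A tiny, self-contained C++ sketch of that stacking, for illustration only (names are invented, this is not converter code):

    #include <cstddef>
    #include <vector>

    // Stack n_expert matrices of shape [rows, cols] into one contiguous buffer
    // laid out as [n_expert, rows, cols]; element (e, r, c) ends up at
    // index e*rows*cols + r*cols + c, matching a stack along dim 0.
    std::vector<float> stack_experts(const std::vector<std::vector<float>> & experts,
                                     std::size_t rows, std::size_t cols) {
        std::vector<float> merged;
        merged.reserve(experts.size() * rows * cols);
        for (const auto & w : experts) {   // each w holds rows*cols values
            merged.insert(merged.end(), w.begin(), w.end());
        }
        return merged;
    }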

ggml/src/ggml-cpu/ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -8135,7 +8135,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         }

         // V /= S
-        const float S_inv = 1.0f/S;
+        const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
         ggml_vec_scale_f32(DV, VKQ32, S_inv);

         // dst indices
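The CPU change guards the softmax normalization against a zero denominator: if every contribution to the running sum S is zero (for example, a fully masked row), the old 1.0f/S produced inf/NaN. A standalone toy sketch of the same guard (assumed example, not code from the tree):

    #include <cstdio>

    // Scale a row by 1/S, but fall back to 0 when S == 0 so a degenerate
    // (e.g. fully masked) row yields zeros instead of inf/NaN.
    static void scale_row(float * v, int n, float S) {
        const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
        for (int i = 0; i < n; ++i) {
            v[i] *= S_inv;
        }
    }

    int main() {
        float row[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        scale_row(row, 4, 0.0f); // degenerate case: output stays finite (all zeros)
        printf("%g %g %g %g\n", row[0], row[1], row[2], row[3]);
        return 0;
    }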

ggml/src/ggml-cuda/fattn.cu

Lines changed: 6 additions & 0 deletions
@@ -208,6 +208,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const

     const int cc = ggml_cuda_info().devices[device].cc;

+    // TODO: temporary until support is extended
+    // https://github.com/ggml-org/llama.cpp/pull/16148#issuecomment-3343525206
+    if (K->ne[1] % FATTN_KQ_STRIDE != 0) {
+        return BEST_FATTN_KERNEL_NONE;
+    }
+
 #if defined(GGML_HIP_ROCWMMA_FATTN)
     if (GGML_CUDA_CC_IS_AMD(cc) && ggml_cuda_should_use_wmma_fattn(cc)) { //kcpp: fix for rocwmma
         return BEST_FATTN_KERNEL_WMMA_F16;
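The CUDA change declines every fused FlashAttention kernel whenever the KV length (K->ne[1]) is not a multiple of FATTN_KQ_STRIDE, per the linked TODO. If you want to reason about how much padding such a constraint implies, the arithmetic is just rounding up to the next multiple; a hypothetical helper (not part of this commit):

    #include <cstdint>

    // Round a KV length up to the next multiple of a kernel stride.
    static int64_t pad_to_stride(int64_t n_kv, int64_t stride) {
        return ((n_kv + stride - 1) / stride) * stride;
    }

    // Example arithmetic: with a stride of 256, pad_to_stride(1000, 256) == 1024,
    // i.e. 24 extra KV cells would be needed to satisfy the divisibility check above.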

ggml/src/ggml-metal/ggml-metal-device.cpp

Lines changed: 118 additions & 10 deletions
@@ -338,7 +338,13 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv(ggml_metal_librar
     char base[256];
     char name[256];

-    snprintf(base, 256, "kernel_ssm_conv_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type));
+    const char * suffix = "";
+
+    if (op->src[1]->ne[0] % 4 == 0) {
+        suffix = "_4";
+    }
+
+    snprintf(base, 256, "kernel_ssm_conv_%s_%s%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type), suffix);
     snprintf(name, 256, "%s", base);

     ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
@@ -352,15 +358,15 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv(ggml_metal_librar
 }

 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+
     char base[256];
     char name[256];

-    if (op->src[3]->ne[0] == 1) {
-        snprintf(base, 256, "kernel_ssm_scan_group_%s", ggml_type_name(op->src[0]->type));
-    } else {
-        snprintf(base, 256, "kernel_ssm_scan_%s", ggml_type_name(op->src[0]->type));
-    }
-    snprintf(name, 256, "%s", base);
+    const int nsg = (ne00 + 31)/32;
+
+    snprintf(base, 256, "kernel_ssm_scan_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s_nsg=%d", base, nsg);

     ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
     if (res) {
@@ -369,7 +375,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan(ggml_metal_librar

     res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);

-    ggml_metal_pipeline_set_smem(res, 32*sizeof(float));
+    ggml_metal_pipeline_set_smem(res, 32*sizeof(float)*nsg);

     return res;
 }
@@ -918,13 +924,104 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort(ggml_metal_library
     return res;
 }

+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        bool has_mask,
+        int32_t ncpsg) {
+    assert(op->op == GGML_OP_FLASH_ATTN_EXT);
+    GGML_UNUSED(op);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_%s",
+            "flash_attn_ext_pad");
+
+    snprintf(name, 256, "%s_mask=%d_ncpsg=%d",
+            base,
+            has_mask,
+            ncpsg);
+
+    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
+    if (res) {
+        return res;
+    }
+
+    ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+    ggml_metal_cv_set_bool(cv, has_mask, FC_FLASH_ATTN_EXT_PAD + 0);
+    //ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT_PAD + 1);
+    //ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT_PAD + 2);
+    //ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT_PAD + 3);
+
+    //ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT_PAD + 20);
+    //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_PAD + 21);
+    //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_PAD + 22);
+    //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_PAD + 23);
+    //ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_PAD + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_PAD + 25);
+
+    res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+    ggml_metal_cv_free(cv);
+
+    return res;
+}
+
+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+        ggml_metal_library_t lib,
+        const struct ggml_tensor * op,
+        int32_t nqptg,
+        int32_t ncpsg) {
+    assert(op->op == GGML_OP_FLASH_ATTN_EXT);
+    GGML_UNUSED(op);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_%s",
+            "flash_attn_ext_blk");
+
+    snprintf(name, 256, "%s_nqptg=%d_ncpsg=%d",
+            base,
+            nqptg,
+            ncpsg);
+
+    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
+    if (res) {
+        return res;
+    }
+
+    ggml_metal_cv_t cv = ggml_metal_cv_init();
+
+    //ggml_metal_cv_set_bool(cv, has_mask, FC_FLASH_ATTN_EXT_BLK + 0);
+    //ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT_BLK + 1);
+    //ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT_BLK + 2);
+    //ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT_BLK + 3);
+
+    //ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT_BLK + 20);
+    //ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_BLK + 21);
+    //ggml_metal_cv_set_int32(cv, nsg, FC_FLASH_ATTN_EXT_BLK + 22);
+    //ggml_metal_cv_set_int32(cv, nwg, FC_FLASH_ATTN_EXT_BLK + 23);
+    ggml_metal_cv_set_int32(cv, nqptg, FC_FLASH_ATTN_EXT_BLK + 24);
+    ggml_metal_cv_set_int32(cv, ncpsg, FC_FLASH_ATTN_EXT_BLK + 25);
+
+    res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
+
+    ggml_metal_cv_free(cv);
+
+    return res;
+}
+
 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
         ggml_metal_library_t lib,
         const ggml_tensor * op,
         bool has_mask,
         bool has_sinks,
         bool has_bias,
         bool has_scap,
+        bool has_kvpad,
         int32_t nsg) {
     assert(op->op == GGML_OP_FLASH_ATTN_EXT);

@@ -937,18 +1034,23 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
     const int32_t ns10 = op->src[1]->nb[1]/op->src[1]->nb[0];
     const int32_t ns20 = op->src[2]->nb[1]/op->src[2]->nb[0];

+    // do bounds checks for the mask?
+    const bool bc_mask = op->src[3] && (op->src[3]->ne[1] % 8 != 0);
+
     snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
             "flash_attn_ext",
             ggml_type_name(op->src[1]->type),
             dk,
             dv);

-    snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_ns10=%d_ns20=%d_nsg=%d",
+    snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_kvpad=%d_bcm=%d_ns10=%d_ns20=%d_nsg=%d",
             base,
             has_mask,
             has_sinks,
             has_bias,
             has_scap,
+            has_kvpad,
+            bc_mask,
             ns10,
             ns20,
             nsg);
@@ -964,6 +1066,9 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
     ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT + 1);
     ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT + 2);
     ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT + 3);
+    ggml_metal_cv_set_bool(cv, has_kvpad, FC_FLASH_ATTN_EXT + 4);
+
+    ggml_metal_cv_set_bool(cv, bc_mask, FC_FLASH_ATTN_EXT + 10);

     ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT + 20);
     ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT + 21);
@@ -983,6 +1088,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
         bool has_sinks,
         bool has_bias,
         bool has_scap,
+        bool has_kvpad,
         int32_t nsg,
         int32_t nwg) {
     assert(op->op == GGML_OP_FLASH_ATTN_EXT);
@@ -1002,12 +1108,13 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
             dk,
             dv);

-    snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_softcap=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
+    snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_scap=%d_kvpad=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
             base,
             has_mask,
             has_sinks,
             has_bias,
             has_scap,
+            has_kvpad,
             ns10,
             ns20,
             nsg, nwg);
@@ -1023,6 +1130,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
     ggml_metal_cv_set_bool(cv, has_sinks, FC_FLASH_ATTN_EXT_VEC + 1);
     ggml_metal_cv_set_bool(cv, has_bias, FC_FLASH_ATTN_EXT_VEC + 2);
     ggml_metal_cv_set_bool(cv, has_scap, FC_FLASH_ATTN_EXT_VEC + 3);
+    ggml_metal_cv_set_bool(cv, has_kvpad, FC_FLASH_ATTN_EXT_VEC + 4);

     ggml_metal_cv_set_int32(cv, ns10, FC_FLASH_ATTN_EXT_VEC + 20);
     ggml_metal_cv_set_int32(cv, ns20, FC_FLASH_ATTN_EXT_VEC + 21);
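Throughout these Metal changes, specialized pipelines are cached under names that encode their function-constant parameters (mask/sinks/kvpad flags, ns10, nsg, and so on), so each unique combination is compiled once and reused. A small sketch of that naming scheme, using an assumed example value (ne00 = 128) for the SSM scan case; illustrative only, not library code:

    #include <cstdio>

    int main() {
        // Assumed example: first dimension of src[0] for an f32 SSM scan.
        const int ne00 = 128;
        const int nsg  = (ne00 + 31)/32; // same rounding as in the diff above

        char base[256];
        char name[256];
        snprintf(base, sizeof(base), "kernel_ssm_scan_%s", "f32");
        snprintf(name, sizeof(name), "%s_nsg=%d", base, nsg);

        printf("%s\n", name); // -> kernel_ssm_scan_f32_nsg=4
        return 0;
    }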
