Skip to content

Commit 20c802a

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # CODEOWNERS # ggml/CMakeLists.txt # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cpu/kleidiai/kleidiai.cpp # scripts/sync-ggml.last # tests/test-backend-ops.cpp
2 parents b3a0ba5 + 2df5bcf commit 20c802a

File tree

11 files changed

+455
-241
lines changed

11 files changed

+455
-241
lines changed

common/arg.cpp

Lines changed: 67 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,53 @@ struct common_hf_file_res {
219219
std::string mmprojFile;
220220
};
221221

222+
static void write_etag(const std::string & path, const std::string & etag) {
223+
const std::string etag_path = path + ".etag";
224+
write_file(etag_path, etag);
225+
LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
226+
}
227+
228+
static std::string read_etag(const std::string & path) {
229+
std::string none;
230+
const std::string etag_path = path + ".etag";
231+
232+
if (std::filesystem::exists(etag_path)) {
233+
std::ifstream etag_in(etag_path);
234+
if (!etag_in) {
235+
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
236+
return none;
237+
}
238+
std::string etag;
239+
std::getline(etag_in, etag);
240+
return etag;
241+
}
242+
243+
// no etag file, but maybe there is an old .json
244+
// remove this code later
245+
const std::string metadata_path = path + ".json";
246+
247+
if (std::filesystem::exists(metadata_path)) {
248+
std::ifstream metadata_in(metadata_path);
249+
try {
250+
nlohmann::json metadata_json;
251+
metadata_in >> metadata_json;
252+
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
253+
metadata_json.dump().c_str());
254+
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
255+
std::string etag = metadata_json.at("etag");
256+
write_etag(path, etag);
257+
if (!std::filesystem::remove(metadata_path)) {
258+
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
259+
}
260+
return etag;
261+
}
262+
} catch (const nlohmann::json::exception & e) {
263+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
264+
}
265+
}
266+
return none;
267+
}
268+
222269
#ifdef LLAMA_USE_CURL
223270

224271
bool common_has_curl() {
@@ -375,36 +422,15 @@ static bool common_download_head(CURL * curl,
375422
static bool common_download_file_single_online(const std::string & url,
376423
const std::string & path,
377424
const std::string & bearer_token) {
378-
// If the file exists, check its JSON metadata companion file.
379-
std::string metadata_path = path + ".json";
380425
static const int max_attempts = 3;
381426
static const int retry_delay_seconds = 2;
382427
for (int i = 0; i < max_attempts; ++i) {
383-
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
384-
std::string etag;
385-
std::string last_modified;
428+
std::string etag;
386429

387430
// Check if the file already exists locally
388431
const auto file_exists = std::filesystem::exists(path);
389432
if (file_exists) {
390-
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
391-
std::ifstream metadata_in(metadata_path);
392-
if (metadata_in.good()) {
393-
try {
394-
metadata_in >> metadata;
395-
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
396-
metadata.dump().c_str());
397-
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
398-
etag = metadata.at("etag");
399-
}
400-
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
401-
last_modified = metadata.at("lastModified");
402-
}
403-
} catch (const nlohmann::json::exception & e) {
404-
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
405-
}
406-
}
407-
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
433+
etag = read_etag(path);
408434
} else {
409435
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
410436
}
@@ -442,11 +468,6 @@ static bool common_download_file_single_online(const std::string & url,
442468
headers.etag.c_str());
443469
should_download = true;
444470
should_download_from_scratch = true;
445-
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
446-
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
447-
last_modified.c_str(), headers.last_modified.c_str());
448-
should_download = true;
449-
should_download_from_scratch = true;
450471
}
451472
}
452473

@@ -477,15 +498,9 @@ static bool common_download_file_single_online(const std::string & url,
477498
}
478499
}
479500
}
480-
481-
// Write the updated JSON metadata file.
482-
metadata.update({
483-
{ "url", url },
484-
{ "etag", headers.etag },
485-
{ "lastModified", headers.last_modified }
486-
});
487-
write_file(metadata_path, metadata.dump(4));
488-
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
501+
if (head_request_ok) {
502+
write_etag(path, headers.etag);
503+
}
489504

490505
// start the download
491506
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
@@ -668,51 +683,6 @@ static void print_progress(size_t current, size_t total) { // TODO isatty
668683
std::cout.flush();
669684
}
670685

671-
struct common_file_metadata {
672-
std::string etag;
673-
std::string last_modified;
674-
};
675-
676-
static std::optional<common_file_metadata> read_metadata(const std::string & path) {
677-
if (!std::filesystem::exists(path)) {
678-
return std::nullopt;
679-
}
680-
681-
nlohmann::json metadata_json;
682-
common_file_metadata metadata;
683-
684-
std::ifstream metadata_in(path);
685-
try {
686-
metadata_in >> metadata_json;
687-
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, path.c_str(),
688-
metadata_json.dump().c_str());
689-
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
690-
metadata.etag = metadata_json.at("etag");
691-
}
692-
if (metadata_json.contains("lastModified") && metadata_json.at("lastModified").is_string()) {
693-
metadata.last_modified = metadata_json.at("lastModified");
694-
}
695-
} catch (const nlohmann::json::exception & e) {
696-
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, path.c_str(), e.what());
697-
return std::nullopt;
698-
}
699-
700-
return metadata;
701-
}
702-
703-
static void write_metadata(const std::string & path,
704-
const std::string & url,
705-
const common_file_metadata & metadata) {
706-
nlohmann::json metadata_json = {
707-
{ "url", url },
708-
{ "etag", metadata.etag },
709-
{ "lastModified", metadata.last_modified }
710-
};
711-
712-
write_file(path, metadata_json.dump(4));
713-
LOG_DBG("%s: file metadata saved: %s\n", __func__, path.c_str());
714-
}
715-
716686
static bool common_pull_file(httplib::Client & cli,
717687
const std::string & resolve_path,
718688
const std::string & path_tmp,
@@ -779,8 +749,6 @@ static bool common_pull_file(httplib::Client & cli,
779749
static bool common_download_file_single_online(const std::string & url,
780750
const std::string & path,
781751
const std::string & bearer_token) {
782-
// If the file exists, check its JSON metadata companion file.
783-
std::string metadata_path = path + ".json";
784752
static const int max_attempts = 3;
785753
static const int retry_delay_seconds = 2;
786754

@@ -792,12 +760,11 @@ static bool common_download_file_single_online(const std::string & url,
792760
}
793761
cli.set_default_headers(default_headers);
794762

795-
common_file_metadata last;
796763
const bool file_exists = std::filesystem::exists(path);
764+
765+
std::string last_etag;
797766
if (file_exists) {
798-
if (auto opt = read_metadata(metadata_path)) {
799-
last = *opt;
800-
}
767+
last_etag = read_etag(path);
801768
} else {
802769
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
803770
}
@@ -813,14 +780,9 @@ static bool common_download_file_single_online(const std::string & url,
813780
}
814781
}
815782

816-
common_file_metadata current;
817-
if (head_ok) {
818-
if (head->has_header("ETag")) {
819-
current.etag = head->get_header_value("ETag");
820-
}
821-
if (head->has_header("Last-Modified")) {
822-
current.last_modified = head->get_header_value("Last-Modified");
823-
}
783+
std::string etag;
784+
if (head_ok && head->has_header("ETag")) {
785+
etag = head->get_header_value("ETag");
824786
}
825787

826788
size_t total_size = 0;
@@ -838,16 +800,10 @@ static bool common_download_file_single_online(const std::string & url,
838800
}
839801

840802
bool should_download_from_scratch = false;
841-
if (head_ok) {
842-
if (!last.etag.empty() && last.etag != current.etag) {
843-
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
844-
last.etag.c_str(), current.etag.c_str());
845-
should_download_from_scratch = true;
846-
} else if (!last.last_modified.empty() && last.last_modified != current.last_modified) {
847-
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
848-
last.last_modified.c_str(), current.last_modified.c_str());
849-
should_download_from_scratch = true;
850-
}
803+
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
804+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
805+
last_etag.c_str(), etag.c_str());
806+
should_download_from_scratch = true;
851807
}
852808

853809
if (file_exists) {
@@ -875,9 +831,8 @@ static bool common_download_file_single_online(const std::string & url,
875831
}
876832

877833
// start the download
878-
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
879-
__func__, show_masked_url(parts).c_str(), path_temporary.c_str(),
880-
current.etag.c_str(), current.last_modified.c_str());
834+
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
835+
__func__, show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
881836
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
882837
if (!was_pull_successful) {
883838
if (i + 1 < max_attempts) {
@@ -887,15 +842,16 @@ static bool common_download_file_single_online(const std::string & url,
887842
} else {
888843
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
889844
}
890-
891845
continue;
892846
}
893847

894848
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
895849
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
896850
return false;
897851
}
898-
write_metadata(metadata_path, url, current);
852+
if (!etag.empty()) {
853+
write_etag(path, etag);
854+
}
899855
break;
900856
}
901857

ggml/src/ggml-cuda/cpy.cu

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
329329
} else
330330
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
331331
{
332-
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
332+
if (src0->type == GGML_TYPE_F32) {
333+
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
334+
} else {
335+
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
336+
}
333337
}
334338
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
335339
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
400404

401405
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
402406
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
403-
return nullptr;
407+
// Prioritize CUDA graph compatibility over direct memory copy optimization.
408+
// Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
409+
if (src0->type == GGML_TYPE_F32) {
410+
return (void*) cpy_flt<cpy_1_flt<float, float>>;
411+
} else {
412+
return nullptr;
413+
}
404414
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
405415
return (void*) cpy_flt<cpy_1_flt<float, float>>;
406416
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2654,6 +2654,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
26542654
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
26552655
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
26562656
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
2657+
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
2658+
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
26572659

26582660
for (int i = 0; i < cgraph->n_nodes; i++) {
26592661
ggml_tensor * node = cgraph->nodes[i];
@@ -2682,7 +2684,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
26822684
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
26832685
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
26842686
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
2685-
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
2687+
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
2688+
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
2689+
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
26862690
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
26872691
// by means of matching node names. See
26882692
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and

ggml/src/ggml-metal/ggml-metal-device.cpp

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -495,22 +495,17 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv(ggml_metal_library_
495495
case GGML_TYPE_F16:
496496
case GGML_TYPE_BF16:
497497
{
498-
if (ne00 == 4) {
498+
if (ne00 < 32) {
499499
nsg = 1;
500500
nr0 = 32;
501-
nr1 = 4;
502-
suffix = "_c4";
503-
} else if (ne00 % 4 == 0) {
504-
nsg = N_SG_F;
505-
nr0 = N_R0_F;
506501
nr1 = 1;
507-
smem = 32*sizeof(float)*N_R0_F;
508-
suffix = "_4";
502+
suffix = "_short";
509503
} else {
510-
nsg = N_SG_F;
511-
nr0 = N_R0_F;
504+
nsg = std::min(4, (ne00 + 127) / 128);
505+
nr0 = 2;
512506
nr1 = 1;
513-
smem = 32*sizeof(float)*N_R0_F;
507+
smem = 32*sizeof(float)*nr0;
508+
suffix = ne00 % 4 == 0 ? "_4" : "";
514509
}
515510
} break;
516511
case GGML_TYPE_Q4_0:
@@ -727,18 +722,11 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id(ggml_metal_libra
727722
case GGML_TYPE_F16:
728723
case GGML_TYPE_BF16:
729724
{
730-
if (ne00 % 4 == 0) {
731-
nsg = N_SG_F;
732-
nr0 = N_R0_F;
733-
nr1 = 1;
734-
smem = 32*sizeof(float)*N_R0_F;
735-
suffix = "_4";
736-
} else {
737-
nsg = N_SG_F;
738-
nr0 = N_R0_F;
739-
nr1 = 1;
740-
smem = 32*sizeof(float)*N_R0_F;
741-
}
725+
nsg = std::min(4, (ne00 + 127) / 128);
726+
nr0 = 2;
727+
nr1 = 1;
728+
smem = 32*sizeof(float)*nr0;
729+
suffix = ne00 % 4 == 0 ? "_4" : "";
742730
} break;
743731
case GGML_TYPE_Q4_0:
744732
{

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@
88
//
99
// TODO: for optimal performance, become function of the device and work size
1010

11-
#define N_R0_F 2
12-
#define N_SG_F 4
13-
1411
#define N_R0_Q4_0 4
1512
#define N_SG_Q4_0 2
1613

@@ -352,6 +349,7 @@ typedef struct {
352349
uint64_t nb13;
353350
int32_t ne0;
354351
int32_t ne1;
352+
int32_t nr0;
355353
int16_t r2;
356354
int16_t r3;
357355
} ggml_metal_kargs_mul_mv;
@@ -427,6 +425,7 @@ typedef struct {
427425
int32_t ne0;
428426
int32_t ne1;
429427
uint64_t nb1;
428+
int32_t nr0;
430429
} ggml_metal_kargs_mul_mv_id;
431430

432431
// NORM

0 commit comments

Comments
 (0)