
Commit 3c1285b

Merge pull request #360 from janhq/update-dev-from-master-2025-12-13-00-35
Sync master with upstream release b7371
2 parents: 4289b5c + 7bed317


44 files changed: +664 −322 lines

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1856,15 +1856,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
-        {"--mmproj"}, "FILE",
+        {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"
         "note: if -hf is used, this argument can be omitted",
         [](common_params & params, const std::string & value) {
             params.mmproj.path = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
     add_opt(common_arg(
-        {"--mmproj-url"}, "URL",
+        {"-mmu", "--mmproj-url"}, "URL",
         "URL to a multimodal projector file. see tools/mtmd/README.md",
         [](common_params & params, const std::string & value) {
             params.mmproj.url = value;
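
The short aliases resolve to the same common_arg entries as the long forms, so behavior is unchanged; for example (hypothetical file names), `llama-mtmd-cli -m model.gguf -mm mmproj.gguf` is now equivalent to passing `--mmproj mmproj.gguf`, and `-mmu <URL>` matches `--mmproj-url <URL>`.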

common/download.cpp

Lines changed: 69 additions & 25 deletions
@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
-    return _isatty(_fileno(stdout));
+        return _isatty(_fileno(stdout));
 #else
-    return isatty(1);
+        return isatty(1);
 #endif
-}
+    }

-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
+  public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
     }

-    if (!total) {
-        return;
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }
+
+        if (!total) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++;
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A";
+        }
+        std::cout << "\033[2K\r["
+                  << std::string(pos, '=')
+                  << (pos < width ? ">" : "")
+                  << std::string(width - pos, ' ')
+                  << "] " << std::setw(3) << pct << "% ("
+                  << current / (1024 * 1024) << " MB / "
+                  << total / (1024 * 1024) << " MB) "
+                  << "\033[u";
+
+        std::cout.flush();
+
+        if (current == total) {
+            cleanup(this);
+        }
     }

-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
-
-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "% ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};

 static bool common_pull_file(httplib::Client & cli,
                              const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
     const char * func = __func__; // avoid __func__ inside a lambda
     size_t downloaded = existing_size;
    size_t progress_step = 0;
+    ProgressBar bar;

     auto res = cli.Get(resolve_path, headers,
         [&](const httplib::Response &response) {
@@ -554,16 +600,14 @@ static bool common_pull_file(httplib::Client & cli,
             progress_step += len;

             if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                 progress_step = 0;
             }
             return true;
         },
         nullptr
     );

-    std::cout << "\n";
-
     if (!res) {
         LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
         return false;
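
For context, a minimal, self-contained sketch of the cursor-addressing technique the new ProgressBar relies on (illustrative names, not the patch code): each bar reserves one terminal line; every update saves the cursor, jumps up to the bar's line, clears and redraws it, then restores the cursor. This is what lets several concurrent downloads draw independent bars.

// illustrative sketch of the ANSI escape technique used above
#include <cstdio>
#include <map>
#include <mutex>

static std::mutex         g_mutex;        // serializes terminal writes across threads
static std::map<int, int> g_lines;        // bar id -> assigned terminal line
static int                g_max_line = 0;

void draw_bar(int id, std::size_t current, std::size_t total) {
    std::lock_guard<std::mutex> lock(g_mutex);
    if (g_lines.find(id) == g_lines.end()) {
        g_lines[id] = g_max_line++;
        std::printf("\n");                 // reserve a fresh line for this bar
    }
    const int up = g_max_line - g_lines[id]; // lines between cursor and this bar
    std::printf("\033[s");                 // save cursor position
    if (up > 0) {
        std::printf("\033[%dA", up);       // move up to the bar's line
    }
    std::printf("\033[2K\rbar %d: %3zu%%", id, (100 * current) / total);
    std::printf("\033[u");                 // restore cursor position
    std::fflush(stdout);
}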

convert_hf_to_gguf.py

Lines changed: 8 additions & 0 deletions
@@ -7286,6 +7286,10 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])

     _experts: list[dict[str, Tensor]] | None = None
@@ -10041,6 +10045,10 @@ def set_gguf_parameters(self):
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
         yarn_params = self.hparams["yarn"]
         self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+
+        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+        # ref https://github.com/ggml-org/llama.cpp/pull/17945
         self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
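
Context for the tagged comments (my reading, not part of the diff): the stored value is the coefficient c of the YARN attention-magnitude correction, which llama.cpp applies at runtime roughly as mscale = 1 + c * ln(1 / freq_scale). The DeepSeek-2 path derives c = 0.1 * mscale_all_dim from the checkpoint, while the Mistral path above hardcodes c = 0.1, i.e. it assumes mscale_all_dim = 1 as its inline comment notes.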

examples/model-conversion/scripts/causal/compare-logits.py

Lines changed: 0 additions & 4 deletions
@@ -32,10 +32,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
     print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
     print(f"Max absolute difference: {max_diff:.4f}")

-    if max_diff > 1.0:
-        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
-        return False
-
     return True

 def main():

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme     (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx     (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe     (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
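
A usage sketch for the new accessor (standalone example, not from the patch): since the function returns 0 on builds without RVV support, the return value doubles as a feature probe.

#include <cstdio>
#include "ggml-cpu.h"

int main() {
    const int vlen = ggml_cpu_get_rvv_vlen(); // RVV register width in bytes; 0 when unavailable
    if (vlen > 0) {
        std::printf("RVV VLEN: %d bytes (%d bits)\n", vlen, vlen * 8);
    } else {
        std::printf("RVV vector length not available on this build\n");
    }
    return 0;
}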

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 0 deletions
@@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
+            return true;
         case GGML_OP_PAD:
             // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
             return ggml_get_op_params_i32(op, 8) == 0;
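
Note: without the added `return true;`, the `GGML_OP_GROUP_NORM` label fell through into the `GGML_OP_PAD` branch, so group-norm support was decided by the pad-specific `ggml_get_op_params_i32(op, 8) == 0` check.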

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 26 additions & 0 deletions
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif

 #if defined(_WIN32)

@@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
     ggml_init_arm_arch_features();
 #endif

+#if defined(__riscv)
+    ggml_init_riscv_arch_features();
+#endif
+
     is_first_call = false;
 }
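
The probe itself is a single intrinsic: `__riscv_vlenb()` from `riscv_vector.h` reads the `vlenb` CSR, i.e. VLEN expressed in bytes. A standalone sketch with the same guards (build with RVV enabled, e.g. -march=rv64gcv):

#include <cstdio>

#if defined(__riscv) && defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif

int main() {
#if defined(__riscv) && defined(__riscv_v_intrinsic)
    const int vlenb = (int) __riscv_vlenb(); // vector register width in bytes
    std::printf("VLEN = %d bits\n", vlenb * 8);
#else
    std::printf("built without RVV intrinsics\n");
#endif
    return 0;
}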

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 4 additions & 0 deletions
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }
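
To observe the new entry from application code, the feature list can, as far as I can tell, be enumerated through ggml's proc-address mechanism; a hedged sketch, assuming the `ggml_backend_get_features` proc name and the NULL-name-terminated array declared in `ggml-backend.h`:

#include <cstdio>
#include "ggml-backend.h"

int main() {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
    auto get_features = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (get_features) {
        // the list is terminated by an entry whose name is NULL
        for (ggml_backend_feature * f = get_features(reg); f->name; f++) {
            std::printf("%s = %s\n", f->name, f->value); // e.g. "RVV_VLEN = 32"
        }
    }
    return 0;
}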

ggml/src/ggml-cpu/repack.cpp

Lines changed: 2 additions & 1 deletion
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
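
For scale: `ggml_cpu_get_rvv_vlen()` reports bytes and QK4_0 is 32, so the new clause opts into the 8x8 repack only on RVV implementations with vectors of at least 32 bytes (256 bits), mirroring the existing `ggml_cpu_get_sve_cnt() == QK8_0` condition for 256-bit SVE.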

ggml/src/ggml-cuda/common.cuh

Lines changed: 14 additions & 11 deletions
@@ -67,19 +67,22 @@
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA3_5    (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
 #define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

-#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
-#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
-#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
-#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
-#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
-#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_AMD(cc)     (cc >= GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_RDNA(cc)    (cc >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc)   (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc)   (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
+#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA3(cc)   (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
+#define GGML_CUDA_CC_IS_RDNA4(cc)   (cc >= GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_GCN(cc)     (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)    (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA1(cc)   (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
+#define GGML_CUDA_CC_IS_CDNA2(cc)   (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
+#define GGML_CUDA_CC_IS_CDNA3(cc)   (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define MUSART_HMASK  40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
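
A small illustration (hypothetical cc values, assuming the macros above are in scope) of how the split classifies devices while keeping the umbrella RDNA3 check true for both generations:

#include <cassert>

int main() {
    const int cc_rdna30 = GGML_CUDA_CC_OFFSET_AMD + 0x1100; // e.g. an RX 7000 part
    const int cc_rdna35 = GGML_CUDA_CC_OFFSET_AMD + 0x1150; // e.g. a Ryzen AI laptop iGPU

    assert( GGML_CUDA_CC_IS_RDNA3_0(cc_rdna30) && !GGML_CUDA_CC_IS_RDNA3_5(cc_rdna30));
    assert(!GGML_CUDA_CC_IS_RDNA3_0(cc_rdna35) &&  GGML_CUDA_CC_IS_RDNA3_5(cc_rdna35));
    assert( GGML_CUDA_CC_IS_RDNA3(cc_rdna30)   &&  GGML_CUDA_CC_IS_RDNA3(cc_rdna35));
    return 0;
}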
