
Commit 7255be3

Merge branch 'master' into esocrok
2 parents: 5910f24 + c5023da (commit 7255be3)

Note: large commits have some content hidden by default, so not all changed files are shown below.

54 files changed: +2019, -174 lines changed

.devops/s390x.Dockerfile

Lines changed: 4 additions & 1 deletion
@@ -24,8 +24,9 @@ RUN --mount=type=cache,target=/root/.ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_BACKEND_DL=OFF \
         -DGGML_NATIVE=OFF \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
         -DGGML_BLAS=ON \
         -DGGML_BLAS_VENDOR=OpenBLAS && \
     cmake --build build --config Release -j $(nproc) && \
@@ -103,6 +104,7 @@ FROM base AS light
 WORKDIR /llama.cpp/bin
 
 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
 
 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
@@ -116,6 +118,7 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 WORKDIR /llama.cpp/bin
 
 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
 
 EXPOSE 8080

common/arg.cpp

Lines changed: 14 additions & 0 deletions
@@ -2791,6 +2791,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    add_opt(common_arg(
+        {"--image-min-tokens"}, "N",
+        "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
+        [](common_params & params, int value) {
+            params.image_min_tokens = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
+    add_opt(common_arg(
+        {"--image-max-tokens"}, "N",
+        "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
+        [](common_params & params, int value) {
+            params.image_max_tokens = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

common/chat.cpp

Lines changed: 17 additions & 2 deletions
@@ -313,7 +313,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
-            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;
@@ -1810,7 +1809,23 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
+
+    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["thinking"] = msg.at("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+
+    auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
 
     // Check if we need to replace the return token with end token during
     // inference and without generation prompt. For more details see:
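
Taken together, the two hunks move the gpt-oss "thinking" handling out of the generic OpenAI-compat serializer and into the gpt-oss prompt builder, where reasoning_content is copied into "thinking" only for messages that also carry tool calls. A small standalone sketch of that adjustment, assuming nlohmann::json (the json type common/chat.cpp already uses); the message payload below is invented for illustration:

#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // Hypothetical assistant turn carrying both reasoning and a tool call.
    json messages = json::array({
        {
            {"role", "assistant"},
            {"reasoning_content", "Need the forecast, so call the weather tool."},
            {"tool_calls", json::array({
                { {"type", "function"}, {"function", { {"name", "get_weather"}, {"arguments", "{}"} }} }
            })}
        }
    });

    // Mirror of the adjustment loop in common_chat_params_init_gpt_oss above.
    json adjusted_messages = json::array();
    for (const auto & msg : messages) {
        bool has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
        bool has_tool_calls        = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
        if (has_reasoning_content && has_tool_calls) {
            json adjusted_message = msg;
            adjusted_message["thinking"] = msg.at("reasoning_content");
            adjusted_messages.push_back(adjusted_message);
        } else {
            adjusted_messages.push_back(msg);
        }
    }

    // The adjusted copy now carries a "thinking" field for the template to consume.
    std::cout << adjusted_messages.dump(2) << std::endl;
    return 0;
}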

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -402,6 +402,8 @@ struct common_params {
     bool mmproj_use_gpu = true; // use GPU for multimodal model
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    int image_min_tokens = -1;
+    int image_max_tokens = -1;
 
     // finetune
     struct lr_opt lr;

convert_hf_to_gguf.py

Lines changed: 107 additions & 0 deletions
@@ -9802,6 +9802,113 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision, aligner, and generation tensors
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        if "intermediate_size" not in self.hparams_vision:
+            mlp_ratio = self.hparams_vision.get("mlp_ratio")
+            hidden_size = self.hparams_vision.get("hidden_size")
+            if mlp_ratio is not None and hidden_size is not None:
+                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+        suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+        if name.startswith("model.aligner."):
+            local_name = name[len("model.aligner."):]
+        elif name.startswith("aligner."):
+            local_name = name[len("aligner."):]
+        else:
+            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+        if local_name.startswith("fc1."):
+            mm_index = 0
+        elif local_name.startswith("hidden_layers."):
+            parts = local_name.split(".", 2)
+            if len(parts) < 3:
+                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+            mm_index = int(parts[1]) + 1
+        else:
+            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+        return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
+
 ###### CONVERSION LOGIC ######

ggml/src/ggml-cpu/arch/loongarch/quants.c

Lines changed: 4 additions & 5 deletions
@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib + 1 < nb; ib += 2) {
 
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};
 
         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
 
@@ -714,11 +715,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         bx_1 = __lsx_vsub_b(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
 
-        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};
 
         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
 
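
The scale fix above replaces a broadcast through __lsx_vreplgr2vr_w, which replicates an integer register word, with a v4f32 vector literal, so the FP16-derived float scale reaches all four lanes as a float instead of going through an implicit float-to-int conversion first. A minimal, architecture-neutral sketch of that conversion difference (the scale value is invented):

#include <cstdio>

int main() {
    // Stand-in for GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d).
    float scale = 0.0123f;
    int   as_int = scale;                            // implicit float -> int conversion truncates to 0
    float lanes[4] = { scale, scale, scale, scale }; // what the (v4f32){ft0, ft0, ft0, ft0} literal builds

    std::printf("as_int = %d, lanes[0] = %f\n", as_int, lanes[0]);
    return 0;
}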

New file: Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features {
+    bool has_vxe2 = false;
+    bool has_nnpa = false;
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() {
+    int score = 1;
+    s390x_features sf;
+
+// IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; }
+    score += 1 << 1;
+#endif
+
+// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; }
+    score += 1 << 2;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif // __s390x__

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 3 additions & 1 deletion
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
 
 #endif
 
-#if defined(__loongarch_asx)
+#if defined(__loongarch_sx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {
     v4f32 res = {val, val, val, val};
     return (__m128)res;
 }
+#endif
 
+#if defined(__loongarch_asx)
 static __m256 __lasx_xvreplfr2vr_s(const float val) {
     v8f32 res = {val, val, val, val, val, val, val, val};
     return (__m256)res;

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 25 additions & 25 deletions
@@ -956,7 +956,7 @@ do { \
 
 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 
 #define GGML_F32x4 __m128
 #define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD __lsx_vfadd_s
 #define GGML_F32x4_MUL __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
-    int offset = GGML_F32_ARR >> 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+    res = (ggml_float) ((v4f32)t5)[0]; \
 }
 
 #define GGML_F32_VEC GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
 #define GGML_F32Cx4 __m128
 #define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA GGML_F32x4_FMA
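
For readers who do not want to trace the vpickev/vpickod shuffles, the rewritten GGML_F32x4_REDUCE first folds the GGML_F32_ARR accumulator vectors pairwise into x[0] and then horizontally sums x[0]'s four lanes into a scalar. A scalar sketch of that reduction (N stands in for GGML_F32_ARR and the lane values are invented; the real macro operates on __m128 vectors):

#include <cstdio>

int main() {
    const int N = 8;                   // stand-in for GGML_F32_ARR
    float x[N][4];
    for (int i = 0; i < N; ++i) {
        for (int l = 0; l < 4; ++l) {
            x[i][l] = 0.25f * (i + 1); // invented partial sums
        }
    }

    // Pairwise folds into x[0], mirroring the offset-halving loops in the macro.
    for (int offset = N >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            for (int l = 0; l < 4; ++l) {
                x[i][l] += x[offset + i][l];
            }
        }
    }

    // Horizontal add of x[0]'s four lanes, done in the macro with
    // vpickev/vpickod shuffles and two vector adds.
    float res = x[0][0] + x[0][1] + x[0][2] + x[0][3];
    std::printf("res = %f\n", res);
    return 0;
}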

ggml/src/ggml-cuda/fattn-tile.cu

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
             GGML_ASSERT(V->ne[0] == K->ne[0]);
             ggml_cuda_flash_attn_ext_tile_case< 64, 64>(ctx, dst);
         } break;
+        case 72: {
+            GGML_ASSERT(V->ne[0] == K->ne[0]);
+            ggml_cuda_flash_attn_ext_tile_case< 72, 72>(ctx, dst);
+        } break;
         case 80: {
             GGML_ASSERT(V->ne[0] == K->ne[0]);
             ggml_cuda_flash_attn_ext_tile_case< 80, 80>(ctx, dst);
