Skip to content

Commit 0c0e6f8

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 8ede647 + 5ba36f6 commit 0c0e6f8

File tree

16 files changed

+706
-50
lines changed

16 files changed

+706
-50
lines changed

.devops/cpu.Dockerfile

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,15 @@ FROM ubuntu:$UBUNTU_VERSION AS build
44

55
ARG TARGETARCH
66

7-
ARG GGML_CPU_ARM_ARCH=armv8-a
8-
97
RUN apt-get update && \
108
apt-get install -y build-essential git cmake libcurl4-openssl-dev
119

1210
WORKDIR /app
1311

1412
COPY . .
1513

16-
RUN if [ "$TARGETARCH" = "amd64" ]; then \
14+
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
1715
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
18-
elif [ "$TARGETARCH" = "arm64" ]; then \
19-
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
2016
else \
2117
echo "Unsupported architecture"; \
2218
exit 1; \

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ LLM inference in C/C++
1717

1818
## Hot topics
1919

20+
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
2021
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
2122
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
2223
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)

common/chat.cpp

Lines changed: 155 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
296296
}
297297
if (!msg.reasoning_content.empty()) {
298298
jmsg["reasoning_content"] = msg.reasoning_content;
299+
jmsg["thinking"] = msg.reasoning_content; // gpt-oss
299300
}
300301
if (!msg.tool_name.empty()) {
301302
jmsg["name"] = msg.tool_name;
@@ -1338,16 +1339,164 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
13381339
data.prompt = prompt;
13391340
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
13401341

1341-
// TODO: support tool calls in GPT-OSS?
1342+
// These special tokens are required to parse properly, so we include them
1343+
// even if parse_tool_calls is false.
1344+
data.preserved_tokens = {
1345+
"<|channel|>",
1346+
"<|constrain|>",
1347+
"<|message|>",
1348+
"<|start|>",
1349+
"<|end|>",
1350+
};
1351+
1352+
if (inputs.tools.is_array() && !inputs.tools.empty()) {
1353+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1354+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1355+
// tool calls can appear in commentary or analysis channels
1356+
auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
1357+
1358+
std::vector<std::string> tool_rules_recipient_in_role;
1359+
std::vector<std::string> tool_rules_recipient_in_channel;
1360+
foreach_function(inputs.tools, [&](const json & tool) {
1361+
const auto & function = tool.at("function");
1362+
std::string name = function.at("name");
1363+
auto parameters = function.at("parameters");
1364+
builder.resolve_refs(parameters);
1365+
1366+
tool_rules_recipient_in_role.push_back(
1367+
builder.add_rule(name + "-call",
1368+
"\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
1369+
builder.add_schema(name + "-args", parameters)
1370+
)
1371+
);
1372+
1373+
tool_rules_recipient_in_channel.push_back(
1374+
builder.add_rule(name + "-call",
1375+
"\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
1376+
builder.add_schema(name + "-args", parameters)
1377+
)
1378+
);
1379+
});
1380+
1381+
auto recipient_in_role = builder.add_rule("recipient_in_role",
1382+
"\"<|start|>assistant\"? \" to=functions.\" ( " +
1383+
string_join(tool_rules_recipient_in_role, " | ") + " )"
1384+
);
1385+
1386+
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
1387+
channel + " \" to=functions.\" ( " +
1388+
string_join(tool_rules_recipient_in_channel, " | ") + " )"
1389+
);
1390+
1391+
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
1392+
1393+
// Trigger on tool calls that appear in the commentary or analysis channels
1394+
data.grammar_triggers.push_back({
1395+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1396+
"<\\|channel\\|>(commentary|analysis) to"
1397+
});
1398+
1399+
// Trigger on tool calls that appear in the role section, either at the
1400+
// start or in the middle.
1401+
data.grammar_triggers.push_back({
1402+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1403+
"^ to"
1404+
});
1405+
1406+
data.grammar_triggers.push_back({
1407+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1408+
"<\\|start\\|>assistant to"
1409+
});
1410+
});
1411+
}
13421412

13431413
return data;
13441414
}
13451415
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
1346-
// TODO @ngxson : this won't work with --special enabled, we should fix that
1347-
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
1348-
if (!builder.syntax().parse_tool_calls) {
1349-
builder.add_content(builder.consume_rest());
1350-
return;
1416+
static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
1417+
static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
1418+
1419+
static const common_regex start_regex("<\\|start\\|>assistant");
1420+
static const common_regex analysis_regex("<\\|channel\\|>analysis");
1421+
static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
1422+
static const common_regex preamble_regex("<\\|channel\\|>commentary");
1423+
static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
1424+
static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
1425+
1426+
auto consume_end = [&](bool include_end = false) {
1427+
if (auto res = builder.try_find_literal("<|end|>")) {
1428+
return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
1429+
}
1430+
return builder.consume_rest();
1431+
};
1432+
1433+
auto handle_tool_call = [&](const std::string & name) {
1434+
if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
1435+
if (builder.syntax().parse_tool_calls) {
1436+
if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
1437+
throw common_chat_msg_partial_exception("incomplete tool call");
1438+
}
1439+
} else if (args->is_partial) {
1440+
throw common_chat_msg_partial_exception("incomplete tool call");
1441+
}
1442+
}
1443+
};
1444+
1445+
auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
1446+
auto match = regex.search(input, 0, true);
1447+
if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
1448+
return match;
1449+
}
1450+
return std::nullopt;
1451+
};
1452+
1453+
do {
1454+
auto header_start_pos = builder.pos();
1455+
auto content_start = builder.try_find_literal("<|message|>");
1456+
if (!content_start) {
1457+
throw common_chat_msg_partial_exception("incomplete header");
1458+
}
1459+
1460+
auto header = content_start->prelude;
1461+
1462+
if (auto match = regex_match(tool_call1_regex, header)) {
1463+
auto group = match->groups[1];
1464+
auto name = header.substr(group.begin, group.end - group.begin);
1465+
handle_tool_call(name);
1466+
continue;
1467+
}
1468+
1469+
if (auto match = regex_match(tool_call2_regex, header)) {
1470+
auto group = match->groups[2];
1471+
auto name = header.substr(group.begin, group.end - group.begin);
1472+
handle_tool_call(name);
1473+
continue;
1474+
}
1475+
1476+
if (regex_match(analysis_regex, header)) {
1477+
builder.move_to(header_start_pos);
1478+
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
1479+
builder.add_content(consume_end(true));
1480+
} else {
1481+
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
1482+
}
1483+
continue;
1484+
}
1485+
1486+
if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
1487+
builder.add_content(consume_end());
1488+
continue;
1489+
}
1490+
1491+
// Possibly a malformed message, attempt to recover by rolling
1492+
// back to pick up the next <|start|>
1493+
LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
1494+
builder.move_to(header_start_pos);
1495+
} while (builder.try_find_regex(start_regex, std::string::npos, false));
1496+
1497+
auto remaining = builder.consume_rest();
1498+
if (!remaining.empty()) {
1499+
LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
13511500
}
13521501
}
13531502

ggml/src/ggml-cuda/convert.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
3131
dequantize_kernel(vx, ib, iqs, v);
3232

3333
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
34-
y[iy0 + 0] = float(v.x);
35-
y[iy0 + y_offset] = float(v.y);
34+
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
35+
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
3636
}
3737

3838
template <bool need_check>
@@ -630,7 +630,7 @@ static __global__ void convert_unary(
630630

631631
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
632632
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
633-
y[iy] = float(x[ix]);
633+
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
634634
}
635635

636636
template <typename src_t, typename dst_t>

ggml/src/ggml-cuda/convert.cuh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,16 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
2929
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
3030
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
3131
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
32+
33+
template<typename dst_t, typename src_t>
34+
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
35+
if constexpr (std::is_same_v<dst_t, src_t>) {
36+
return x;
37+
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
38+
return __float2bfloat16(float(x));
39+
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
40+
return __bfloat162float(x);
41+
} else {
42+
return float(x);
43+
}
44+
}

ggml/src/ggml-cuda/cpy-utils.cuh

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,7 @@
11
#pragma once
22

33
#include "ggml-common.h"
4-
5-
template<typename src_t, typename dst_t>
6-
static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
7-
if constexpr (std::is_same_v<src_t, dst_t>) {
8-
*dst = *src;
9-
} else {
10-
*dst = float(*src);
11-
}
12-
}
4+
#include "convert.cuh"
135

146
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
157
if (x <= val[0]) return 0;
@@ -221,5 +213,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
221213

222214
template<typename src_t, typename dst_t>
223215
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
224-
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
216+
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
225217
}

ggml/src/ggml-cuda/getrows.cu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "getrows.cuh"
22
#include "dequantize.cuh"
3+
#include "convert.cuh"
34

45
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
56
static __global__ void k_get_rows(
@@ -34,8 +35,8 @@ static __global__ void k_get_rows(
3435
dfloat2 v;
3536
dequantize_kernel(src0_row, ib, iqs, v);
3637

37-
dst_row[iybs + iqs + 0] = float(v.x);
38-
dst_row[iybs + iqs + y_offset] = float(v.y);
38+
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
39+
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
3940
}
4041

4142
template<typename src0_t, typename dst_t>
@@ -62,7 +63,7 @@ static __global__ void k_get_rows_float(
6263
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
6364
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
6465

65-
dst_row[i00] = float(src0_row[i00]);
66+
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
6667
}
6768

6869
template<typename grad_t, typename dst_t>

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "ggml.h"
22
#include "common.cuh"
3+
#include "convert.cuh"
34
#include "mmvf.cuh"
45

56
template <typename T, typename type_acc, int ncols_dst, int block_size>
@@ -93,8 +94,8 @@ static __global__ void mul_mat_vec_f(
9394
#pragma unroll
9495
for (int j = 0; j < ncols_dst; ++j) {
9596
const float2 tmpy = y2[j*stride_col_y2 + col2];
96-
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
97-
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
97+
sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
98+
sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
9899
}
99100
}
100101
} else {

ggml/src/ggml-cuda/set-rows.cu

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,6 @@
33

44
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
55

6-
template<typename src_t, typename dst_t>
7-
__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
8-
convert_flt(src_f, dst_f);
9-
}
10-
116
// Generic quantized set_rows kernel template
127
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
138
static __global__ void k_set_rows_quant(
@@ -117,9 +112,7 @@ static __global__ void k_set_rows(
117112
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
118113
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
119114

120-
const src_t* src_elem = src0_row + i00;
121-
dst_t* dst_elem = dst_row_ptr + i00;
122-
set_rows_1(src_elem, dst_elem);
115+
dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
123116

124117
GGML_UNUSED(ne10);
125118
GGML_UNUSED(ne13);

ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include <hip/hip_runtime.h>
55
#include <hipblas/hipblas.h>
66
#include <hip/hip_fp16.h>
7-
#include <hip/hip_bfloat16.h>
7+
#include <hip/hip_bf16.h>
88

99
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
1010
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -135,7 +135,7 @@
135135
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
136136
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
137137

138-
#if HIP_VERSION >= 70000000
138+
#if HIP_VERSION >= 60500000
139139
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
140140
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
141141
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -147,7 +147,7 @@
147147
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
148148
#define cublasComputeType_t hipblasDatatype_t
149149
#define cudaDataType_t hipblasDatatype_t
150-
#endif // HIP_VERSION >= 7000000
150+
#endif // HIP_VERSION >= 60500000
151151

152152
#if !defined(__HIP_PLATFORM_AMD__)
153153
#error "The HIP backend supports only AMD targets"
@@ -179,8 +179,7 @@
179179
#define RDNA4
180180
#endif
181181

182-
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
183-
defined(__gfx1150__) || defined(__gfx1151__)
182+
#if defined(__GFX11__)
184183
#define RDNA3
185184
#endif
186185

@@ -197,8 +196,8 @@
197196
#define __has_builtin(x) 0
198197
#endif
199198

200-
typedef hip_bfloat16 nv_bfloat16;
201-
typedef short2 nv_bfloat162; // FIXME there is no 2x BF16 type being defined in bfloat16.h, ad-hoc compilation fix
199+
typedef __hip_bfloat16 nv_bfloat16;
200+
typedef __hip_bfloat162 nv_bfloat162;
202201

203202
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
204203
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));

0 commit comments

Comments
 (0)