
Commit fb13e3e

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   src/llama-context.cpp
#   tests/test-backend-ops.cpp
2 parents abc1d8a + af3373f commit fb13e3e

26 files changed: +543 −447 lines

common/json-schema-to-grammar.cpp

Lines changed: 3 additions & 46 deletions
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
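
With C++17 available, std::string_view already supplies everything the uniform_range lambda relies on (length(), operator[], substr(), and direct streaming into the out stream), which is why the hand-rolled string_view class can be dropped and the trailing .str() call disappears. A minimal standalone sketch, not part of the commit, of that usage:

    #include <iostream>
    #include <string_view>

    int main() {
        // Shared-prefix scan, mirroring the uniform_range lambda in the hunk above.
        std::string_view from = "abc";
        std::string_view to   = "abf";

        size_t i = 0;
        while (i < from.length() && i < to.length() && from[i] == to[i]) {
            i++;
        }

        // std::string_view streams directly; no .str() conversion is needed.
        std::cout << "\"" << from.substr(0, i) << "\"\n"; // prints "ab"
        return 0;
    }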

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -2193,7 +2193,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             name += ".weight"
             if "multi_modal_projector.linear_1" in name:
                 # despite the name with number postfix, this is a single fully connected layer
-                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []

ggml/src/ggml-cuda/common.cuh

Lines changed: 32 additions & 2 deletions
@@ -245,8 +245,18 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+        return true;
+    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+    } else {
+        return false;
+    }
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
@@ -366,6 +376,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template<bool norm>
+static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
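
The reduce_rows_f32 template added here becomes the single row-reduction kernel behind both GGML_OP_SUM_ROWS (norm=false) and GGML_OP_MEAN (norm=true), launched with one block per row and WARP_SIZE threads per block, as the sumrows.cu and mean.cu changes below show. A self-contained sketch of the two instantiations; the WARP_SIZE define and the __shfl_xor_sync-based warp_reduce_sum are stand-ins assumed to mirror the helpers in common.cuh:

    #include <cstdio>
    #include <cuda_runtime.h>

    #define WARP_SIZE 32

    // Stand-in for the warp_reduce_sum helper from common.cuh.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, WARP_SIZE);
        }
        return x;
    }

    // Same shape as the reduce_rows_f32 template added to common.cuh above.
    template <bool norm>
    static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        const int col = threadIdx.x;

        float sum = 0.0f;
        for (int i = col; i < ncols; i += blockDim.x) {
            sum += x[row * ncols + i];
        }

        sum = warp_reduce_sum(sum);

        if (col != 0) {
            return;
        }

        dst[row] = norm ? sum / ncols : sum;
    }

    int main() {
        const int nrows = 2, ncols = 4;
        const float h_x[nrows * ncols] = {1, 2, 3, 4, 10, 20, 30, 40};
        float h_sum[nrows], h_mean[nrows];

        float *d_x, *d_out;
        cudaMalloc(&d_x, sizeof(h_x));
        cudaMalloc(&d_out, nrows * sizeof(float));
        cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);

        // One block per row, one warp per block - the launch shape used by the ops below.
        const dim3 block_dims(WARP_SIZE, 1, 1);
        const dim3 block_nums(nrows, 1, 1);

        reduce_rows_f32<false><<<block_nums, block_dims>>>(d_x, d_out, ncols); // row sums
        cudaMemcpy(h_sum, d_out, sizeof(h_sum), cudaMemcpyDeviceToHost);

        reduce_rows_f32<true><<<block_nums, block_dims>>>(d_x, d_out, ncols);  // row means
        cudaMemcpy(h_mean, d_out, sizeof(h_mean), cudaMemcpyDeviceToHost);

        printf("sums: %g %g  means: %g %g\n", h_sum[0], h_sum[1], h_mean[0], h_mean[1]);

        cudaFree(d_x);
        cudaFree(d_out);
        return 0;
    }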

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 7 additions & 5 deletions
@@ -39,6 +39,7 @@ bool g_mul_mat_q = true;
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
@@ -101,8 +102,7 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
     cudaError_t err;
-    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
-    {
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
         err = cudaMallocManaged(ptr, size);
 #if defined(GGML_USE_HIP)
         if (err == hipSuccess) {
@@ -120,9 +120,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
             err = cudaMalloc(ptr, size);
         }
 #endif // defined(GGML_USE_HIP)
-    }
-    else
-    {
+    } else {
         err = cudaMalloc(ptr, size);
     }
     return err;
@@ -2362,6 +2360,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_MEAN:
+            ggml_cuda_op_mean(ctx, dst);
+            break;
         case GGML_OP_SSM_CONV:
             ggml_cuda_op_ssm_conv(ctx, dst);
             break;
@@ -3265,6 +3266,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
             return true;

ggml/src/ggml-cuda/mean.cu

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+#include "mean.cuh"
+
+void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+}
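
At the graph level this op serves tensors created with ggml_mean, which averages along ne[0] and keeps one value per row. A rough sketch of that shape contract using the public ggml C API, illustrative only; the memory-pool size is an arbitrary assumption and backend dispatch (including this new CUDA path) is handled by the runtime:

    #include "ggml.h"
    #include <cassert>

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 4 columns (ne[0]) x 2 rows: GGML_OP_MEAN reduces along ne[0].
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
        struct ggml_tensor * m = ggml_mean(ctx, a);

        // The result keeps one value per row: ne = [1, 2, 1, 1].
        assert(m->ne[0] == 1 && m->ne[1] == 2);
        assert(m->op == GGML_OP_MEAN);

        ggml_free(ctx);
        return 0;
    }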

ggml/src/ggml-cuda/mean.cuh

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/sumrows.cu

Lines changed: 5 additions & 18 deletions
@@ -1,25 +1,9 @@
 #include "sumrows.cuh"
 
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+
+    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
 }

ggml/src/ggml-cuda/sumrows.cuh

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 #include "common.cuh"
 
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
-
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
