
Commit 8da828a

Merge branch 'master' into prune

2 parents f037443 + 40bfa04

File tree

9 files changed: 67 additions, 65 deletions

common/json-schema-to-grammar.cpp

Lines changed: 3 additions & 46 deletions

@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-               out << "\"" << from.substr(0, i).str() << "\"";
+               out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
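The deleted class was a pre-C++17 stand-in for std::string_view; now that the project builds with C++17, the standard type covers everything the helper did. A minimal standalone sketch of the equivalent usage (illustration only, not part of the commit):

    // C++17: std::string_view views a substring without copying.
    #include <iostream>
    #include <string>
    #include <string_view>

    int main() {
        std::string s = "0123456789";
        std::string_view v = s;                    // non-owning view of the whole string
        std::string_view prefix = v.substr(0, 3);  // "012": no allocation, no copy
        std::cout << '"' << prefix << '"' << '\n'; // streams directly into std::ostream
        return 0;
    }

std::string_view also comes with an operator<< overload, which is why the explicit .str() call in uniform_range could be dropped.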

ggml/src/ggml-cuda/common.cuh

Lines changed: 20 additions & 0 deletions

@@ -362,6 +362,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template<bool norm>
+static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
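The new kernel launches one block per row with a single warp of WARP_SIZE (32) threads; each thread strides across the row accumulating a partial sum, warp_reduce_sum combines the 32 partials, and lane 0 writes the result. warp_reduce_sum is defined earlier in common.cuh; a typical shuffle-based float implementation looks like this sketch (the file's actual version is templated on the warp width):

    // Butterfly reduction: after log2(32) = 5 XOR-shuffle steps,
    // every lane of the warp holds the sum of all 32 inputs.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x;
    }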

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 0 deletions

@@ -37,6 +37,7 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
@@ -2357,6 +2358,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_MEAN:
+            ggml_cuda_op_mean(ctx, dst);
+            break;
         case GGML_OP_SSM_CONV:
             ggml_cuda_op_ssm_conv(ctx, dst);
             break;
@@ -3260,6 +3264,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
             return true;
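With the dispatch case and the supports_op entry in place, a graph node created by ggml_mean() can now be offloaded to the CUDA backend. A minimal sketch of building such a node with the public ggml API (the buffer size is an arbitrary assumption; backend selection and graph execution are omitted):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,  // scratch size, arbitrary for this sketch
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 4 rows of 256 floats; ggml_mean reduces along ne[0],
        // so the result tensor has shape [1, 4].
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 4);
        struct ggml_tensor * m = ggml_mean(ctx, a);
        (void) m;  // graph building and computation omitted

        ggml_free(ctx);
        return 0;
    }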

ggml/src/ggml-cuda/mean.cu

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#include "mean.cuh"
+
+void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+}
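Because src0 is asserted to be contiguous, each row of ne[0] floats is dense in memory, and the kernel's output should match this trivial CPU reference (a verification sketch, not part of the commit):

    // CPU reference: dst[r] = mean of row r of a dense nrows x ncols matrix.
    static void mean_rows_ref(const float * x, float * dst, int ncols, int nrows) {
        for (int r = 0; r < nrows; ++r) {
            float sum = 0.0f;
            for (int c = 0; c < ncols; ++c) {
                sum += x[r * ncols + c];
            }
            dst[r] = sum / ncols;
        }
    }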

ggml/src/ggml-cuda/mean.cuh

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/sumrows.cu

Lines changed: 5 additions & 18 deletions

@@ -1,25 +1,9 @@
 #include "sumrows.cuh"
 
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+
+    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
 }

ggml/src/ggml-cuda/sumrows.cuh

Lines changed: 0 additions & 1 deletion

@@ -1,5 +1,4 @@
 #include "common.cuh"
 
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
-
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

gguf-py/gguf/vocab.py

Lines changed: 10 additions & 0 deletions

@@ -197,6 +197,16 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                 if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
                     if not tokenizer_config:
                         special_eos = special_last
+                    elif special_last != special_eos:
+                        if 'eot' not in self.special_token_types:
+                            self.special_token_types = tuple(self.special_token_types) + ('eot', )
+                            tokenizer_config['eot_token'] = special_eos
+                        elif 'eom' not in self.special_token_types:
+                            self.special_token_types = tuple(self.special_token_types) + ('eom', )
+                            tokenizer_config['eom_token'] = special_eos
+                        else:
+                            logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
+                        tokenizer_config['eos_token'] = special_eos = special_last
                     self.add_special_token['eos'] = True if special_last == special_eos else False
                     if special_last != special_eos:
                         logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
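In short: when TemplateProcessing<single> in tokenizer.json ends with a special token that differs from the configured EOS, the old EOS value is preserved as an EOT token (or EOM, if EOT is already taken) before the trailing token becomes the new EOS; only when both fallback slots are occupied is the EOS overridden outright, with a warning.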

tests/test-backend-ops.cpp

Lines changed: 2 additions & 0 deletions

@@ -4652,6 +4652,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
     test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1));
 
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
+
     return test_cases;
 }
