
Commit 6f4ba66

Merge pull request #294 from menloresearch/update-dev-from-master-2025-10-16-00-34
Sync master with upstream release b6774
2 parents 07387fe + 466c191 commit 6f4ba66

40 files changed: +1340 -81 lines changed

docs/ops.md

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,7 @@ Legend:
 | ARANGE ||||||||||
 | ARGMAX ||||||||||
 | ARGSORT ||||||||||
+| CEIL ||||||||||
 | CLAMP ||||| 🟡 | 🟡 || 🟡 ||
 | CONCAT |||| 🟡 || 🟡 | 🟡 |||
 | CONT || 🟡 |||| 🟡 | 🟡 | 🟡 ||
@@ -41,6 +42,7 @@ Legend:
 | ELU |||| 🟡 | 🟡 || 🟡 |||
 | EXP |||| 🟡 | 🟡 || 🟡 |||
 | FLASH_ATTN_EXT || 🟡 || 🟡 | 🟡 ||| 🟡 ||
+| FLOOR ||||||||||
 | GATED_LINEAR_ATTN ||||||||||
 | GEGLU ||||| 🟡 ||| 🟡 ||
 | GEGLU_ERF ||||| 🟡 ||| 🟡 ||
@@ -82,6 +84,7 @@ Legend:
 | ROLL ||||||||||
 | ROPE || 🟡 ||||||||
 | ROPE_BACK ||||||||||
+| ROUND ||||||||||
 | RWKV_WKV6 ||||||||||
 | RWKV_WKV7 ||||||||||
 | SCALE || 🟡 ||||||||
@@ -108,5 +111,6 @@ Legend:
 | TANH |||| 🟡 | 🟡 || 🟡 | 🟡 ||
 | TIMESTEP_EMBEDDING ||||||||||
 | TOPK_MOE ||||||||||
+| TRUNC ||||||||||
 | UPSCALE || 🟡 ||| 🟡 || 🟡 |||
 | XIELU ||||||||||

docs/ops/CPU.csv

Lines changed: 16 additions & 0 deletions
@@ -59,6 +59,14 @@
 "CPU","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
@@ -119,6 +127,14 @@
 "CPU","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","CPU"

ggml/include/ggml.h

Lines changed: 44 additions & 0 deletions
@@ -577,6 +577,10 @@ extern "C" {
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1151,6 +1155,46 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+
+
     // xIELU activation function
     // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
     // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
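
The new API mirrors the other element-wise unary ops: each function takes a context and a tensor and returns a new tensor, while the _inplace variants follow the usual ggml convention of operating on a view of the input. A minimal usage sketch (illustrative only, not part of this commit), assuming the standard graph helpers from ggml.h and the CPU backend's ggml_graph_compute_with_ctx:

#include "ggml.h"
#include "ggml-cpu.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 4-element f32 input, filled in place
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * xd = (float *) x->data;
    xd[0] = 3.7f; xd[1] = -2.9f; xd[2] = 0.5f; xd[3] = -0.5f;

    // any of the new ops can be swapped in: ggml_floor, ggml_ceil, ggml_round, ggml_trunc
    struct ggml_tensor * y = ggml_trunc(ctx, x);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    const float * yd = (const float *) y->data;
    printf("%g %g %g %g\n", yd[0], yd[1], yd[2], yd[3]); // expected: 3 -2 0 -0

    ggml_free(ctx);
    return 0;
}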

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions
@@ -2184,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            case GGML_UNARY_OP_HARDSWISH:
            case GGML_UNARY_OP_HARDSIGMOID:
            case GGML_UNARY_OP_EXP:
+           case GGML_UNARY_OP_FLOOR:
+           case GGML_UNARY_OP_CEIL:
+           case GGML_UNARY_OP_ROUND:
+           case GGML_UNARY_OP_TRUNC:
                {
                    n_tasks = 1;
                } break;

ggml/src/ggml-cpu/ops.cpp

Lines changed: 16 additions & 0 deletions
@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_exp(params, dst);
            } break;
+       case GGML_UNARY_OP_FLOOR:
+           {
+               ggml_compute_forward_floor(params, dst);
+           } break;
+       case GGML_UNARY_OP_CEIL:
+           {
+               ggml_compute_forward_ceil(params, dst);
+           } break;
+       case GGML_UNARY_OP_ROUND:
+           {
+               ggml_compute_forward_round(params, dst);
+           } break;
+       case GGML_UNARY_OP_TRUNC:
+           {
+               ggml_compute_forward_trunc(params, dst);
+           } break;
        case GGML_UNARY_OP_XIELU:
            {
                ggml_compute_forward_xielu(params, dst);

ggml/src/ggml-cpu/unary-ops.cpp

Lines changed: 32 additions & 0 deletions
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }
 
+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
     const float alpha_n = ggml_get_op_params_f32(dst, 1);
     const float alpha_p = ggml_get_op_params_f32(dst, 2);
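
The scalar helpers above are routed through the existing unary_op/vec_unary_op templates, which walk each row of the tensor, convert every source element to f32, apply the function pointer, and convert back to the destination type. A stripped-down, self-contained sketch of that pattern (f32 only, hypothetical standalone names, not the repository code):

#include <cmath>
#include <cstdint>
#include <cstdio>

static inline float op_floor(float x) { return floorf(x); }

// simplified element-wise loop: the real vec_unary_op also handles f16/bf16
// conversion through type_conversion_table
template <float (*op)(float)>
static void vec_unary_op(int64_t n, float * y, const float * x) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = op(x[i]);
    }
}

int main() {
    const float x[4] = { 3.7f, -2.9f, 0.5f, -0.5f };
    float y[4];
    vec_unary_op<op_floor>(4, y, x);
    for (float v : y) {
        printf("%.1f ", v); // prints: 3.0 -3.0 0.0 -1.0
    }
    printf("\n");
    return 0;
}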

ggml/src/ggml-cpu/unary-ops.h

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 1 deletion
@@ -273,6 +273,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         } else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") {
             turing_devices_without_mma.push_back({ id, device_name });
         }
+
+        // Temporary performance fix:
+        // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
+        // TODO: Check for future drivers the default scheduling strategy and
+        // remove this call again when cudaDeviceScheduleSpin is default.
+        if (prop.major == 12 && prop.minor == 1) {
+            CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+        }
+
 #endif // defined(GGML_USE_HIP)
     }
 
@@ -3616,9 +3625,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
-        case GGML_OP_SUM:
         case GGML_OP_ACC:
             return true;
+        case GGML_OP_SUM:
+            return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_ARGSORT:
             // TODO: Support arbitrary column width
             return op->src[0]->ne[0] <= 1024;
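
The first hunk is a targeted workaround: on integrated GPUs reporting compute capability 12.1 it switches the CUDA runtime to spin-wait during synchronization instead of yielding or blocking, trading CPU time for lower latency in cudaDeviceSynchronize/cudaStreamSynchronize. Outside of ggml's init loop the change reduces to two standard runtime calls; the sketch below is illustrative only (device index and error handling are assumptions, not ggml code):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, /*device =*/ 0) != cudaSuccess) {
        return 1;
    }
    if (prop.major == 12 && prop.minor == 1) {
        // busy-wait in synchronize calls rather than yielding the CPU
        cudaSetDeviceFlags(cudaDeviceScheduleSpin);
        printf("spin scheduling enabled for cc %d.%d\n", prop.major, prop.minor);
    }
    return 0;
}

The second hunk narrows CUDA's GGML_OP_SUM support to inputs whose rows are contiguous, instead of claiming support unconditionally.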

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 1 addition & 0 deletions
@@ -662,6 +662,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_LOG:
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SUM:
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_MEAN:
         case GGML_OP_SOFT_MAX:

ggml/src/ggml-metal/ggml-metal-ops.cpp

Lines changed: 14 additions & 1 deletion
@@ -866,12 +866,25 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
 
     ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
 
+    int nth = 32; // SIMD width
+
+    while (nth < (int) n && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    nth = std::min(nth, (int) n);
+
+    const int nsg = (nth + 31) / 32;
+
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op), 2);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, nsg * sizeof(float), 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
 
     return 1;
 }
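
The Metal SUM kernel previously dispatched a single thread; the new dispatch picks a power-of-two thread count nth between the SIMD width (32) and the pipeline's maximum threads per threadgroup, caps it at the element count n, and reserves one float of threadgroup memory per simdgroup (nsg) for the partial sums. A small standalone sketch of that sizing logic, with n and the threadgroup limit as assumed example values:

#include <algorithm>
#include <cstdio>

int main() {
    const int n        = 4096; // elements to reduce (example value)
    const int max_tptg = 1024; // max threads per threadgroup (example value)

    int nth = 32; // SIMD width
    while (nth < n && nth < max_tptg) {
        nth *= 2;
    }
    nth = std::min(nth, max_tptg);
    nth = std::min(nth, n);

    const int nsg = (nth + 31) / 32; // simdgroups per threadgroup

    printf("nth = %d, nsg = %d\n", nth, nsg); // nth = 1024, nsg = 32
    return 0;
}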
