
Commit 4bbe5b1

Merge branch 'master' into cuda_graph_plan

2 parents: c17f8b5 + 7adc79c


42 files changed: +1366 additions, -95 deletions

docs/ops.md

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,7 @@ Legend:
 | ARANGE ||||||||||
 | ARGMAX ||||||||||
 | ARGSORT ||||||||||
+| CEIL ||||||||||
 | CLAMP ||||| 🟡 | 🟡 || 🟡 ||
 | CONCAT |||| 🟡 || 🟡 | 🟡 |||
 | CONT || 🟡 |||| 🟡 | 🟡 | 🟡 ||
@@ -41,6 +42,7 @@ Legend:
 | ELU |||| 🟡 | 🟡 || 🟡 |||
 | EXP |||| 🟡 | 🟡 || 🟡 |||
 | FLASH_ATTN_EXT || 🟡 || 🟡 | 🟡 ||| 🟡 ||
+| FLOOR ||||||||||
 | GATED_LINEAR_ATTN ||||||||||
 | GEGLU ||||| 🟡 ||| 🟡 ||
 | GEGLU_ERF ||||| 🟡 ||| 🟡 ||
@@ -82,6 +84,7 @@ Legend:
 | ROLL ||||||||||
 | ROPE || 🟡 ||||||||
 | ROPE_BACK ||||||||||
+| ROUND ||||||||||
 | RWKV_WKV6 ||||||||||
 | RWKV_WKV7 ||||||||||
 | SCALE || 🟡 ||||||||
@@ -108,5 +111,6 @@ Legend:
 | TANH |||| 🟡 | 🟡 || 🟡 | 🟡 ||
 | TIMESTEP_EMBEDDING ||||||||||
 | TOPK_MOE ||||||||||
+| TRUNC ||||||||||
 | UPSCALE || 🟡 ||| 🟡 || 🟡 |||
 | XIELU ||||||||||

docs/ops/CPU.csv

Lines changed: 16 additions & 0 deletions

@@ -59,6 +59,14 @@
 "CPU","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
@@ -119,6 +127,14 @@
 "CPU","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","CPU"
 "CPU","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","CPU"
 "CPU","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","CPU"

ggml/include/ggml.h

Lines changed: 44 additions & 0 deletions

@@ -577,6 +577,10 @@ extern "C" {
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,

         GGML_UNARY_OP_COUNT,
     };
@@ -1151,6 +1155,46 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+
+
     // xIELU activation function
     // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
     // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
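The new ops follow the same one-argument unary pattern as the rest of the API, each with an `_inplace` variant. As a quick illustration, here is a minimal sketch, not part of this commit, of running `ggml_trunc` on the CPU backend; it assumes the usual ggml workflow in which `ggml-cpu.h` provides `ggml_graph_compute_with_ctx`, and `ggml_floor`/`ggml_ceil`/`ggml_round` are used the same way:

```cpp
#include "ggml.h"
#include "ggml-cpu.h"
#include <cstdio>

int main() {
    // small context with data allocation enabled
    ggml_init_params params = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);

    // input tensor with a few fractional values
    ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * ad = (float *) a->data;
    ad[0] = 3.7f; ad[1] = -2.9f; ad[2] = 0.5f; ad[3] = -0.5f;

    // truncate towards zero (see the header comment above)
    ggml_tensor * t = ggml_trunc(ctx, a);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

    const float * td = (const float *) t->data;
    std::printf("%g %g %g %g\n", td[0], td[1], td[2], td[3]); // expected: 3 -2 0 -0

    ggml_free(ctx);
    return 0;
}
```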

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions

@@ -2184,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                     {
                         n_tasks = 1;
                     } break;

ggml/src/ggml-cpu/ops.cpp

Lines changed: 16 additions & 0 deletions

@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_exp(params, dst);
             } break;
+        case GGML_UNARY_OP_FLOOR:
+            {
+                ggml_compute_forward_floor(params, dst);
+            } break;
+        case GGML_UNARY_OP_CEIL:
+            {
+                ggml_compute_forward_ceil(params, dst);
+            } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
+        case GGML_UNARY_OP_TRUNC:
+            {
+                ggml_compute_forward_trunc(params, dst);
+            } break;
         case GGML_UNARY_OP_XIELU:
             {
                 ggml_compute_forward_xielu(params, dst);

ggml/src/ggml-cpu/unary-ops.cpp

Lines changed: 32 additions & 0 deletions

@@ -73,6 +73,22 @@ static inline float op_log(float x) {
     return logf(x);
 }

+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }

+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
     const float alpha_n = ggml_get_op_params_f32(dst, 1);
     const float alpha_p = ggml_get_op_params_f32(dst, 2);
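Note how little per-op code this requires: each new op is just a scalar float-to-float function, and the existing `vec_unary_op`/`unary_op` templates handle element iteration plus the f16/f32 source and destination conversions. A rough standalone sketch of that pattern (f32 path only, with hypothetical `*_demo` names, not the actual ggml template machinery):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// scalar op, mirroring op_floor/op_ceil/op_round/op_trunc above
static inline float op_trunc_demo(float x) { return std::trunc(x); }

// simplified stand-in for vec_unary_op: apply the scalar op element-wise;
// the real ggml version also converts src/dst types via type_conversion_table
template <float (*op)(float)>
static void vec_unary_op_demo(int64_t n, float * y, const float * x) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = op(x[i]);
    }
}

int main() {
    const float x[4] = { 3.7f, -2.9f, 0.5f, -0.5f };
    float y[4];
    vec_unary_op_demo<op_trunc_demo>(4, y, x);
    std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // 3 -2 0 -0
    return 0;
}
```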

ggml/src/ggml-cpu/unary-ops.h

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 1 deletion

@@ -273,6 +273,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         } else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") {
             turing_devices_without_mma.push_back({ id, device_name });
         }
+
+        // Temporary performance fix:
+        // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
+        // TODO: Check for future drivers the default scheduling strategy and
+        //       remove this call again when cudaDeviceScheduleSpin is default.
+        if (prop.major == 12 && prop.minor == 1) {
+            CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+        }
+
 #endif // defined(GGML_USE_HIP)
     }

@@ -3644,9 +3653,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
-        case GGML_OP_SUM:
         case GGML_OP_ACC:
             return true;
+        case GGML_OP_SUM:
+            return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_ARGSORT:
             // TODO: Support arbitrary column width
             return op->src[0]->ne[0] <= 1024;
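On the first hunk: with the default scheduling policy the host thread may yield or block while waiting for the GPU, which shows up as extra latency in CUDA synchronize calls on these cc 12.1 integrated GPUs, so the init path switches them to spin-waiting. A small standalone sketch of the same idea (my own illustration, not code from this commit), trading host CPU cycles for lower synchronization latency:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; ++id) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
            continue;
        }
        if (prop.major == 12 && prop.minor == 1) {
            cudaSetDevice(id);
            // spin instead of yielding/blocking while waiting on the GPU
            cudaSetDeviceFlags(cudaDeviceScheduleSpin);
            std::printf("device %d (%s): spin-wait sync enabled\n", id, prop.name);
        }
    }
    return 0;
}
```

The second hunk narrows GGML_OP_SUM support to inputs whose rows are contiguous (`ggml_is_contiguous_rows`) instead of claiming support unconditionally.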

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 15 additions & 10 deletions

@@ -7,6 +7,8 @@

 #include <Metal/Metal.h>

+#include <stdatomic.h>
+
 #ifndef TARGET_OS_VISION
 #define TARGET_OS_VISION 0
 #endif
@@ -22,6 +24,9 @@
 // overload of MTLGPUFamilyMetal3 (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;

+// virtual address for GPU memory allocations
+static atomic_uintptr_t g_addr_device = 0x000000400ULL;
+
 #if !GGML_METAL_EMBED_LIBRARY
 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject
@@ -657,6 +662,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_LOG:
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SUM:
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_MEAN:
         case GGML_OP_SOFT_MAX:
@@ -827,7 +833,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
 };

 struct ggml_metal_buffer {
-    void * all_data; // TODO: https://github.com/ggml-org/llama.cpp/pull/15985
+    void * all_data;
     size_t all_size;

     // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
@@ -965,14 +971,15 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
     if (shared) {
         res->all_data = ggml_metal_host_malloc(size_aligned);
         res->is_shared = true;
-        res->owned = true;
     } else {
-        // dummy, non-NULL value - we'll populate this after creating the Metal buffer below
-        res->all_data = (void *) 0x000000400ULL;
+        // use virtual address from g_addr_device counter
+        res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
         res->is_shared = false;
     }
     res->all_size = size_aligned;

+    res->owned = true;
+
     res->device = ggml_metal_device_get_obj(dev);
     res->queue = ggml_metal_device_get_queue(dev);

@@ -983,15 +990,13 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
     res->buffers[0].metal = nil;

     if (size_aligned > 0) {
-        if (props_dev->use_shared_buffers &&shared) {
+        if (props_dev->use_shared_buffers && shared) {
             res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
                                                                    length:size_aligned
                                                                   options:MTLResourceStorageModeShared
                                                               deallocator:nil];
         } else {
             res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
-
-            res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
         }
     }

@@ -1139,7 +1144,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {

 void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     if (buf->is_shared) {
-        memset((char *)tensor->data + offset, value, size);
+        memset((char *) tensor->data + offset, value, size);
         return;
     }

@@ -1168,7 +1173,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor

 void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     if (buf->is_shared) {
-        memcpy((char *)tensor->data + offset, data, size);
+        memcpy((char *) tensor->data + offset, data, size);
         return;
     }

@@ -1223,7 +1228,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *

 void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     if (buf->is_shared) {
-        memcpy(data, (const char *)tensor->data + offset, size);
+        memcpy(data, (const char *) tensor->data + offset, size);
         return;
     }

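The buffer change concerns private (non-shared) Metal buffers, which have no host pointer to use as `all_data`: instead of back-filling the buffer's `gpuAddress` after creation, each allocation now reserves a unique, non-NULL pseudo-address range from the process-wide `g_addr_device` counter, so tensor offsets computed against `all_data` never collide across buffers. A standalone sketch of that allocation pattern (names are mine, not the ggml ones):

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// process-wide counter, starting at a small non-NULL base like g_addr_device above
static std::atomic<std::uintptr_t> g_fake_addr{0x400};

// reserve a unique "virtual address" range of `size` bytes; never returns nullptr
static void * reserve_fake_range(std::size_t size) {
    return reinterpret_cast<void *>(g_fake_addr.fetch_add(size, std::memory_order_relaxed));
}

int main() {
    void * a = reserve_fake_range(4096);
    void * b = reserve_fake_range(4096);
    std::printf("%p %p\n", a, b); // e.g. 0x400 and 0x1400: disjoint, non-NULL ranges
    return 0;
}
```

The relaxed atomic increment is enough here because only uniqueness of the ranges matters, not ordering between threads.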

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 1 addition & 0 deletions

@@ -251,6 +251,7 @@ typedef struct {
     int32_t sect_1;
     int32_t sect_2;
     int32_t sect_3;
+    bool    src2;
 } ggml_metal_kargs_rope;

 typedef struct {
