Skip to content

Commit 8fec14c

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 7760ede + 69ffd89 commit 8fec14c

File tree

22 files changed

+806
-80
lines changed

22 files changed

+806
-80
lines changed

ggml/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
114114

115115
if (NOT MSVC)
116116
if (GGML_STATIC)
117+
if (UNIX AND NOT APPLE)
118+
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
119+
endif()
117120
add_link_options(-static)
118121
if (MINGW)
119122
add_link_options(-static-libgcc -static-libstdc++)

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -116,7 +116,7 @@ extern "C" {
116116
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
117117

118118
// (optional) sort/optimize the nodes in the graph
119-
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
119+
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
120120
};
121121

122122
struct ggml_backend {

ggml/src/ggml-backend.cpp

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
463463
backend->iface.event_wait(backend, event);
464464
}
465465

466-
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
466+
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
467467
GGML_ASSERT(backend);
468-
if (backend->iface.optimize_graph != NULL) {
469-
backend->iface.optimize_graph(backend, cgraph);
468+
if (backend->iface.graph_optimize != NULL) {
469+
backend->iface.graph_optimize(backend, cgraph);
470470
}
471471
}
472472

@@ -1307,7 +1307,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
13071307

13081308
// Optimize this split of the graph. This needs to happen before we make graph_copy,
13091309
// so they are in sync.
1310-
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
1310+
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
13111311

13121312
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
13131313
for (int j = 0; j < split->n_inputs; j++) {

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
270270
/* .graph_compute = */ ggml_backend_blas_graph_compute,
271271
/* .event_record = */ NULL,
272272
/* .event_wait = */ NULL,
273-
/* .optimize_graph = */ NULL,
273+
/* .graph_optimize = */ NULL,
274274
};
275275

276276
static ggml_guid_t ggml_backend_blas_guid(void) {

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -2756,7 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
27562756
/* .graph_compute = */ ggml_backend_cann_graph_compute,
27572757
/* .event_record = */ ggml_backend_cann_event_record,
27582758
/* .event_wait = */ ggml_backend_cann_event_wait,
2759-
/* .optimize_graph = */ NULL,
2759+
/* .graph_optimize = */ NULL,
27602760
};
27612761

27622762
/**

ggml/src/ggml-cpu/amx/amx.cpp

Lines changed: 4 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -7,7 +7,7 @@
77
#include "ggml-cpu.h"
88
#include "traits.h"
99

10-
#if defined(__gnu_linux__)
10+
#if defined(__linux__)
1111
#include <sys/syscall.h>
1212
#include <unistd.h>
1313
#endif
@@ -186,14 +186,16 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
186186
#define XFEATURE_XTILEDATA 18
187187

188188
static bool ggml_amx_init() {
189-
#if defined(__gnu_linux__)
189+
#if defined(__linux__)
190190
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
191191
fprintf(stderr, "AMX is not ready to be used!\n");
192192
return false;
193193
}
194194
return true;
195195
#elif defined(_WIN32)
196196
return true;
197+
#else
198+
return false;
197199
#endif
198200
}
199201

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
190190
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
191191
/* .event_record = */ NULL,
192192
/* .event_wait = */ NULL,
193-
/* .optimize_graph = */ NULL,
193+
/* .graph_optimize = */ NULL,
194194
};
195195

196196
static ggml_guid_t ggml_backend_cpu_guid(void) {

ggml/src/ggml-cuda/common.cuh

Lines changed: 8 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -652,6 +652,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
652652
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
653653
}
654654

655+
// Calculates both the division and the modulo at once; returns <n/divisor, n%divisor>
656+
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
657+
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
658+
const uint32_t div_val = fastdiv(n, fastdiv_values);
659+
const uint32_t mod_val = n - div_val * fastdiv_values.z;
660+
return make_uint2(div_val, mod_val);
661+
}
662+
655663
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
656664

657665
static __device__ __forceinline__ float get_alibi_slope(

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -3140,7 +3140,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
31403140
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
31413141
/* .event_record = */ ggml_backend_cuda_event_record,
31423142
/* .event_wait = */ ggml_backend_cuda_event_wait,
3143-
/* .optimize_graph = */ NULL,
3143+
/* .graph_optimize = */ NULL,
31443144
};
31453145

31463146
static ggml_guid_t ggml_backend_cuda_guid() {
Lines changed: 61 additions & 54 deletions
Original file line number · Diff line number · Diff line change
@@ -1,82 +1,89 @@
11
#include "pad_reflect_1d.cuh"
22

3-
static __global__ void pad_reflect_1d_kernel_f32(
4-
const void * __restrict__ src0,
5-
void * __restrict__ dst,
6-
const int64_t ne0,
7-
const int64_t ne00,
8-
const int64_t ne01,
9-
const int64_t ne02,
10-
const int64_t ne03,
11-
const int64_t nb00,
12-
const int64_t nb01,
13-
const int64_t nb02,
14-
const int64_t nb03,
15-
const int64_t nb0,
16-
const int64_t nb1,
17-
const int64_t nb2,
18-
const int64_t nb3,
19-
const int p0,
20-
const int p1) {
21-
3+
static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
4+
pad_reflect_1d_kernel_f32(
5+
const void * __restrict__ src0,
6+
void * __restrict__ dst,
7+
const int64_t ne0,
8+
const int64_t ne00,
9+
const uint3 ne01,
10+
const int64_t ne02,
11+
const int64_t ne03,
12+
const int64_t nb00,
13+
const int64_t nb01,
14+
const int64_t nb02,
15+
const int64_t nb03,
16+
const int64_t nb0,
17+
const int64_t nb1,
18+
const int64_t nb2,
19+
const int64_t nb3,
20+
const int p0,
21+
const int p1) {
2222
const int64_t i3 = blockIdx.z;
2323
const int64_t i2 = blockIdx.y;
24-
const int64_t i1 = blockIdx.x;
2524

26-
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
25+
const uint2 div_mod_packed = fast_div_modulo(blockIdx.x, ne01);
26+
const int64_t tile1 = div_mod_packed.y; // i1
27+
const int64_t tile0 = div_mod_packed.x; // nth i0 tile
28+
const int64_t i1 = tile1;
29+
const int64_t i0 = threadIdx.x + tile0 * blockDim.x;
30+
31+
// ne01.z holds the original (unpacked) value of ne01 (see init_fastdiv_values in common.cuh)
32+
if (i0 >= ne0 || i1 >= ne01.z || i2 >= ne02 || i3 >= ne03) {
2733
return;
2834
}
2935

30-
const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
31-
char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;
32-
33-
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
34-
float value;
36+
const char * src0_ptr = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
37+
char * dst_ptr = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
3538

36-
if (i0 < p0) {
37-
// Left padding - reflect
38-
value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
39-
} else if (i0 < ne0 - p1) {
40-
// Middle - copy
41-
value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
42-
} else {
43-
// Right padding - reflect
44-
int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
45-
value = *(const float *)(src0_ptr + src_idx * nb00);
46-
}
39+
const int64_t rel_i0 = i0 - p0; // relative i0 in src0
40+
int64_t src_idx;
4741

48-
*(float *)(dst_ptr + i0 * nb0) = value;
42+
if (rel_i0 < 0) {
43+
// Left padding - reflect
44+
src_idx = -rel_i0;
45+
} else if (rel_i0 < ne00) {
46+
// Middle - copy
47+
src_idx = rel_i0;
48+
} else {
49+
// Right padding - reflect
50+
src_idx = 2 * ne00 - 2 - rel_i0;
4951
}
52+
const float value = *(const float *) (src0_ptr + src_idx * nb00);
53+
*(float *) (dst_ptr + i0 * nb0) = value;
5054
}
5155

5256
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
53-
const ggml_tensor * src0 = dst->src[0];
54-
cudaStream_t stream = ctx.stream();
57+
const ggml_tensor * src0 = dst->src[0];
58+
cudaStream_t stream = ctx.stream();
5559

5660
GGML_ASSERT(src0->type == GGML_TYPE_F32);
5761
GGML_ASSERT(dst->type == GGML_TYPE_F32);
5862

5963
const int32_t * opts = (const int32_t *) dst->op_params;
60-
const int p0 = opts[0];
61-
const int p1 = opts[1];
64+
const int p0 = opts[0];
65+
const int p1 = opts[1];
6266

63-
const int64_t ne00 = src0->ne[0];
64-
const int64_t ne01 = src0->ne[1];
65-
const int64_t ne02 = src0->ne[2];
66-
const int64_t ne03 = src0->ne[3];
67+
const int64_t ne00 = src0->ne[0];
68+
const int64_t ne01 = src0->ne[1];
69+
const uint3 ne01_packed = init_fastdiv_values(ne01);
70+
const int64_t ne02 = src0->ne[2];
71+
const int64_t ne03 = src0->ne[3];
6772

6873
const int64_t ne0 = dst->ne[0];
6974

75+
// sanity: padded length matches
7076
GGML_ASSERT(ne0 == ne00 + p0 + p1);
7177

72-
const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
73-
const dim3 grid_dims(ne01, ne02, ne03);
78+
constexpr int64_t bx = CUDA_PAD_REFLECT_1D_BLOCK_SIZE; // threads per block (x)
79+
const int64_t tiles0 = (ne0 + bx - 1) / bx; // number of tiles along i0
80+
// grid.x covers i1 and all tiles of i0: [ne01 * tiles0]
81+
// grid.y covers i2: [ne02]
82+
// grid.z covers i3: [ne03]
83+
const dim3 grid_dims((unsigned) (ne01 * tiles0), (unsigned) ne02, (unsigned) ne03);
84+
const dim3 block_dims((unsigned) bx, 1, 1);
7485

7586
pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
76-
src0->data, dst->data,
77-
ne0, ne00, ne01, ne02, ne03,
78-
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
79-
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
80-
p0, p1
81-
);
87+
src0->data, dst->data, ne0, ne00, ne01_packed, ne02, ne03, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
88+
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], p0, p1);
8289
}

0 commit comments

Comments (0)