Skip to content

Commit 8fec14c

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 7760ede + 69ffd89 commit 8fec14c

File tree

22 files changed

+806
-80
lines changed

22 files changed

+806
-80
lines changed

ggml/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
114114

115115
if (NOT MSVC)
116116
if (GGML_STATIC)
117+
if (UNIX AND NOT APPLE)
118+
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
119+
endif()
117120
add_link_options(-static)
118121
if (MINGW)
119122
add_link_options(-static-libgcc -static-libstdc++)

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -116,7 +116,7 @@ extern "C" {
116116
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
117117

118118
// (optional) sort/optimize the nodes in the graph
119-
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
119+
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
120120
};
121121

122122
struct ggml_backend {

ggml/src/ggml-backend.cpp

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
463463
backend->iface.event_wait(backend, event);
464464
}
465465

466-
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
466+
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
467467
GGML_ASSERT(backend);
468-
if (backend->iface.optimize_graph != NULL) {
469-
backend->iface.optimize_graph(backend, cgraph);
468+
if (backend->iface.graph_optimize != NULL) {
469+
backend->iface.graph_optimize(backend, cgraph);
470470
}
471471
}
472472

@@ -1307,7 +1307,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
13071307

13081308
// Optimize this split of the graph. This needs to happen before we make graph_copy,
13091309
// so they are in sync.
1310-
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
1310+
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
13111311

13121312
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
13131313
for (int j = 0; j < split->n_inputs; j++) {

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
270270
/* .graph_compute = */ ggml_backend_blas_graph_compute,
271271
/* .event_record = */ NULL,
272272
/* .event_wait = */ NULL,
273-
/* .optimize_graph = */ NULL,
273+
/* .graph_optimize = */ NULL,
274274
};
275275

276276
static ggml_guid_t ggml_backend_blas_guid(void) {

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -2756,7 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
27562756
/* .graph_compute = */ ggml_backend_cann_graph_compute,
27572757
/* .event_record = */ ggml_backend_cann_event_record,
27582758
/* .event_wait = */ ggml_backend_cann_event_wait,
2759-
/* .optimize_graph = */ NULL,
2759+
/* .graph_optimize = */ NULL,
27602760
};
27612761

27622762
/**

ggml/src/ggml-cpu/amx/amx.cpp

Lines changed: 4 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -7,7 +7,7 @@
77
#include "ggml-cpu.h"
88
#include "traits.h"
99

10-
#if defined(__gnu_linux__)
10+
#if defined(__linux__)
1111
#include <sys/syscall.h>
1212
#include <unistd.h>
1313
#endif
@@ -186,14 +186,16 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
186186
#define XFEATURE_XTILEDATA 18
187187

188188
static bool ggml_amx_init() {
189-
#if defined(__gnu_linux__)
189+
#if defined(__linux__)
190190
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
191191
fprintf(stderr, "AMX is not ready to be used!\n");
192192
return false;
193193
}
194194
return true;
195195
#elif defined(_WIN32)
196196
return true;
197+
#else
198+
return false;
197199
#endif
198200
}
199201

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
190190
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
191191
/* .event_record = */ NULL,
192192
/* .event_wait = */ NULL,
193-
/* .optimize_graph = */ NULL,
193+
/* .graph_optimize = */ NULL,
194194
};
195195

196196
static ggml_guid_t ggml_backend_cpu_guid(void) {

ggml/src/ggml-cuda/common.cuh

Lines changed: 8 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -652,6 +652,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
652652
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
653653
}
654654

655+
// Calculates both the division and the modulo at once; returns <n/divisor, n%divisor>
656+
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
657+
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
658+
const uint32_t div_val = fastdiv(n, fastdiv_values);
659+
const uint32_t mod_val = n - div_val * fastdiv_values.z;
660+
return make_uint2(div_val, mod_val);
661+
}
662+
655663
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
656664

657665
static __device__ __forceinline__ float get_alibi_slope(

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -3140,7 +3140,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
31403140
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
31413141
/* .event_record = */ ggml_backend_cuda_event_record,
31423142
/* .event_wait = */ ggml_backend_cuda_event_wait,
3143-
/* .optimize_graph = */ NULL,
3143+
/* .graph_optimize = */ NULL,
31443144
};
31453145

31463146
static ggml_guid_t ggml_backend_cuda_guid() {
Lines changed: 61 additions & 54 deletions
Original file line number · Diff line number · Diff line change
@@ -1,82 +1,89 @@
11
#include "pad_reflect_1d.cuh"
22

3-
static __global__ void pad_reflect_1d_kernel_f32(
4-
const void * __restrict__ src0,
5-
void * __restrict__ dst,
6-
const int64_t ne0,
7-
const int64_t ne00,
8-
const int64_t ne01,
9-
const int64_t ne02,
10-
const int64_t ne03,
11-
const int64_t nb00,
12-
const int64_t nb01,
13-
const int64_t nb02,
14-
const int64_t nb03,
15-
const int64_t nb0,
16-
const int64_t nb1,
17-
const int64_t nb2,
18-
const int64_t nb3,
19-
const int p0,
20-
const int p1) {
21-
3+
static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
4+
pad_reflect_1d_kernel_f32(
5+
const void * __restrict__ src0,
6+
void * __restrict__ dst,
7+
const int64_t ne0,
8+
const int64_t ne00,
9+
const uint3 ne01,
10+
const int64_t ne02,
11+
const int64_t ne03,
12+
const int64_t nb00,
13+
const int64_t nb01,
14+
const int64_t nb02,
15+
const int64_t nb03,
16+
const int64_t nb0,
17+
const int64_t nb1,
18+
const int64_t nb2,
19+
const int64_t nb3,
20+
const int p0,
21+
const int p1) {
2222
const int64_t i3 = blockIdx.z;
2323
const int64_t i2 = blockIdx.y;
24-
const int64_t i1 = blockIdx.x;
2524

26-
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
25+
const uint2 div_mod_packed = fast_div_modulo(blockIdx.x, ne01);
26+
const int64_t tile1 = div_mod_packed.y; // i1
27+
const int64_t tile0 = div_mod_packed.x; // nth i0 tile
28+
const int64_t i1 = tile1;
29+
const int64_t i0 = threadIdx.x + tile0 * blockDim.x;
30+
31+
// ne01.z holds the original (unpacked) value of ne01 (see init_fastdiv_values in common.cuh)
32+
if (i0 >= ne0 || i1 >= ne01.z || i2 >= ne02 || i3 >= ne03) {
2733
return;
2834
}
2935

30-
const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
31-
char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;
32-
33-
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
34-
float value;
36+
const char * src0_ptr = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
37+
char * dst_ptr = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
3538

36-
if (i0 < p0) {
37-
// Left padding - reflect
38-
value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
39-
} else if (i0 < ne0 - p1) {
40-
// Middle - copy
41-
value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
42-
} else {
43-
// Right padding - reflect
44-
int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
45-
value = *(const float *)(src0_ptr + src_idx * nb00);
46-
}
39+
const int64_t rel_i0 = i0 - p0; // relative i0 in src0
40+
int64_t src_idx;
4741

48-
*(float *)(dst_ptr + i0 * nb0) = value;
42+
if (rel_i0 < 0) {
43+
// Left padding - reflect
44+
src_idx = -rel_i0;
45+
} else if (rel_i0 < ne00) {
46+
// Middle - copy
47+
src_idx = rel_i0;
48+
} else {
49+
// Right padding - reflect
50+
src_idx = 2 * ne00 - 2 - rel_i0;
4951
}
52+
const float value = *(const float *) (src0_ptr + src_idx * nb00);
53+
*(float *) (dst_ptr + i0 * nb0) = value;
5054
}
5155

5256
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
53-
const ggml_tensor * src0 = dst->src[0];
54-
cudaStream_t stream = ctx.stream();
57+
const ggml_tensor * src0 = dst->src[0];
58+
cudaStream_t stream = ctx.stream();
5559

5660
GGML_ASSERT(src0->type == GGML_TYPE_F32);
5761
GGML_ASSERT(dst->type == GGML_TYPE_F32);
5862

5963
const int32_t * opts = (const int32_t *) dst->op_params;
60-
const int p0 = opts[0];
61-
const int p1 = opts[1];
64+
const int p0 = opts[0];
65+
const int p1 = opts[1];
6266

63-
const int64_t ne00 = src0->ne[0];
64-
const int64_t ne01 = src0->ne[1];
65-
const int64_t ne02 = src0->ne[2];
66-
const int64_t ne03 = src0->ne[3];
67+
const int64_t ne00 = src0->ne[0];
68+
const int64_t ne01 = src0->ne[1];
69+
const uint3 ne01_packed = init_fastdiv_values(ne01);
70+
const int64_t ne02 = src0->ne[2];
71+
const int64_t ne03 = src0->ne[3];
6772

6873
const int64_t ne0 = dst->ne[0];
6974

75+
// sanity: padded length matches
7076
GGML_ASSERT(ne0 == ne00 + p0 + p1);
7177

72-
const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
73-
const dim3 grid_dims(ne01, ne02, ne03);
78+
constexpr int64_t bx = CUDA_PAD_REFLECT_1D_BLOCK_SIZE; // threads per block (x)
79+
const int64_t tiles0 = (ne0 + bx - 1) / bx; // number of tiles along i0
80+
// grid.x covers i1 and all tiles of i0: [ne01 * tiles0]
81+
// grid.y covers i2: [ne02]
82+
// grid.z covers i3: [ne03]
83+
const dim3 grid_dims((unsigned) (ne01 * tiles0), (unsigned) ne02, (unsigned) ne03);
84+
const dim3 block_dims((unsigned) bx, 1, 1);
7485

7586
pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
76-
src0->data, dst->data,
77-
ne0, ne00, ne01, ne02, ne03,
78-
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
79-
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
80-
p0, p1
81-
);
87+
src0->data, dst->data, ne0, ne00, ne01_packed, ne02, ne03, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
88+
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], p0, p1);
8289
}

0 commit comments

Comments (0)