Commit 600cba7

iterate - CONT kernel (just a wrapper around CPY, as the reference is)

1 parent 1fdde7c
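
For context, "as the reference is" refers to the stock ggml CPU backend, which also implements CONT as a bare forward to the dup/CPY path. A paraphrased sketch of that reference pattern (internal function names assumed from upstream ggml-cpu, not code from this commit):

// Paraphrased sketch of the upstream ggml CPU reference (assumed internal names;
// not part of this commit): CONT simply forwards to the dup/CPY implementation.
void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);

static void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
    // Same data movement as CPY/dup; only the graph-level semantics differ.
    ggml_compute_forward_dup(params, dst);
}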

File tree

10 files changed: +670, -4 lines

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -43,6 +43,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/numa-kernels/numa-kernels.h
         ggml-cpu/numa-kernels/add.c
         ggml-cpu/numa-kernels/add.h
+        ggml-cpu/numa-kernels/cont.c
+        ggml-cpu/numa-kernels/cont.h
         ggml-cpu/numa-kernels/cpy.c
         ggml-cpu/numa-kernels/cpy.h
         ggml-cpu/numa-kernels/get_rows.c
ggml/src/ggml-cpu/numa-kernels/cont.c (new file)

Lines changed: 45 additions & 0 deletions

/**
 * @file cont.c
 * @brief NUMA CONT kernel implementation as thin wrapper around CPY
 *
 * CONT operations in ggml are identical to CPY operations in terms of actual
 * computation - both copy data from source to destination. The difference is
 * only semantic in the graph representation. This implementation leverages
 * the sophisticated CPY kernel as a thin wrapper.
 *
 * @author David Sanftenberg
 * @date 2024
 */

#include "cont.h"
#include "cpy.h"  // Import CPY kernel functions
#include "numa-kernels.h"
#include "ggml-numa-openmp-coordinator.h"

/**
 * @brief NUMA CONT kernel execution function - thin wrapper around CPY
 *
 * CONT operations are semantically identical to CPY operations in terms of
 * data movement. This implementation delegates to the sophisticated CPY kernel
 * which handles all quantization types, optimization strategies, and NUMA-aware
 * execution patterns.
 *
 * @param work_context Tensor to process (cast to ggml_tensor*)
 * @param params Compute parameters with thread info
 * @return GGML_STATUS_SUCCESS on success, error code on failure
 */
enum ggml_status ggml_numa_kernel_cont_execute(void * work_context, struct ggml_compute_params * params) {
    // CONT is identical to CPY in terms of data movement
    // Delegate to the sophisticated CPY kernel implementation
    return ggml_numa_kernel_cpy_execute(work_context, params);
}

// Use the new streamlined registration system - CONT as standard operation
NUMA_KERNEL_REGISTER_METADATA(
    cont,                              // op_name
    GGML_OP_CONT,                      // ggml_op_type
    "NUMA CONT Kernel (CPY wrapper)",  // kernel_display_name
    256,                               // threshold_single_single (same as CPY)
    512,                               // threshold_single_multi (same as CPY)
    ggml_numa_kernel_cont_execute      // execute_function
)
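
For orientation, a GGML_OP_CONT node typically enters a graph when a non-contiguous view (for example, the result of a permute) has to be materialized into contiguous memory - exactly the copy this kernel delegates to CPY. An illustrative sketch using the standard ggml public API (not code from this commit):

#include "ggml.h"

// Illustrative only: build a graph fragment that yields the GGML_OP_CONT node
// handled by the kernel above.
static struct ggml_tensor * make_cont_node(struct ggml_context * ctx) {
    struct ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 8);
    struct ggml_tensor * xt = ggml_permute(ctx, x, 1, 0, 2, 3); // non-contiguous view of x
    return ggml_cont(ctx, xt);                                  // dst->op == GGML_OP_CONT
}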
ggml/src/ggml-cpu/numa-kernels/cont.h (new file)

Lines changed: 50 additions & 0 deletions

/**
 * @file cont.h
 * @brief NUMA CONT kernel interface (thin wrapper around CPY)
 *
 * CONT operations ensure tensor data is stored contiguously in memory.
 * This implementation provides a thin wrapper around the sophisticated CPY kernel.
 *
 * @author David Sanftenberg
 * @date 2024
 */

#pragma once

#include "numa-kernels.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief NUMA CONT kernel execution function (thin wrapper around CPY)
 *
 * @param work_context Tensor to process (cast to ggml_tensor*)
 * @param params Compute parameters with thread info
 * @return GGML_STATUS_SUCCESS on success, error code on failure
 */
enum ggml_status ggml_numa_kernel_cont_execute(void * work_context, struct ggml_compute_params * params);

// ============================================================================
// Registration Functions (Auto-generated by NUMA_KERNEL_REGISTER_METADATA macro)
// ============================================================================

/**
 * @brief Query function for CONT strategy selection (auto-generated)
 */
ggml_numa_execution_strategy_t ggml_numa_kernel_cont_query(const struct ggml_tensor * tensor);

/**
 * @brief Work buffer calculation function for CONT (auto-generated)
 */
size_t ggml_numa_kernel_cont_work_buffer_calc(const struct ggml_tensor * tensor, int total_numa_nodes, int total_threads);

/**
 * @brief Registration function for CONT kernel (auto-generated)
 */
ggml_numa_kernel_registration_info_t ggml_numa_kernel_cont_register(void);

#ifdef __cplusplus
}
#endif

ggml/src/ggml-cpu/numa-kernels/cpy.c

Lines changed: 1 addition & 1 deletion

@@ -386,7 +386,7 @@ enum ggml_status ggml_numa_kernel_cpy_execute(void * work_context, struct ggml_c
 
     struct ggml_tensor * src0 = dst->src[0];
     NUMA_ASSERT(src0 != NULL, "Source tensor cannot be null");
-    NUMA_ASSERT(dst->op == GGML_OP_CPY, "Expected CPY operation");
+    NUMA_ASSERT(dst->op == GGML_OP_CPY || dst->op == GGML_OP_CONT, "Expected CPY or CONT operation");
     NUMA_ASSERT(ggml_nelements(dst) == ggml_nelements(src0), "Element count must match");
 
     // Additional validation

ggml/src/ggml-cpu/numa-kernels/glu.c

Lines changed: 3 additions & 3 deletions

@@ -154,10 +154,10 @@ enum ggml_status ggml_numa_kernel_glu_execute(void * work_context, struct ggml_c
 // ============================================================================
 
 NUMA_KERNEL_REGISTER_METADATA(
-    glu,                                   // kernel name
+    glu,                              // kernel name
     GGML_OP_GLU,                      // operation type
     "NUMA GLU Kernel",                // kernel description
-    1024,                             // single_single threshold
-    2048,                             // single_multi threshold
+    4096,                             // single_single threshold
+    8192,                             // single_multi threshold
     ggml_numa_kernel_glu_execute      // execution function
 )

ggml/src/ggml-cpu/numa-kernels/numa-kernels.c

Lines changed: 2 additions & 0 deletions

@@ -28,6 +28,7 @@
 #include "permute.h"
 #include "rms_norm.h"
 #include "cpy.h"
+#include "cont.h"
 #include "get_rows.h"
 #include "../ggml-impl.h"
 #include "../ggml-vec-numa.h"
@@ -338,6 +339,7 @@ enum ggml_status ggml_numa_kernels_init(void) {
 
     // Register data movement kernels:
     NUMA_REGISTER_KERNEL(cpy);
+    NUMA_REGISTER_KERNEL(cont);  // CONT as thin wrapper around CPY
     NUMA_REGISTER_KERNEL(get_rows);
 
     // Register reduction kernels:

ggml/src/ggml-cpu/numa-kernels/numa-kernels.h

Lines changed: 58 additions & 0 deletions

@@ -404,6 +404,64 @@ bool ggml_numa_apply_kernel_force_strategy(ggml_numa_kernel_query_result_t * res
     } \
 } while(0)
 
+/**
+ * NUMA_REGISTER_SYMLINK_KERNEL - Macro to register one operation as a symlink to another
+ *
+ * This macro allows one operation to use another operation's kernel implementation.
+ * Perfect for cases like CONT → CPY where operations have identical underlying logic.
+ *
+ * @param symlink_kname - The operation to create a symlink for (e.g., cont)
+ * @param target_kname - The target operation to symlink to (e.g., cpy)
+ * @param symlink_op_type - The GGML operation type constant for the symlink (e.g., GGML_OP_CONT)
+ *
+ * Usage example:
+ *   NUMA_REGISTER_SYMLINK_KERNEL(cont, cpy, GGML_OP_CONT);
+ *
+ * This will register GGML_OP_CONT to use the CPY kernel's functions.
+ * The target kernel must already be registered.
+ */
+#define NUMA_REGISTER_SYMLINK_KERNEL(symlink_kname, target_kname, symlink_op_type) do { \
+    /* Get the target kernel's registration info */ \
+    ggml_numa_kernel_registration_info_t target_info = ggml_numa_kernel_##target_kname##_register(); \
+    ggml_numa_kernel_query_fn_t target_query_fn = ggml_numa_kernel_##target_kname##_query; \
+    ggml_numa_kernel_work_buffer_calc_fn_t target_work_buffer_fn = NULL; \
+    /* Check if target kernel provides work buffer calculation function */ \
+    if (target_info.work_buffer_calc_fn != NULL) { \
+        target_work_buffer_fn = target_info.work_buffer_calc_fn; \
+    } \
+    /* Create symlink registration info using target's functions but symlink's operation type */ \
+    ggml_numa_kernel_registration_info_t symlink_info = {0}; /* Initialize to zero */ \
+    symlink_info.op_type = symlink_op_type; /* Use the provided operation type */ \
+    symlink_info.strategy_array = target_info.strategy_array; \
+    symlink_info.work_funcs = target_info.work_funcs; \
+    symlink_info.agg_funcs = target_info.agg_funcs; \
+    symlink_info.work_buffer_calc_fn = target_info.work_buffer_calc_fn; \
+    symlink_info.supported = target_info.supported; \
+    symlink_info.is_noop = target_info.is_noop; \
+    snprintf(symlink_info.kernel_name, sizeof(symlink_info.kernel_name), "NUMA %s → %s Symlink", \
+             #symlink_kname, #target_kname); \
+    /* Register the symlink using target's functions */ \
+    enum ggml_status symlink_result = ggml_numa_register_kernel_strategy( \
+        symlink_info.op_type, &target_info.strategy_array, \
+        &target_info.work_funcs, &target_info.agg_funcs, target_query_fn, target_work_buffer_fn, \
+        target_info.supported, target_info.is_noop); \
+    if (symlink_result != GGML_STATUS_SUCCESS) { \
+        NUMA_LOG_ERROR("Failed to register " #symlink_kname " → " #target_kname " symlink"); \
+        return symlink_result; \
+    } \
+    if (target_info.supported) { \
+        NUMA_LOG_DEBUG("🔗 Symlinked %s → %s (thresholds: %zu/%zu%s%s)", \
+            #symlink_kname, #target_kname, \
+            target_info.strategy_array.thresholds[NUMA_STRATEGY_IDX_SINGLE_SINGLE], \
+            target_info.strategy_array.thresholds[NUMA_STRATEGY_IDX_SINGLE_MULTI], \
+            target_info.is_noop ? ", no-op" : "", \
+            target_work_buffer_fn ? ", work-buffer" : ""); \
+    } else { \
+        NUMA_LOG_DEBUG("🚫 Disabled symlink %s → %s (target marked as unsupported)", \
+            #symlink_kname, #target_kname); \
+    } \
+} while(0)
+
 // =============================================================================
 // SHARED KERNEL FUNCTION MACROS - Eliminate Code Duplication
 // =============================================================================
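
The macro's doc comment above gives the intended call pattern. For illustration, a minimal sketch of how an init routine could register CONT purely as a symlink to the already-registered CPY kernel (hypothetical function name; this commit's ggml_numa_kernels_init instead calls NUMA_REGISTER_KERNEL(cont) backed by the dedicated cont.c wrapper):

// Hypothetical sketch, not what this commit ships: CONT registered as a symlink
// to CPY, following the macro's documented usage. The macro returns on failure,
// so it must run inside a function returning enum ggml_status, and the target
// kernel (cpy) must be registered first.
static enum ggml_status register_data_movement_kernels_sketch(void) {
    NUMA_REGISTER_KERNEL(cpy);                              // target registered first
    NUMA_REGISTER_SYMLINK_KERNEL(cont, cpy, GGML_OP_CONT);  // CONT reuses CPY's functions
    return GGML_STATUS_SUCCESS;
}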

tests/CMakeLists.txt

Lines changed: 6 additions & 0 deletions

@@ -227,6 +227,12 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE ggml ggml-cpu common OpenMP::Op
 # Prioritize ggml/src/ggml-cpu/ to get the full implementation headers and add ggml/src for ggml-impl.h
 target_include_directories(${LLAMA_TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cpu ${CMAKE_SOURCE_DIR}/ggml/src ${CMAKE_SOURCE_DIR}/ggml/include)
 
+# test-numa-mathematical-correctness-cont
+set(LLAMA_TEST_NAME test-numa-mathematical-correctness-cont)
+llama_build_and_test(test-numa-mathematical-correctness-cont.cpp)
+target_link_libraries(${LLAMA_TEST_NAME} PRIVATE ggml ggml-cpu common OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+target_include_directories(${LLAMA_TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/ggml/src/ggml-cpu ${CMAKE_SOURCE_DIR}/ggml/src ${CMAKE_SOURCE_DIR}/ggml/include)
+
 # test-numa-mathematical-correctness-mul
 set(LLAMA_TEST_NAME test-numa-mathematical-correctness-mul)
 llama_build_and_test(test-numa-mathematical-correctness-mul.cpp)
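
The new test's source is not part of this diff. As a rough illustration only, this is the kind of end-to-end check a CONT correctness test might perform with the public ggml API (header names and the location of ggml_graph_compute_with_ctx are assumptions based on the current ggml layout, not taken from the commit):

#include "ggml.h"
#include "ggml-cpu.h"  // assumed to declare ggml_graph_compute_with_ctx in current ggml
#include <assert.h>
#include <stddef.h>

// Illustrative sketch - NOT the contents of test-numa-mathematical-correctness-cont.cpp.
// Make a transposed (non-contiguous) view contiguous via GGML_OP_CONT and verify the
// copied values element-wise against the source tensor.
int main(void) {
    struct ggml_init_params ip = { /*.mem_size   =*/ 16u*1024*1024,
                                   /*.mem_buffer =*/ NULL,
                                   /*.no_alloc   =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    for (int i = 0; i < 8*4; ++i) ((float *) x->data)[i] = (float) i;

    struct ggml_tensor * xt = ggml_permute(ctx, x, 1, 0, 2, 3); // transposed view, not contiguous
    struct ggml_tensor * xc = ggml_cont(ctx, xt);               // GGML_OP_CONT node

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, xc);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    // After the copy, xc[j][i] must equal x[i][j].
    for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 8; ++i) {
            assert(ggml_get_f32_nd(xc, j, i, 0, 0) == ggml_get_f32_nd(x, i, j, 0, 0));
        }
    }

    ggml_free(ctx);
    return 0;
}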

tests/run-numa-tests.sh

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ BIN_DIR="$BUILD_DIR/bin"
 # Test binaries to run (in order of complexity)
 NUMA_TESTS=(
     "test-numa-mathematical-correctness-add"
+    "test-numa-mathematical-correctness-cont"
     "test-numa-mathematical-correctness-cpy"
     "test-numa-mathematical-correctness-get_rows"
     "test-numa-mathematical-correctness-rope"
