dbsanfte
diff --git a/‎.github/copilot-instructions.md‎
Lines changed: 2 additions & 4 deletions b/‎.github/copilot-instructions.md‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎ggml/src/ggml-cpu/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎ggml/src/ggml-cpu/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/numa-kernels/cpy.c‎
Lines changed: 421 additions & 0 deletions b/‎ggml/src/ggml-cpu/numa-kernels/cpy.c‎
Lines changed: 421 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/numa-kernels/cpy.h‎
Lines changed: 55 additions & 0 deletions b/‎ggml/src/ggml-cpu/numa-kernels/cpy.h‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/numa-kernels/cpy.old.c‎
Lines changed: 0 additions & 254 deletions b/‎ggml/src/ggml-cpu/numa-kernels/cpy.old.c‎
Lines changed: 0 additions & 254 deletions
@@ -198,9 +198,6 @@ NUMA_GET_SOURCE_POINTER(src_data, tensor->src[0], float);
 
 // 5. Synchronization - Essential for correctness
 NUMA_BARRIER_AUTO();
-
-// 6. Early exit handling - Performance optimization
-NUMA_EARLY_EXIT_IF_NO_WORK(ctx);
 ```
 
 **🏗️ Composed Templates (Recommended for Common Patterns):**
@@ -536,10 +533,11 @@ cp tests/test-numa-mathematical-correctness-template.cpp tests/test-numa-mathema
 
 **Required tests:**
 - Multi-dimensional: TINY → GIGANTIC_16GB tensor sizes (now includes GB-scale support)
-- Multi-threading: 1, 2, 4, 6, 8, 15, 16, 31, 32, 64, 128 threads
+- Multi-strategy (use Executor methods to force each strategy): Single-thread/Single-node, Multi-thread/Single-node, and Multi-thread/Multi-node (data parallel)
 - Hardware-specific Data Parallel: Data parallel tests with all numas available on the machine using max thread counts per numa node
 - Mathematical equivalence: Exact comparison with reference
 - Add to CMake and verify with `cmake --build build --target test-numa-mathematical-correctness-YOUR_OPERATION`
+- Optionally, use `--filter <regex>` to run only the matching tests, and `--summary-only` to print just the final test-run summary.
 
 ## 🏗️ Current Architecture Status
 
 
@@ -43,6 +43,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/numa-kernels/numa-kernels.h
         ggml-cpu/numa-kernels/add.c
         ggml-cpu/numa-kernels/add.h
+        ggml-cpu/numa-kernels/cpy.c
+        ggml-cpu/numa-kernels/cpy.h
         ggml-cpu/numa-kernels/mul.c
         ggml-cpu/numa-kernels/mul.h
         ggml-cpu/numa-kernels/div.c
 
@@ -0,0 +1,55 @@
+/**
+ * @file cpy.h
+ * @brief NUMA-aware CPY/DUP kernel header with type conversion support
+ * @author David Sanftenberg
+ */
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-numa-shared.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Execute CPY operation using NUMA kernels with type conversion support
+ * 
+ * Handles tensor copying, with optional type conversion, for the following cases:
+ * - Same types (optimized memcpy path)
+ * - F32 ↔ F16 conversion
+ * - F32 ↔ BF16 conversion  
+ * - Quantized → F32 dequantization
+ * 
+ * @param work_context Tensor context (struct ggml_tensor*)
+ * @param params Compute parameters with threading info
+ * @return GGML_STATUS_SUCCESS on completion, GGML_STATUS_FAILED on error
+ */
+enum ggml_status ggml_numa_kernel_cpy_execute(void * work_context, struct ggml_compute_params * params);
+
+/**
+ * @brief Query execution strategy for CPY operations
+ * @param tensor Target tensor
+ * @return Recommended NUMA execution strategy
+ */
+ggml_numa_execution_strategy_t ggml_numa_kernel_cpy_query(const struct ggml_tensor * tensor);
+
+/**
+ * @brief Register CPY kernel with metadata
+ * @return Kernel registration information
+ */
+ggml_numa_kernel_registration_info_t ggml_numa_kernel_cpy_register(void);
+
+/**
+ * @brief Calculate work buffer size for CPY operations (unused - CPY doesn't need work buffers)
+ * @param tensor Target tensor
+ * @param total_numa_nodes Number of NUMA nodes
+ * @param total_threads Total number of threads
+ * @return Work buffer size (always 0 for CPY)
+ */
+size_t ggml_numa_kernel_cpy_work_buffer_calc(const struct ggml_tensor * tensor, int total_numa_nodes, int total_threads);
+
+#ifdef __cplusplus
+}
+#endif