Add GGML_NUMA_CORE_IDS to specify which cores the NUMA migrate feature uses; with 54 threads this reaches 41 S_TG t/s, a 55% uplift

boltliu85 · boltliu85 · commit 1f3fdfb388f3 · 2025-06-25T11:14:18.000+08:00
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
@@ -350,6 +350,8 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 #ifdef GGML_USE_NUMA_MIGRATE
     GGML_API size_t ggml_backend_get_page_size(void);
+    GGML_API int ggml_backend_get_node_id(int index); // NUMA node id stored in slot `index`; index must be in [0, GGML_NUMA_MIGRATE_NODES)
+    GGML_API void ggml_backend_init_node_id(void); // fills the slot -> node id table once, from GGML_NUMA_NODE_IDS or defaulting to 0..GGML_NUMA_MIGRATE_NODES-1
 #endif
 
 #ifdef  __cplusplus
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
@@ -67,6 +67,8 @@ class numa_migrate_mapping_cache {
 
 static std::set<numa_migrate_mapping_cache> ggml_mapping_cache;
 static size_t ggml_backend_page_size = 0;
+static int ggml_backend_node_id[GGML_NUMA_MIGRATE_NODES]; // slot -> NUMA node id, filled by ggml_backend_init_node_id()
+static bool ggml_backend_node_id_inited = false; // set by ggml_backend_init_node_id() once ggml_backend_node_id is filled
 static std::mutex ggml_mapping_mutex;
 #endif
 
@@ -1718,7 +1720,7 @@ static int check_numa_pages_migration(void *addr, size_t total_size) {
     int num_nodes = GGML_NUMA_MIGRATE_NODES;
 
     for (int i = 0; i < num_nodes; ++i) {
-        int target_node = i;
+        int target_node = ggml_backend_node_id[i]; // node id configured for slice i, instead of the slice index itself
         size_t size_to_migrate = total_size / num_nodes;
 
         if (size_to_migrate > total_size - offset) {
@@ -1765,7 +1767,7 @@ static int check_numa_pages_migration(void *addr, size_t total_size) {
             (void **)malloc(num_pages_to_migrate * sizeof(void *));
         for (size_t j = 0; j < num_pages_to_migrate; j++) {
             status[j] = 0;
-            nodes[j] = i;
+            nodes[j] = target_node; // per-page node entry now carries the configured node id, not the loop index
             addr_to_migrate[j] = (void *)((char *)migrate_start_addr +
                                           j * ggml_backend_page_size);
         }
@@ -1842,7 +1844,7 @@ static int migrate_pages_multiple_nodes(void *addr, size_t total_size) {
     int num_nodes = GGML_NUMA_MIGRATE_NODES;
 
     for (int i = 0; i < num_nodes; ++i) {
-        int target_node = i;
+        int target_node = ggml_backend_node_id[i]; // node id configured for slice i, instead of the slice index itself
         size_t size_to_migrate = total_size / num_nodes;
 
         if (size_to_migrate > total_size - offset) {
@@ -1891,7 +1893,7 @@ static int migrate_pages_multiple_nodes(void *addr, size_t total_size) {
             (void **)malloc(num_pages_to_migrate * sizeof(void *));
         for (size_t j = 0; j < num_pages_to_migrate; j++) {
             status[j] = 0;
-            nodes[j] = i;
+            nodes[j] = target_node; // per-page node entry now carries the configured node id, not the loop index
             addr_to_migrate[j] = (void *)((char *)migrate_start_addr +
                                           j * ggml_backend_page_size);
         }
@@ -2244,18 +2246,103 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 #ifdef GGML_USE_NUMA_MIGRATE
 size_t ggml_backend_get_page_size(void) {
     if (ggml_backend_page_size == 0) {
         ggml_backend_page_size = sysconf(_SC_PAGE_SIZE);
     }
     return ggml_backend_page_size;
 }
+
+// Parses a comma-separated list of NUMA node ids and inclusive ranges
+// (for example "0,2-3") into node_ids, storing at most max_count entries;
+// *count receives the number of ids stored. Returns false when the copy of
+// the input cannot be allocated, when an entry is not "<id>" or
+// "<first>-<last>" with 0 <= first <= last, or when more than max_count ids
+// are listed.
+static bool parse_numa_node_ids(const char *input, int max_count, int *count, int *node_ids) {
+    *count = 0;
+    char *input_copy = strdup(input); // strtok_r writes into the string it scans
+    if (input_copy == NULL) {
+        return false;
+    }
+
+    bool ok = true;
+    char *saveptr = NULL;
+    for (char *range = strtok_r(input_copy, ",", &saveptr); ok && range != NULL;
+         range = strtok_r(NULL, ",", &saveptr)) {
+        int start = 0;
+        int end = 0;
+        // A token containing '-' must scan as "<first>-<last>", any other token as one id.
+        const bool is_range = strchr(range, '-') != NULL;
+        const int fields = sscanf(range, "%d-%d", &start, &end);
+        if (!is_range && fields == 1) {
+            end = start;
+        } else if (fields != 2) {
+            ok = false;
+            break;
+        }
+        if (start < 0 || end < start) {
+            ok = false;
+            break;
+        }
+        for (int i = start; i <= end; i++) {
+            if (*count >= max_count) { // never write past the caller's array
+                ok = false;
+                break;
+            }
+            node_ids[(*count)++] = i;
+        }
+    }
+
+    free(input_copy);
+    return ok;
+}
+
+// Fills ggml_backend_node_id once; later calls return immediately. The ids
+// come from the GGML_NUMA_NODE_IDS environment variable, which must list
+// exactly GGML_NUMA_MIGRATE_NODES ids. When it is unset or rejected, the
+// identity mapping 0..GGML_NUMA_MIGRATE_NODES-1 is used, so every slot always
+// holds a defined value.
+void ggml_backend_init_node_id(void) {
+    if (ggml_backend_node_id_inited) {
+        return;
+    }
+
+    const int expected = GGML_NUMA_MIGRATE_NODES;
+    bool configured = false;
+    const char *env_var = getenv("GGML_NUMA_NODE_IDS");
+    if (env_var != NULL) {
+        int count = 0;
+        configured = parse_numa_node_ids(env_var, expected, &count, ggml_backend_node_id) && count == expected;
+        if (!configured) {
+            fprintf(stderr, "%s: ignoring GGML_NUMA_NODE_IDS=\"%s\": expected %d valid node ids\n",
+                    __func__, env_var, expected);
+        }
+    }
+    if (!configured) {
+        // Also overwrites whatever a rejected list stored before it failed.
+        for (int node = 0; node < expected; node++) {
+            ggml_backend_node_id[node] = node;
+        }
+    }
+    ggml_backend_node_id_inited = true;
+}
+
+// Returns the NUMA node id stored in slot index of ggml_backend_node_id,
+// filling the table first if that has not happened yet.
+// index must be in [0, GGML_NUMA_MIGRATE_NODES).
+int ggml_backend_get_node_id(int index) {
+    ggml_backend_init_node_id();
+    GGML_ASSERT(index >= 0 && index < GGML_NUMA_MIGRATE_NODES);
+    return ggml_backend_node_id[index];
+}
 #endif
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 #ifdef GGML_USE_NUMA_MIGRATE
+    ggml_backend_init_node_id();
     ggml_backend_get_page_size();
-    void * data = numa_alloc_onnode(size, 0);
+    void * data = numa_alloc_onnode(size, ggml_backend_node_id[0]);
 #else
     void * data = ggml_aligned_malloc(size);
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -515,6 +515,7 @@ enum ggml_barrier_node_index {
 void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n);
 int ggml_cores_per_numa(int ith);
 int ggml_get_node_from_cpu(int ith);
+int ggml_get_start_id_in_node(int ith); // ith minus the cores_per_numa totals of the preceding nodes, i.e. its offset inside its own node's block
 #endif
 
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -524,6 +524,7 @@ struct ggml_numa_nodes {
 #ifdef GGML_USE_NUMA_MIGRATE
     int *node_num_of_cpu;
     int *cpu_core_mapping; // x logic core, y physical core
+    int logic_core_cnts; // cores counted into cores_per_numa by ggml_numa_init; ggml_barrier_numa_aware compares it with n_threads
     int cores_per_numa[GGML_NUMA_MIGRATE_NODES];
 #endif
 };
@@ -585,11 +586,117 @@ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
 }
 
 #ifdef GGML_USE_NUMA_MIGRATE
+
+// Allocates a num_nodes x max_cores table of core ids with every entry set to
+// -1, the "unused slot" marker. An allocation failure ends the process, the
+// same way ggml_numa_init reacts to a failed fopen. Release the table with
+// ggml_free_core_ids().
+static int** ggml_allocate_core_ids(int num_nodes, int max_cores) {
+    int **core_ids = malloc(num_nodes * sizeof(int *));
+    if (core_ids == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+    for (int i = 0; i < num_nodes; i++) {
+        core_ids[i] = malloc(max_cores * sizeof(int));
+        if (core_ids[i] == NULL) {
+            perror("malloc");
+            exit(EXIT_FAILURE);
+        }
+        for (int j = 0; j < max_cores; j++) {
+            core_ids[i][j] = -1;
+        }
+    }
+    return core_ids;
+}
+
+// Frees a table returned by ggml_allocate_core_ids().
+static void ggml_free_core_ids(int **core_ids, int num_nodes) {
+    for (int i = 0; i < num_nodes; i++) {
+        free(core_ids[i]);
+    }
+    free(core_ids);
+}
+
+// Parses a core list such as "0-3,8|4-7" into core_ids. '|' separates the
+// per-node lists (the n-th list fills core_ids[n]), ',' separates the entries
+// of one list, and an entry is a single core id or an inclusive range
+// "first-last". At most max_numa_nodes lists and max_cores_per_node ids per
+// list are stored; slots that are not written keep the caller's value.
+// Malformed entries and ids outside [0, g_state.numa.total_cpus) are reported
+// and skipped, because ggml_numa_init indexes arrays of total_cpus elements
+// with the parsed ids.
+static void ggml_parse_cpu_core_ids(const char *env_var, int **core_ids, int max_numa_nodes, int max_cores_per_node) {
+    char *node_copy = strdup(env_var); // strtok_r writes into the string it scans
+    if (node_copy == NULL) {
+        perror("strdup");
+        return;
+    }
+
+    char *node_ctx = NULL;
+    char *numa_node = strtok_r(node_copy, "|", &node_ctx);
+    int node_count = 0;
+
+    while (numa_node != NULL && node_count < max_numa_nodes) {
+        int core_index = 0;
+
+        // The inner scan gets its own strtok_r context instead of strtok()'s
+        // hidden global state.
+        char *range_ctx = NULL;
+        char *core_range = strtok_r(numa_node, ",", &range_ctx);
+        while (core_range != NULL && core_index < max_cores_per_node) {
+            int start = 0;
+            int end = 0;
+            // A token containing '-' must scan as "first-last", any other token as one id.
+            const int is_range = strchr(core_range, '-') != NULL;
+            const int fields = sscanf(core_range, "%d-%d", &start, &end);
+            if (!is_range && fields == 1) {
+                end = start;
+            }
+            const int valid = (fields == 2 || (!is_range && fields == 1)) &&
+                              start >= 0 && start <= end &&
+                              end < (int) g_state.numa.total_cpus;
+            if (valid) {
+                for (int i = start; i <= end && core_index < max_cores_per_node; i++) {
+                    core_ids[node_count][core_index++] = i;
+                }
+            } else {
+                fprintf(stderr, "%s: ignoring invalid core id entry '%s'\n", __func__, core_range);
+            }
+            core_range = strtok_r(NULL, ",", &range_ctx);
+        }
+        node_count++;
+        numa_node = strtok_r(NULL, "|", &node_ctx);
+    }
+
+    free(node_copy);
+}
+
 int ggml_get_node_from_cpu(int ith) {
     int cpu = g_state.numa.cpu_core_mapping[ith];
     return g_state.numa.node_num_of_cpu[cpu];
 }
 
+// Returns the position of logical index ith inside the per-node block that
+// contains it, walking the blocks in node order: the first cores_per_numa[0]
+// indices form block 0, the next cores_per_numa[1] form block 1, and so on.
+// If ith is not below the sum over all GGML_NUMA_MIGRATE_NODES blocks the
+// function asserts, and returns -1 when assert() is compiled out.
+int ggml_get_start_id_in_node(int ith) {
+    int total_cpus = 0;
+    int prev_total_cpus = 0;
+    for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
+        prev_total_cpus = total_cpus;
+        total_cpus += g_state.numa.cores_per_numa[node];
+        if (ith < total_cpus) {
+            return (ith - prev_total_cpus);
+        }
+    }
+
+    assert(0);
+    return -1;
+}
+
 int ggml_cores_per_numa(int ith) {
     int node = ggml_get_node_from_cpu(ith);
     return g_state.numa.cores_per_numa[node];
@@ -605,6 +675,11 @@ void ggml_barrier_numa_aware(struct ggml_threadpool * tp, int ith, int node_n) {
     if (n_threads == 1) {
         return;
     }
+    if (n_threads != g_state.numa.logic_core_cnts) { // thread count differs from the NUMA core mapping: use the plain barrier
+        GGML_PRINT_DEBUG("%s: n_threads: %d, g_state.numa.logic_core_cnts: %d, using ggml_barrier\n", __func__, n_threads, g_state.numa.logic_core_cnts);
+        ggml_barrier(tp);
+        return;
+    }
 
     int cores_per_numa = ggml_cores_per_numa(ith);
     int numa_nodes = GGML_NUMA_MIGRATE_NODES;
@@ -733,34 +808,60 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
 #ifdef GGML_USE_NUMA_MIGRATE
     g_state.numa.node_num_of_cpu = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
     g_state.numa.cpu_core_mapping = (int *)malloc(g_state.numa.total_cpus * sizeof(int));
-    for (uint32_t i = 0; i < g_state.numa.total_cpus; i++) {
-        g_state.numa.node_num_of_cpu[i] = numa_node_of_cpu(i);
-    }
-
-    FILE *fp = fopen("/sys/devices/system/cpu/online", "r");
-    if (fp == NULL) {
-        perror("fopen");
-        exit(EXIT_FAILURE);
-    }
-
-    int cpu0, cpu1;
     int logic_core_index = 0;
-    while (fscanf(fp, "%d", &cpu0) != EOF) {
-        cpu1 = cpu0;
-        while (fgetc(fp) == '-') {
-            fscanf(fp, "%d", &cpu1);
+
+    const char *env_var = getenv("GGML_NUMA_CORE_IDS");
+    if (env_var) {
+        int max_numa_nodes = GGML_NUMA_MIGRATE_NODES;
+        int **core_ids = ggml_allocate_core_ids(max_numa_nodes, g_state.numa.total_cpus);
+        ggml_parse_cpu_core_ids(env_var, core_ids, max_numa_nodes, g_state.numa.total_cpus);
+
+        for (int node = 0; node < max_numa_nodes; node++) {
+            for (int core = 0; core < (int)g_state.numa.total_cpus; core++) {
+                int phy_core_id = core_ids[node][core]; // -1 marks a slot the parser left unused
+                if (phy_core_id >= 0 && phy_core_id < (int)g_state.numa.total_cpus && logic_core_index < (int)g_state.numa.total_cpus) { // both tables below hold total_cpus entries
+                    g_state.numa.node_num_of_cpu[phy_core_id] = node;
+                    g_state.numa.cpu_core_mapping[logic_core_index] = phy_core_id;
+                    g_state.numa.cores_per_numa[node]++;
+                    GGML_PRINT_DEBUG("setting core ids, core: %d, logic_core_index: %d, mapping: %d, cores_per_numa: %d, node_num_of_cpu: %d\n",
+                        phy_core_id,
+                        logic_core_index,
+                        g_state.numa.cpu_core_mapping[logic_core_index],
+                        g_state.numa.cores_per_numa[node],
+                        g_state.numa.node_num_of_cpu[phy_core_id]);
+                    logic_core_index++;
+                    g_state.numa.logic_core_cnts++;
+                }
+            }
+        }
+        ggml_free_core_ids(core_ids, max_numa_nodes);
+    } else {
+        FILE *fp = fopen("/sys/devices/system/cpu/online", "r");
+        if (fp == NULL) {
+            perror("fopen");
+            exit(EXIT_FAILURE);
         }
 
-        for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
-            g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
-            int node = g_state.numa.node_num_of_cpu[cpu_index];
-            if (node < GGML_NUMA_MIGRATE_NODES) {
-                g_state.numa.cores_per_numa[node]++;
+        int cpu0, cpu1;
+        while (fscanf(fp, "%d", &cpu0) != EOF) {
+            cpu1 = cpu0;
+            while (fgetc(fp) == '-') {
+                fscanf(fp, "%d", &cpu1);
+            }
+
+            for (int cpu_index = cpu0; cpu_index <= cpu1; cpu_index++) {
+                g_state.numa.cpu_core_mapping[logic_core_index++] = cpu_index;
+                g_state.numa.node_num_of_cpu[cpu_index] = numa_node_of_cpu(cpu_index);
+                int node = g_state.numa.node_num_of_cpu[cpu_index];
+                if (node < GGML_NUMA_MIGRATE_NODES) {
+                    g_state.numa.logic_core_cnts++;
+                    g_state.numa.cores_per_numa[node]++;
+                }
             }
         }
-    }
 
-    fclose(fp);
+        fclose(fp);
+    }
 #endif
 
     if (ggml_is_numa()) {
@@ -3219,10 +3320,12 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->n_barrier_passed = 0;
 
 #ifdef GGML_USE_NUMA_MIGRATE
+        ggml_backend_init_node_id(); // make sure the slot -> node id table is filled before it is read below
         for (int node = 0; node < GGML_NUMA_MIGRATE_NODES; node++) {
-            threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
+            int node_id = ggml_backend_get_node_id(node); // NUMA node id configured for this slot
+            threadpool->n_barrier_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
             *threadpool->n_barrier_node[node] = 0;
-            threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node);
+            threadpool->n_barrier_passed_node[node] = (atomic_int *)numa_alloc_onnode(sizeof(atomic_int), node_id);
             *threadpool->n_barrier_passed_node[node] = 0;
         }
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -176,7 +176,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
 
 #ifdef GGML_USE_NUMA_MIGRATE
         for (int i = 0; i < GGML_NUMA_MIGRATE_NODES; i++) {
-            cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, i);
+            cpu_ctx->work_data_numa[i] = (uint8_t *)numa_alloc_onnode(cplan.work_size, ggml_backend_get_node_id(i)); // slot i's work buffer goes on the node id configured for slot i
             if (cpu_ctx->work_data_numa[i] == NULL) {
                 cpu_ctx->work_size = 0;
                 return GGML_STATUS_ALLOC_FAILED;
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
@@ -1250,11 +1250,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         int64_t i11_processed = 0;
 #ifdef GGML_USE_NUMA_MIGRATE
         int round_cnts = ggml_cores_per_numa(ith);
-        int start_id = ith - round_cnts * node_id;
-        if (round_cnts == 0) {
-            round_cnts = nth;
-            start_id = ith;
-        }
+        assert(round_cnts); // NOTE(review): replaces the removed round_cnts == 0 fallback; assert() does nothing under NDEBUG - confirm zero cannot occur here
+        int start_id = ggml_get_start_id_in_node(ith); // offset of ith inside its node's block of cores
 #else
         int round_cnts = nth;
         int start_id = ith;