Use row segments to support 16 tasklets

mryvae · lanhin · commit 7004727f9081 · 2025-04-14T17:20:01.000+08:00
diff --git a/dpu/dpu_main.c b/dpu/dpu_main.c
@@ -11,13 +11,21 @@
 #include <alloc.h>
 #include <barrier.h>
 #include <seqread.h>
+#include <mutex_pool.h>
 
 #define PIM_KERNEL_DPU 1
 #include "../ggml/include/ggml.h"
 #define GGML_COMMON_DECL_C
 #include "../ggml/src/ggml-common.h"
 
 #define PRINT 0
+#define SEGMENT_PER_ROW 4
+
+// Lowest index of the rank-th group when n items are split into `size` contiguous groups
+#define BLOCK_LOW(rank, size, n) ((rank) * (n) / (size))
+
+// Highest (inclusive) index of the rank-th group: one less than the next group's lowest index
+#define BLOCK_HIGH(rank, size, n) (BLOCK_LOW((rank) + 1, (size), (n)) - 1)
 
 __mram_ptr float *ptable_f32_f16;
 
@@ -35,6 +43,7 @@ inline static float lookup_fp16_to_fp32(uint16_t f) {
 
 // Barrier
 BARRIER_INIT(my_barrier, NR_TASKLETS);
+MUTEX_POOL_INIT(g_psumf_mutex_pool, NR_TASKLETS);
 
 /*
 DPU MRAM Memory:
@@ -91,8 +100,9 @@ int wram2mram(__mram_ptr void *pmram,void *pwram,uint32_t size)
 }
 
 
-// set psumf to global value for each thread access
-static float *psumf = NULL;
+// g_psumf is a global so that every thread can access it
+static float *g_psumf = NULL;
+static block_q8_0 *g_pinput_cache = NULL;
 
 void init(unsigned int tasklet_id) {
 #if PRINT
@@ -140,9 +150,11 @@ int main() {
 #endif
 
     // set sart line, end line and line number in each thread
-    uint16_t weight_rows_per_thread = cache_meta->rows_per_dpu / NR_TASKLETS;
-    uint16_t weight_start_row = tasklet_id * weight_rows_per_thread;
-    uint16_t weight_end_row = weight_start_row + weight_rows_per_thread;
+    uint16_t segments_num = cache_meta->rows_per_dpu * SEGMENT_PER_ROW;
+    uint16_t segment_start = BLOCK_LOW(tasklet_id, NR_TASKLETS, segments_num);
+    uint16_t segment_end = BLOCK_HIGH(tasklet_id, NR_TASKLETS, segments_num);
+
+    assert(segment_start <= segment_end && "There are not enough segments to allocate to the tasklets");
 
     // todo:rest row is existed, first thread in every dpu can one more row
     uint16_t weight_rows_cur_thread;
@@ -184,83 +196,80 @@ int main() {
             return -1;
         }
         int nb = pinputcache->ne[0]/QK8_0;
+
+        assert(SEGMENT_PER_ROW <= nb && nb % SEGMENT_PER_ROW == 0 
+            && "Too many segments are allocated to each row.");
+
         int qk = QK8_0;
         input_row_size = nb*sizeof(block_q8_0);
         __mram_ptr void *pweight_base = (__mram_ptr void *)(weightmetadatabase + sizeof(struct pim_meta));
         __mram_ptr void *pinput_base = DPU_MRAM_HEAP_POINTER + cache_meta->input_offset + sizeof(pim_matrix_des);
-
+        
         if (tasklet_id == 0) {
-            psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
+            g_psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
+            g_pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0) * nb);
+            memset(g_psumf, 0 ,sizeof(float)*input_cols*weight_rows_cur_thread);
         }
-        barrier_wait(&my_barrier);
 
-        // psumf = (float *)mem_alloc(sizeof(float)*input_cols*weight_rows_cur_thread);
-        memset(psumf, 0 ,sizeof(float)*input_cols*weight_rows_cur_thread);
-        
 #if PRINT
         printf("input_cols=%d, rows_cur_thread=%d, nb=%d, input_row_size=%d\n",input_cols,weight_rows_cur_thread,nb,input_row_size);
 #endif
-        block_q4_0 *pweight_cache = (block_q4_0 *) mem_alloc(sizeof(block_q4_0)*nb);
-        block_q8_0 *pinput_cache = (block_q8_0 *) mem_alloc(sizeof(block_q8_0)*nb);
+
+        uint16_t segment_nb_size = nb / SEGMENT_PER_ROW;
+        block_q4_0 *pweight_cache = (block_q4_0 *) mem_alloc(sizeof(block_q4_0) * segment_nb_size);
 
         // weight_rows_cur_thread = 16;
         for(int l = 0;l < input_cols;l++) {
-          __mram_ptr block_q8_0 *pinput = pinput_base + l * nb * sizeof(block_q8_0);
-            mram2wram(pinput, pinput_cache, sizeof(block_q8_0)*nb);
-#if PRINT
-            printf("input:\n");
-            for (int i = 0; i < nb; i++) {
-              printf("d=%u\n",pinput[i].d);
-              for (int kkk=0;kkk<QK8_0;kkk++) {
-                printf("%d ",pinput[i].qs[kkk]);
-              }
-            printf("\n");
+            if (tasklet_id == 0) {
+                __mram_ptr block_q8_0 *pinput = pinput_base + l * nb * sizeof(block_q8_0);
+                mram2wram(pinput, g_pinput_cache, sizeof(block_q8_0)*nb);
             }
-            printf("pweight_base: %p\n", pweight_base);
-#endif
-            // for(int k = 0;k < weight_rows_cur_thread;k++) {
-            for (int k = weight_start_row; k < weight_end_row; ++k) {
-              __mram_ptr block_q4_0 *pweight = pweight_base + pinputcache->layerid * cache_meta->layer_len + k * nb * sizeof(block_q4_0);
-                mram2wram(pweight, pweight_cache, sizeof(block_q4_0)*nb);
-#if PRINT
-                if (k % 64 == 0) {
-                  printf("pweight_cache[%d].d=%d\n pweight_cache[%d].qs=", k*128, pweight_cache[0].d, k*128);
-                  for (int kkk=0;kkk<QK4_0/2;kkk++) {
-                    int v0 = (pweight_cache[0].qs[kkk] & 0x0f) - 8;
-                    int v1 = (pweight_cache[0].qs[kkk]  >> 4) - 8;
-                    printf(" %d, %d", v0, v1);
-                  }
-                  printf("\n");
-                }
-#endif
 
-                for (int i = 0; i < nb; i++) {
-                    //printf("input_col:%d, current inner weight row idx:%d\n",l,k);
+            barrier_wait(&my_barrier);
+
+            __mram_ptr block_q4_0 *pweight_addr = pweight_base + pinputcache->layerid * cache_meta->layer_len;
 
+            for (int k = segment_start; k <= segment_end; ++k) {
+                __mram_ptr block_q4_0 *pweight = pweight_addr + k * segment_nb_size;
+                mram2wram(pweight, pweight_cache, sizeof(block_q4_0) * segment_nb_size);
+
+                block_q8_0 *pinput_cache = g_pinput_cache + k % SEGMENT_PER_ROW * segment_nb_size;
+
+                for (int i = 0; i < segment_nb_size; i++) {
                     int sumi = 0;
                     for (int j = 0; j < qk/2; ++j) {
                         const int v0 = (pweight_cache[i].qs[j] & 0x0F) - 8;
                         const int v1 = (pweight_cache[i].qs[j] >>   4) - 8;
 
                         sumi += (v0 * pinput_cache[i].qs[j]) + (v1 * pinput_cache[i].qs[j + qk/2]);
                     }
-
-                    psumf[l*weight_rows_cur_thread + k] += sumi*FP16_TO_FP32(pweight_cache[i].d)*FP16_TO_FP32(pinput_cache[i].d);
+                    
+                    int psumf_idx = l * weight_rows_cur_thread + k / SEGMENT_PER_ROW;
+                    float sum = sumi * FP16_TO_FP32(pweight_cache[i].d) * FP16_TO_FP32(pinput_cache[i].d);
+                    mutex_pool_lock(&g_psumf_mutex_pool, psumf_idx);
+                    g_psumf[psumf_idx] += sum;
+                    // g_psumf[psumf_idx] += sumi;
+                    mutex_pool_unlock(&g_psumf_mutex_pool, psumf_idx);
                 }
             }
         }
     }
 
-    offset += (sizeof(pim_matrix_des) + input_row_size * input_cols);
-#if PRINT
-    for(int iii=0;iii<cache_meta->rows_per_dpu;iii+=128) {
-        printf("psumf[%d]=%f\n",iii,psumf[iii]);
+    barrier_wait(&my_barrier);
+
+    if (tasklet_id == 0){
+        offset += (sizeof(pim_matrix_des) + input_row_size * input_cols);
+        #if PRINT
+            for(int iii=0;iii<cache_meta->rows_per_dpu;iii+=128) {
+                printf("g_psumf[%d]=%f\n",iii,g_psumf[iii]);
+            }
+        
+            printf("output offset=%d\n",offset);
+        #endif
+        // Write C Matrix to current MRAM block
+        // Note: with input_cols > 1, the results should be rearranged on host
+        wram2mram((__mram_ptr void *) (DPU_MRAM_HEAP_POINTER + offset), g_psumf, sizeof(float)*input_cols*weight_rows_cur_thread);
     }
 
-    printf("output offset=%d\n",offset);
-#endif
-    // Write C Matrix to current MRAM block
-    // Note: with input_cols > 1, the results should be rearranged on host
-    wram2mram((__mram_ptr void *) (DPU_MRAM_HEAP_POINTER + offset), psumf, sizeof(float)*input_cols*weight_rows_cur_thread);
     return 0;
 }
diff --git a/dpu/pim_build.sh b/dpu/pim_build.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=8 -DBL=11 -o gemv_dpu dpu_main.c
+dpu-upmem-dpurte-clang -Wall -Wextra -O3 -DNR_TASKLETS=16 -DBL=11 -o gemv_dpu dpu_main.c
diff --git a/examples/tensor/ts.cpp b/examples/tensor/ts.cpp
@@ -3,9 +3,7 @@
 #include <iomanip>
 #include <chrono>
 
-#include <vector>
-
-#define NR_DPUS 2048
+#define NR_DPUS 64
 #define NR_LAYER 2
 #define DPU_BINARY "./dpu/gemv_dpu"
 
@@ -105,8 +103,8 @@ int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
 
   dur = ex_tp2 - ex_tp1;
 
-  std::cout << "dpu: 执行用时：" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
-  std::cout << "dpu: 执行用时：" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
+  // std::cout << "执行用时：" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " ms" << std::endl;
+  std::cout << "执行用时：" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " us" << std::endl;
 
   // Check results
   float *mul_mat_res = (float *)res->data;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -17413,12 +17413,23 @@ static int dpu_launch_gemv_async(
     uint32_t input_offset = res->inout_offset;
     dpu_set = *(res->dpu_set);
     // broadcast input metadata
+
+#if PIM_DEBUG_PERF_PRINT
+    uint64_t t_start = get_time_us();
+#endif
+
     DPU_ASSERT(dpu_broadcast_to(dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, &input_descript, sizeof(pim_matrix_des), DPU_XFER_DEFAULT));
     input_offset += sizeof(pim_matrix_des);
 
     // broadcast input data
     uint32_t bclen = ggml_row_size(vec_dot_type, input->ne[0])*input->ne[1]*input->ne[2]*input->ne[3];
     DPU_ASSERT(dpu_broadcast_to(dpu_set, DPU_MRAM_HEAP_POINTER_NAME, input_offset, wdata, bclen, DPU_XFER_DEFAULT));
+
+#if PIM_DEBUG_PERF_PRINT
+    uint64_t t_us = get_time_us() - t_start;
+    printf("\n%s: PIM broadcast time = %ld  \n", __FUNCTION__, t_us);
+#endif
+
     input_offset += bclen;
 
     res->inout_offset = input_offset;
@@ -17433,9 +17444,9 @@ static __inline__ void dpu_kernel_barrier(struct dpu_set_t dpu_set) {
     struct dpu_set_t dpu;
     dpu_sync(dpu_set);
     //打印dpu log
-    DPU_FOREACH(dpu_set, dpu) {
-        DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
-    }
+    // DPU_FOREACH(dpu_set, dpu) {
+    //     DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
+    // }
     return;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`#!/bin/bash`
`2`		`-dpu-upmem-dpurte-clang -Wall -Wextra -O2 -DNR_TASKLETS=8 -DBL=11 -o gemv_dpu dpu_main.c`
	`2`	`+dpu-upmem-dpurte-clang -Wall -Wextra -O3 -DNR_TASKLETS=16 -DBL=11 -o gemv_dpu dpu_main.c`