Skip to content

Commit 46aaeba

Browse files
mryvaelanhin
authored and committed
use mul_table_int4_int8
1 parent 7004727 commit 46aaeba

File tree

5 files changed

+38
-5
lines changed

5 files changed

+38
-5
lines changed

dpu/dpu_main.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
__mram_ptr float *ptable_f32_f16;
3131

32+
__host int16_t mul_table_int4_int8[1<<4][1<<8];
33+
3234
inline static float lookup_fp16_to_fp32(uint16_t f) {
3335
uint16_t s;
3436
memcpy(&s, &f, sizeof(uint16_t));
@@ -238,10 +240,12 @@ int main() {
238240
for (int i = 0; i < segment_nb_size; i++) {
239241
int sumi = 0;
240242
for (int j = 0; j < qk/2; ++j) {
241-
const int v0 = (pweight_cache[i].qs[j] & 0x0F) - 8;
242-
const int v1 = (pweight_cache[i].qs[j] >> 4) - 8;
243+
const int8_t v0 = (pweight_cache[i].qs[j] & 0x0F) - 8;
244+
const int8_t v1 = (pweight_cache[i].qs[j] >> 4) - 8;
243245

244-
sumi += (v0 * pinput_cache[i].qs[j]) + (v1 * pinput_cache[i].qs[j + qk/2]);
246+
// sumi += (v0 * pinput_cache[i].qs[j]) + (v1 * pinput_cache[i].qs[j + qk/2]);
247+
sumi += mul_table_int4_int8[v0 + 8][pinput_cache[i].qs[j] - INT8_MIN] +
248+
mul_table_int4_int8[v1 + 8][pinput_cache[i].qs[j + qk/2] - INT8_MIN];
245249
}
246250

247251
int psumf_idx = l * weight_rows_cur_thread + k / SEGMENT_PER_ROW;

examples/tensor/ts.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
#include <iomanip>
44
#include <chrono>
55

6-
#define NR_DPUS 64
6+
#define NR_DPUS 512
77
#define NR_LAYER 2
88
#define DPU_BINARY "./dpu/gemv_dpu"
99

10+
int16_t mul_table_int4_int8[1<<4][1<<8];
11+
1012
void fp_table_init(void) {
1113
for (int i = 0; i < (1 << 16); ++i) {
1214
union {
@@ -17,13 +19,22 @@ void fp_table_init(void) {
1719
}
1820
}
1921

22+
void mul_table_int4_int8_init(void) {
23+
for(int i = 0; i < (1 << 4); ++i){
24+
for(int j = 0; j< (1 << 8); ++j){
25+
mul_table_int4_int8[i][j] = (i - 8) * (j + INT8_MIN);
26+
}
27+
}
28+
}
29+
2030
#ifdef PIM_KERNEL
2131
int gemv_dpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct ggml_tensor * in_q, struct ggml_tensor * res) {
2232
uint32_t pim_offset = 0;
2333
struct dpu_set_t dpu;
2434

2535
std::chrono::high_resolution_clock::time_point ex_tp1 = std::chrono::high_resolution_clock::now();
2636

37+
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, "mul_table_int4_int8", 0, (void *)(mul_table_int4_int8), sizeof(mul_table_int4_int8), DPU_XFER_DEFAULT));
2738
//ggml_table_f32_f16 tbl is transferred to pim
2839
DPU_ASSERT(dpu_broadcast_to(context->dpu_set, DPU_MRAM_HEAP_POINTER_NAME, pim_offset, (void *)(ggml_table_f32_f16), sizeof(ggml_table_f32_f16), DPU_XFER_DEFAULT));
2940
pim_offset += sizeof(ggml_table_f32_f16);
@@ -163,6 +174,7 @@ void gemv_cpu_kernel(struct pim_context *context, struct ggml_tensor * w, struct
163174
int main(int argc, char** argv) {
164175
// init fp table for fp16 dump
165176
fp_table_init();
177+
mul_table_int4_int8_init();
166178

167179
#ifdef PIM_KERNEL
168180
// WQ-PIM allocate dpu

ggml/src/ggml.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,10 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
490490
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
491491
float ggml_table_f32_f16[1 << 16];
492492

493+
#ifdef PIM_KERNEL
494+
int16_t mul_table_int4_int8[1<<4][1<<8];
495+
#endif
496+
493497
#if defined(__ARM_ARCH)
494498
struct ggml_arm_arch_features_type {
495499
int has_neon;

include/llama.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ extern "C" {
425425

426426
#ifdef PIM_KERNEL
427427
#define NR_DPUS 512
428-
#define NR_LAYER 2
428+
#define NR_LAYER 32
429429
#define DPU_BINARY "./dpu/gemv_dpu"
430430
enum WeightId {
431431
WQ,

src/llama.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9342,6 +9342,16 @@ static struct ggml_tensor * llm_build_lora_mm(
93429342

93439343
#ifdef PIM_KERNEL
93449344
extern float ggml_table_f32_f16[1 << 16];
9345+
extern int16_t mul_table_int4_int8[1<<4][1<<8];
9346+
9347+
static void mul_table_int4_int8_init(void) {
9348+
for(int i = 0; i < (1 << 4); ++i){
9349+
for(int j = 0; j< (1 << 8); ++j){
9350+
mul_table_int4_int8[i][j] = (i - 8) * (j + INT8_MIN);
9351+
}
9352+
}
9353+
}
9354+
93459355
int load_weight2dpu(enum WeightId w_id, struct dpu_set_t dpu_set, struct llama_model *model, struct pim_meta *pim_metadata, uint32_t offset_base) {
93469356
GGML_ASSERT(w_id < WCNT);
93479357
struct dpu_set_t dpu;
@@ -9393,6 +9403,9 @@ int llama_load2dpu(struct llama_context *ctx, struct llama_model *model) {
93939403
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &pqcontext->dpu_set));
93949404
DPU_ASSERT(dpu_load(pqcontext->dpu_set, DPU_BINARY, NULL));
93959405

9406+
mul_table_int4_int8_init();
9407+
DPU_ASSERT(dpu_broadcast_to(pqcontext->dpu_set, "mul_table_int4_int8", 0, (void *)(mul_table_int4_int8), sizeof(mul_table_int4_int8), DPU_XFER_DEFAULT));
9408+
93969409
for (int uuu=0;uuu<16;uuu++) {
93979410
printf("ggml_table_f32_f16[%d]=%f\n",uuu,ggml_table_f32_f16[uuu]);
93989411
}

0 commit comments

Comments (0)