Commit 01d4b59

cuda: Add Q5_1, Q5_0, Q4_1 and Q4_0 to F32 conversion support. (#10976)

1 parent d04e716
File tree

2 files changed: +168 −0 lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 156 additions & 0 deletions
@@ -225,6 +225,91 @@ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
     memcpy(dsti->qh, &qh, sizeof(qh));
 }
 
+static __device__ void cpy_blck_q5_0_f32(const char * cxi, char * cdsti) {
+    const block_q5_0 * xi = (const block_q5_0 *) cxi;
+    float * dst = (float *) cdsti;
+    float d = xi->d; // scale factor (computed as vmax / -16 at quantization time)
+    const float shift = 16.0f;
+
+    // Safely copy the 32-bit qh value to avoid a misaligned access.
+    unsigned int qh;
+    memcpy(&qh, xi->qh, sizeof(qh));
+
+    // First half: the lower nibble of xi->qs[j] stores element j.
+    for (int j = 0; j < QK5_0/2; j++) {
+        uint8_t lower = xi->qs[j] & 0xF;
+        uint8_t high  = (qh >> j) & 1;
+        uint8_t q     = (high << 4) | lower;
+        dst[j] = ((float)q - shift) * d;
+    }
+    // Second half: the upper nibble of xi->qs[j - QK5_0/2] stores element j.
+    for (int j = QK5_0/2; j < QK5_0; j++) {
+        int k = j - QK5_0/2;
+        uint8_t lower = (xi->qs[k] >> 4) & 0xF;
+        uint8_t high  = (qh >> j) & 1;
+        uint8_t q     = (high << 4) | lower;
+        dst[j] = ((float)q - shift) * d;
+    }
+}
+
+static __device__ void cpy_blck_q5_1_f32(const char * cxi, char * cdsti) {
+    const block_q5_1 * xi = (const block_q5_1 *) cxi;
+    float * dst = (float *) cdsti;
+    float d       = xi->dm.x; // scale
+    float min_val = xi->dm.y; // minimum value
+
+    // Safely copy the 32-bit qh value to avoid a misaligned access.
+    unsigned int qh;
+    memcpy(&qh, xi->qh, sizeof(qh));
+
+    // Decode first half: the lower nibble of xi->qs[j] holds element j.
+    for (int j = 0; j < QK5_1/2; j++) {
+        uint8_t lower = xi->qs[j] & 0xF;
+        uint8_t high  = (qh >> j) & 1;
+        uint8_t q     = (high << 4) | lower;
+        dst[j] = min_val + d * (float)q;
+    }
+    // Decode second half: the upper nibble of xi->qs[j - QK5_1/2] holds element j.
+    for (int j = QK5_1/2; j < QK5_1; j++) {
+        int k = j - QK5_1/2;
+        uint8_t lower = (xi->qs[k] >> 4) & 0xF;
+        uint8_t high  = (qh >> j) & 1;
+        uint8_t q     = (high << 4) | lower;
+        dst[j] = min_val + d * (float)q;
+    }
+}
+
+static __device__ void cpy_blck_q4_0_f32(const char * cxi, char * cdsti) {
+    const block_q4_0 * xi = (const block_q4_0 *) cxi;
+    float * dst = (float *) cdsti;
+    float d = xi->d;
+    const float shift = 8.0f;
+
+    // Each byte packs two 4-bit quantized values.
+    for (int j = 0; j < QK4_0/2; j++) {
+        uint8_t q_val = xi->qs[j];
+        uint8_t q0 = q_val & 0x0F;
+        uint8_t q1 = (q_val >> 4) & 0x0F;
+        dst[j]           = ((float)q0 - shift) * d;
+        dst[j + QK4_0/2] = ((float)q1 - shift) * d;
+    }
+}
+
+static __device__ void cpy_blck_q4_1_f32(const char * cxi, char * cdsti) {
+    const block_q4_1 * xi = (const block_q4_1 *) cxi;
+    float * dst = (float *) cdsti;
+    const float d    = xi->dm.x;
+    const float vmin = xi->dm.y;
+
+    // Each byte packs two 4-bit quantized values.
+    for (int j = 0; j < QK4_1/2; ++j) {
+        uint8_t byte_val = xi->qs[j];
+        uint8_t q0 = byte_val & 0x0F;
+        uint8_t q1 = (byte_val >> 4) & 0x0F;
+        dst[j]           = vmin + d * (float)q0;
+        dst[j + QK4_1/2] = vmin + d * (float)q1;
+    }
+}
 
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
     if (x <= val[0]) return 0;
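
As a sanity check on the bit layout these kernels decode, the same logic can be run on the host. The sketch below is illustrative only: demo_block_q5_0 merely mirrors the essentials of ggml's block_q5_0 (the real struct stores d as a 16-bit ggml_half, not a float), and all names are local to the example.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    #define QK5_0 32

    // Illustrative stand-in for block_q5_0; ggml stores d as ggml_half.
    struct demo_block_q5_0 {
        float   d;           // scale (vmax / -16 at quantization time)
        uint8_t qh[4];       // high (5th) bit of each of the 32 elements
        uint8_t qs[QK5_0/2]; // two 4-bit low nibbles per byte
    };

    static void demo_dequant_q5_0(const demo_block_q5_0 * xi, float * dst) {
        unsigned int qh;
        memcpy(&qh, xi->qh, sizeof(qh)); // same misalignment-safe copy as the kernel

        for (int j = 0; j < QK5_0; j++) {
            const int     k     = j % (QK5_0/2);
            const uint8_t lower = j < QK5_0/2 ? (xi->qs[k] & 0xF) : (xi->qs[k] >> 4);
            const uint8_t q     = ((qh >> j) & 1) << 4 | lower; // 5-bit value in [0, 31]
            dst[j] = ((float) q - 16.0f) * xi->d;               // values are centered at -16
        }
    }

    int main() {
        demo_block_q5_0 b = {};
        b.d     = 0.5f;
        b.qs[0] = 0x1; // element 0: low nibble 1
        b.qh[0] = 0x1; // element 0: high bit set -> q = 17
        float out[QK5_0];
        demo_dequant_q5_0(&b, out);
        printf("%f\n", out[0]); // (17 - 16) * 0.5 = 0.5
        return 0;
    }

The q4 variants are the same idea without the qh side channel: the two nibbles of each byte hold elements j and j + QK4_0/2.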
@@ -420,6 +505,58 @@ static void ggml_cpy_f32_q5_1_cuda(
     (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_q5_1_f32_cuda(
+        const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02,
+        const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12,
+        const int nb10, const int nb11, const int nb12, const int nb13,
+        cudaStream_t stream) {
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q5_1_f32, QK5_1><<<num_blocks, 1, 0, stream>>>(
+        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_q5_0_f32_cuda(
+        const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02,
+        const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12,
+        const int nb10, const int nb11, const int nb12, const int nb13,
+        cudaStream_t stream) {
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q5_0_f32, QK5_0><<<num_blocks, 1, 0, stream>>>(
+        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_q4_1_f32_cuda(
+        const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02,
+        const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12,
+        const int nb10, const int nb11, const int nb12, const int nb13,
+        cudaStream_t stream) {
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q4_1_f32, QK4_1><<<num_blocks, 1, 0, stream>>>(
+        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_q4_0_f32_cuda(
+        const char * cx, char * cdst, const int ne,
+        const int ne00, const int ne01, const int ne02,
+        const int nb00, const int nb01, const int nb02,
+        const int nb03, const int ne10, const int ne11, const int ne12,
+        const int nb10, const int nb11, const int nb12, const int nb13,
+        cudaStream_t stream) {
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q4_0_f32, QK4_0><<<num_blocks, 1, 0, stream>>>(
+        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
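
Each wrapper launches ne single-thread blocks (<<<num_blocks, 1, 0, stream>>>), mirroring the existing q8_0-to-F32 path; the per-element index math lives in the shared cpy_q_f32 kernel template, which this diff reuses unchanged. For readers new to the ne*/nb* argument soup, the sketch below illustrates the usual ggml convention (ne* are element counts, nb* are byte strides); it is an assumption-laden simplification with local names, not the template's real body:

    #include <cstdint>

    // Illustrative: map a flat element index i to a source byte offset, given
    // element counts ne00..ne02 and byte strides nb00..nb03. For a quantized
    // source, nb00 is the byte size of one qk-element block, so the innermost
    // coordinate is applied per block (i00/qk).
    static int64_t demo_src_offset(int64_t i, int64_t qk,
                                   int64_t ne00, int64_t ne01, int64_t ne02,
                                   int64_t nb00, int64_t nb01, int64_t nb02, int64_t nb03) {
        const int64_t i03 = i / (ne00*ne01*ne02);
        const int64_t i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
        const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne00*ne01) / ne00;
        const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne00*ne01 - i01*ne00;
        return (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
    }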
@@ -488,14 +625,25 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
         ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
         ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
         ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
         ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
         ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
@@ -524,14 +672,22 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q4_0_f32, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q4_1_f32, QK4_1>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q5_0_f32, QK5_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
         return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q5_1_f32, QK5_1>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 12 additions & 0 deletions
@@ -3073,15 +3073,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
                 return true;
             }
+            if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
                 return true;
             }
+            if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
                 return true;
             }
+            if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
                 return true;
             }
+            if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                 return true;
             }
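
With the kernels, host wrappers, dispatch, and this supports_op gate in place, the new conversions are reachable through the ordinary ggml graph API. A minimal sketch, assuming the usual public entry points (ggml_init, ggml_new_tensor_1d, ggml_cpy, ggml_build_forward_expand) and leaving CUDA backend setup aside:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 64 elements = two Q5_0 blocks (QK5_0 == 32).
        struct ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_Q5_0, 64);
        struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);

        // ggml_cpy records a GGML_OP_CPY node; on the CUDA backend this type
        // pair now dispatches to ggml_cpy_q5_0_f32_cuda instead of being
        // rejected by supports_op.
        struct ggml_tensor * out = ggml_cpy(ctx, src, dst);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);

        // ... fill src with quantized data and run the graph on the CUDA
        // backend; llama.cpp's test-backend-ops harness exercises CPY type
        // pairs like this one.

        ggml_free(ctx);
        return 0;
    }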
