44#define CUDA_Q8_0_NE_ALIGN 2048
55
66template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t >
7- static __global__ void dequantize_block (const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
8- const int i = 2 *(blockDim .x *blockIdx .x + threadIdx .x );
7+ static __global__ void dequantize_block (const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
8+ const int64_t i = 2 *(blockDim .x *blockIdx .x + threadIdx .x );
99
1010 if (i >= k) {
1111 return ;
1212 }
1313
14- const int ib = i/qk; // block index
14+ const int64_t ib = i/qk; // block index
1515 const int iqs = (i%qk)/qr; // quant index
1616 const int iybs = i - i%qk; // y block start index
1717 const int y_offset = qr == 1 ? 1 : qk/2 ;
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
2525}
2626
2727template <bool need_check>
28- static __global__ void dequantize_block_q8_0_f16 (const void * __restrict__ vx, half * __restrict__ y, const int k) {
28+ static __global__ void dequantize_block_q8_0_f16 (const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
2929#if __CUDA_ARCH__ >= CC_PASCAL
3030 constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof (int ) + WARP_SIZE;
3131
@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
6868template <typename dst_t >
6969static __global__ void dequantize_block_q4_0 (const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
7070
71- const int i = blockIdx .x ;
71+ const int64_t i = blockIdx .x ;
7272
7373 // assume 32 threads
7474 const int tid = threadIdx .x ;
7575 const int il = tid/8 ;
7676 const int ir = tid%8 ;
77- const int ib = 8 *i + ir;
77+ const int64_t ib = 8 *i + ir;
7878 if (ib >= nb32) {
7979 return ;
8080 }
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
9696template <typename dst_t >
9797static __global__ void dequantize_block_q4_1 (const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
9898
99- const int i = blockIdx .x ;
99+ const int64_t i = blockIdx .x ;
100100
101101 // assume 32 threads
102102 const int tid = threadIdx .x ;
103103 const int il = tid/8 ;
104104 const int ir = tid%8 ;
105- const int ib = 8 *i + ir;
105+ const int64_t ib = 8 *i + ir;
106106 if (ib >= nb32) {
107107 return ;
108108 }
@@ -313,14 +313,14 @@ template<typename dst_t>
313313static __global__ void dequantize_block_q6_K (const void * __restrict__ vx, dst_t * __restrict__ yy) {
314314 const block_q6_K * x = (const block_q6_K *) vx;
315315
316- const int i = blockIdx .x ;
316+ const int64_t i = blockIdx .x ;
317317#if QK_K == 256
318318
319319 // assume 64 threads - this is very slightly better than the one below
320- const int tid = threadIdx .x ;
321- const int ip = tid/32 ; // ip is 0 or 1
322- const int il = tid - 32 *ip; // 0...32
323- const int is = 8 *ip + il/16 ;
320+ const int64_t tid = threadIdx .x ;
321+ const int64_t ip = tid/32 ; // ip is 0 or 1
322+ const int64_t il = tid - 32 *ip; // 0...32
323+ const int64_t is = 8 *ip + il/16 ;
324324
325325 dst_t * y = yy + i*QK_K + 128 *ip + il;
326326
@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
337337#else
338338
339339 // assume 32 threads
340- const int tid = threadIdx .x ;
341- const int ip = tid/16 ; // 0 or 1
342- const int il = tid - 16 *ip; // 0...15
340+ const int64_t tid = threadIdx .x ;
341+ const int64_t ip = tid/16 ; // 0 or 1
342+ const int64_t il = tid - 16 *ip; // 0...15
343343
344344 dst_t * y = yy + i*QK_K + 16 *ip + il;
345345
@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
571571#endif
572572
573573template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t >
574- static void dequantize_block_cuda (const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
574+ static void dequantize_block_cuda (const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
575575 const int num_blocks = (k + 2 *CUDA_DEQUANTIZE_BLOCK_SIZE - 1 ) / (2 *CUDA_DEQUANTIZE_BLOCK_SIZE);
576576 dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0 , stream>>> (vx, y, k);
577577}
578578
579- static void dequantize_block_q8_0_f16_cuda (const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
579+ static void dequantize_block_q8_0_f16_cuda (const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
580580 const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1 ) / CUDA_Q8_0_NE_ALIGN;
581581 if (k % CUDA_Q8_0_NE_ALIGN == 0 ) {
582582 const bool need_check = false ;
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
588588}
589589
590590template <typename dst_t >
591- static void dequantize_row_q2_K_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
591+ static void dequantize_row_q2_K_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
592592 const int nb = k / QK_K;
593593#if QK_K == 256
594594 dequantize_block_q2_K<<<nb, 64 , 0 , stream>>> (vx, y);
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
598598}
599599
600600template <typename dst_t >
601- static void dequantize_row_q3_K_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
601+ static void dequantize_row_q3_K_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
602602 const int nb = k / QK_K;
603603#if QK_K == 256
604604 dequantize_block_q3_K<<<nb, 64 , 0 , stream>>> (vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
608608}
609609
610610template <typename dst_t >
611- static void dequantize_row_q4_0_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
611+ static void dequantize_row_q4_0_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
612612 const int nb32 = k / 32 ;
613613 const int nb = (k + 255 ) / 256 ;
614614 dequantize_block_q4_0<<<nb, 32 , 0 , stream>>> (vx, y, nb32);
615615}
616616
617617template <typename dst_t >
618- static void dequantize_row_q4_1_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
618+ static void dequantize_row_q4_1_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
619619 const int nb32 = k / 32 ;
620620 const int nb = (k + 255 ) / 256 ;
621621 dequantize_block_q4_1<<<nb, 32 , 0 , stream>>> (vx, y, nb32);
622622}
623623
624624template <typename dst_t >
625- static void dequantize_row_q4_K_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
625+ static void dequantize_row_q4_K_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
626626 const int nb = k / QK_K;
627627 dequantize_block_q4_K<<<nb, 32 , 0 , stream>>> (vx, y);
628628}
629629
630630template <typename dst_t >
631- static void dequantize_row_q5_K_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
631+ static void dequantize_row_q5_K_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
632632 const int nb = k / QK_K;
633633#if QK_K == 256
634634 dequantize_block_q5_K<<<nb, 64 , 0 , stream>>> (vx, y);
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
638638}
639639
640640template <typename dst_t >
641- static void dequantize_row_q6_K_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
641+ static void dequantize_row_q6_K_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
642642 const int nb = k / QK_K;
643643#if QK_K == 256
644644 dequantize_block_q6_K<<<nb, 64 , 0 , stream>>> (vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
648648}
649649
650650template <typename dst_t >
651- static void dequantize_row_iq2_xxs_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
651+ static void dequantize_row_iq2_xxs_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
652652 const int nb = k / QK_K;
653653 dequantize_block_iq2_xxs<<<nb, 32 , 0 , stream>>> (vx, y);
654654}
655655
656656template <typename dst_t >
657- static void dequantize_row_iq2_xs_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
657+ static void dequantize_row_iq2_xs_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
658658 const int nb = k / QK_K;
659659 dequantize_block_iq2_xs<<<nb, 32 , 0 , stream>>> (vx, y);
660660}
661661
662662template <typename dst_t >
663- static void dequantize_row_iq2_s_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
663+ static void dequantize_row_iq2_s_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
664664 const int nb = k / QK_K;
665665 dequantize_block_iq2_s<<<nb, 32 , 0 , stream>>> (vx, y);
666666}
667667
668668template <typename dst_t >
669- static void dequantize_row_iq3_xxs_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
669+ static void dequantize_row_iq3_xxs_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
670670 const int nb = k / QK_K;
671671 dequantize_block_iq3_xxs<<<nb, 32 , 0 , stream>>> (vx, y);
672672}
673673
674674template <typename dst_t >
675- static void dequantize_row_iq3_s_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
675+ static void dequantize_row_iq3_s_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
676676 const int nb = k / QK_K;
677677 dequantize_block_iq3_s<<<nb, 32 , 0 , stream>>> (vx, y);
678678}
679679
680680template <typename dst_t >
681- static void dequantize_row_iq1_s_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
681+ static void dequantize_row_iq1_s_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
682682 const int nb = k / QK_K;
683683 dequantize_block_iq1_s<<<nb, 32 , 0 , stream>>> (vx, y);
684684}
685685
686686template <typename dst_t >
687- static void dequantize_row_iq4_nl_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
687+ static void dequantize_row_iq4_nl_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
688688 const int nb = (k + QK_K - 1 ) / QK_K;
689689 dequantize_block_iq4_nl<<<nb, 32 , 0 , stream>>> (vx, y);
690690}
691691
692692template <typename dst_t >
693- static void dequantize_row_iq1_m_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
693+ static void dequantize_row_iq1_m_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
694694 const int nb = k / QK_K;
695695 dequantize_block_iq1_m<<<nb, 32 , 0 , stream>>> (vx, y);
696696}
697697
698698template <typename dst_t >
699- static void dequantize_row_iq4_xs_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
699+ static void dequantize_row_iq4_xs_cuda (const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
700700 const int nb = (k + QK_K - 1 ) / QK_K;
701701#if QK_K == 64
702702 dequantize_block_iq4_nl<<<nb, 32 , 0 , stream>>> (vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
706706}
707707
708708template <typename src_t , typename dst_t >
709- static __global__ void convert_unary (const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
710- const int i = blockDim .x *blockIdx .x + threadIdx .x ;
709+ static __global__ void convert_unary (const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
710+ const int64_t i = ( int64_t ) blockDim .x *blockIdx .x + threadIdx .x ;
711711
712712 if (i >= k) {
713713 return ;
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
719719}
720720
721721template <typename src_t , typename dst_t >
722- static void convert_unary_cuda (const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
722+ static void convert_unary_cuda (const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
723723 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1 ) / CUDA_DEQUANTIZE_BLOCK_SIZE;
724724 convert_unary<src_t ><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0 , stream>>> (vx, y, k);
725725}
0 commit comments