@@ -510,7 +510,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
510510}
511511
512512template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH>
513- __global__ void kDequantizeBlockwise (float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n)
513+ __global__ void kDequantizeBlockwise (float *code, unsigned char * A, float * absmax, T *out, const int n)
514514{
515515
516516 const int n_full = gridDim .x * BLOCK_SIZE;
@@ -526,10 +526,11 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
526526
527527 __shared__ typename LoadChar::TempStorage loadchar;
528528 __shared__ typename StoreT::TempStorage storet;
529- __shared__ float smem_code[256 ];
529+ // __shared__ float smem_code[256];
530+ // float local_code[16];
530531
531- if (threadIdx .x < 256 )
532- smem_code[threadIdx .x ] = code[threadIdx .x ];
532+ // if(threadIdx.x < 256)
533+ // smem_code[threadIdx.x] = code[threadIdx.x];
533534
534535 for (unsigned int i = base_idx; i < n_full; i += gridDim .x *BLOCK_SIZE)
535536 {
@@ -539,9 +540,10 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
539540 __syncthreads ();
540541 LoadChar (loadchar).Load (&(A[i]), qvals, valid_items, 128 );
541542
543+ // load code through read-only cache via __ldg
542544 #pragma unroll NUM_PER_TH
543545 for (int j = 0 ; j < NUM_PER_TH; j++)
544- vals[j] = smem_code [qvals[j]]*local_abs_max;
546+ vals[j] = __ldg (&code [qvals[j]]) *local_abs_max;
545547
546548 __syncthreads ();
547549 StoreT (storet).Store (&(out[i]), vals, valid_items);
@@ -2798,14 +2800,14 @@ template __global__ void kQuantizeBlockwise<float, 1024, 4, 0>(float * code, flo
27982800template __global__ void kQuantizeBlockwise <half, 512 , 2 , 0 >(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
27992801template __global__ void kQuantizeBlockwise <float , 512 , 2 , 0 >(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
28002802
2801- template __global__ void kDequantizeBlockwise <half, 4096 , 1024 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
2802- template __global__ void kDequantizeBlockwise <float , 4096 , 1024 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
2803- template __global__ void kDequantizeBlockwise <half, 2048 , 512 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
2804- template __global__ void kDequantizeBlockwise <float , 2048 , 512 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
2805- template __global__ void kDequantizeBlockwise <half, 1024 , 256 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
2806- template __global__ void kDequantizeBlockwise <float , 1024 , 256 , 4 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
2807- template __global__ void kDequantizeBlockwise <half, 512 , 256 , 2 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
2808- template __global__ void kDequantizeBlockwise <float , 512 , 256 , 2 >(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
2803+ template __global__ void kDequantizeBlockwise <half, 4096 , 1024 , 4 >(float *code, unsigned char * A, float * absmax, half *out, const int n);
2804+ template __global__ void kDequantizeBlockwise <float , 4096 , 1024 , 4 >(float *code, unsigned char * A, float * absmax, float *out, const int n);
2805+ template __global__ void kDequantizeBlockwise <half, 2048 , 512 , 4 >(float *code, unsigned char * A, float * absmax, half *out, const int n);
2806+ template __global__ void kDequantizeBlockwise <float , 2048 , 512 , 4 >(float *code, unsigned char * A, float * absmax, float *out, const int n);
2807+ template __global__ void kDequantizeBlockwise <half, 1024 , 256 , 4 >(float *code, unsigned char * A, float * absmax, half *out, const int n);
2808+ template __global__ void kDequantizeBlockwise <float , 1024 , 256 , 4 >(float *code, unsigned char * A, float * absmax, float *out, const int n);
2809+ template __global__ void kDequantizeBlockwise <half, 512 , 256 , 2 >(float *code, unsigned char * A, float * absmax, half *out, const int n);
2810+ template __global__ void kDequantizeBlockwise <float , 512 , 256 , 2 >(float *code, unsigned char * A, float * absmax, float *out, const int n);
28092811
28102812
28112813
0 commit comments