@@ -3044,23 +3044,29 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, General8bit)
30443044MAKE_kQuantizeBlockwise(half, 512 , 2 , 0 , General8bit)
30453045MAKE_kQuantizeBlockwise(half, 256 , 2 , 0 , General8bit)
30463046MAKE_kQuantizeBlockwise(half, 128 , 2 , 0 , General8bit)
3047- MAKE_kQuantizeBlockwise(half, 64 , 2 , 0 , General8bit)
3047+ #if WARP_SIZE == 32
3048+ MAKE_kQuantizeBlockwise (half, 64 , 2 , 0 , General8bit)
3049+ #endif
30483050
30493051MAKE_kQuantizeBlockwise (half, 4096 , 4 , 0 , FP4)
30503052MAKE_kQuantizeBlockwise(half, 2048 , 4 , 0 , FP4)
30513053MAKE_kQuantizeBlockwise(half, 1024 , 4 , 0 , FP4)
30523054MAKE_kQuantizeBlockwise(half, 512 , 2 , 0 , FP4)
30533055MAKE_kQuantizeBlockwise(half, 256 , 2 , 0 , FP4)
30543056MAKE_kQuantizeBlockwise(half, 128 , 2 , 0 , FP4)
3055- MAKE_kQuantizeBlockwise(half, 64 , 2 , 0 , FP4)
3057+ #if WARP_SIZE == 32
3058+ MAKE_kQuantizeBlockwise (half, 64 , 2 , 0 , FP4)
3059+ #endif
30563060
30573061MAKE_kQuantizeBlockwise (half, 4096 , 4 , 0 , NF4)
30583062MAKE_kQuantizeBlockwise(half, 2048 , 4 , 0 , NF4)
30593063MAKE_kQuantizeBlockwise(half, 1024 , 4 , 0 , NF4)
30603064MAKE_kQuantizeBlockwise(half, 512 , 2 , 0 , NF4)
30613065MAKE_kQuantizeBlockwise(half, 256 , 2 , 0 , NF4)
30623066MAKE_kQuantizeBlockwise(half, 128 , 2 , 0 , NF4)
3063- MAKE_kQuantizeBlockwise(half, 64 , 2 , 0 , NF4)
3067+ #if WARP_SIZE == 32
3068+ MAKE_kQuantizeBlockwise (half, 64 , 2 , 0 , NF4)
3069+ #endif
30643070
30653071MAKE_kQuantizeBlockwise (float , 4096 , 4 , 0 , General8bit)
30663072MAKE_kQuantizeBlockwise(float , 4096 , 4 , 1 , General8bit)
@@ -3069,23 +3075,29 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
30693075MAKE_kQuantizeBlockwise(float , 512 , 2 , 0 , General8bit)
30703076MAKE_kQuantizeBlockwise(float , 256 , 2 , 0 , General8bit)
30713077MAKE_kQuantizeBlockwise(float , 128 , 2 , 0 , General8bit)
3072- MAKE_kQuantizeBlockwise(float , 64 , 2 , 0 , General8bit)
3078+ #if WARP_SIZE == 32
3079+ MAKE_kQuantizeBlockwise (float , 64 , 2 , 0 , General8bit)
3080+ #endif
30733081
30743082MAKE_kQuantizeBlockwise (float , 4096 , 4 , 0 , FP4)
30753083MAKE_kQuantizeBlockwise(float , 2048 , 4 , 0 , FP4)
30763084MAKE_kQuantizeBlockwise(float , 1024 , 4 , 0 , FP4)
30773085MAKE_kQuantizeBlockwise(float , 512 , 2 , 0 , FP4)
30783086MAKE_kQuantizeBlockwise(float , 256 , 2 , 0 , FP4)
30793087MAKE_kQuantizeBlockwise(float , 128 , 2 , 0 , FP4)
3080- MAKE_kQuantizeBlockwise(float , 64 , 2 , 0 , FP4)
3088+ #if WARP_SIZE == 32
3089+ MAKE_kQuantizeBlockwise (float , 64 , 2 , 0 , FP4)
3090+ #endif
30813091
30823092MAKE_kQuantizeBlockwise (float , 4096 , 4 , 0 , NF4)
30833093MAKE_kQuantizeBlockwise(float , 2048 , 4 , 0 , NF4)
30843094MAKE_kQuantizeBlockwise(float , 1024 , 4 , 0 , NF4)
30853095MAKE_kQuantizeBlockwise(float , 512 , 2 , 0 , NF4)
30863096MAKE_kQuantizeBlockwise(float , 256 , 2 , 0 , NF4)
30873097MAKE_kQuantizeBlockwise(float , 128 , 2 , 0 , NF4)
3088- MAKE_kQuantizeBlockwise(float , 64 , 2 , 0 , NF4)
3098+ #if WARP_SIZE == 32
3099+ MAKE_kQuantizeBlockwise (float , 64 , 2 , 0 , NF4)
3100+ #endif
30893101
30903102MAKE_kQuantizeBlockwise (hip_bfloat16, 4096 , 4 , 0 , General8bit)
30913103MAKE_kQuantizeBlockwise(hip_bfloat16, 4096 , 4 , 1 , General8bit)
@@ -3094,23 +3106,29 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, General8bit)
30943106MAKE_kQuantizeBlockwise(hip_bfloat16, 512 , 2 , 0 , General8bit)
30953107MAKE_kQuantizeBlockwise(hip_bfloat16, 256 , 2 , 0 , General8bit)
30963108MAKE_kQuantizeBlockwise(hip_bfloat16, 128 , 2 , 0 , General8bit)
3097- MAKE_kQuantizeBlockwise(hip_bfloat16, 64 , 2 , 0 , General8bit)
3109+ #if WARP_SIZE == 32
3110+ MAKE_kQuantizeBlockwise (hip_bfloat16, 64 , 2 , 0 , General8bit)
3111+ #endif
30983112
30993113MAKE_kQuantizeBlockwise (hip_bfloat16, 4096 , 4 , 0 , FP4)
31003114MAKE_kQuantizeBlockwise(hip_bfloat16, 2048 , 4 , 0 , FP4)
31013115MAKE_kQuantizeBlockwise(hip_bfloat16, 1024 , 4 , 0 , FP4)
31023116MAKE_kQuantizeBlockwise(hip_bfloat16, 512 , 2 , 0 , FP4)
31033117MAKE_kQuantizeBlockwise(hip_bfloat16, 256 , 2 , 0 , FP4)
31043118MAKE_kQuantizeBlockwise(hip_bfloat16, 128 , 2 , 0 , FP4)
3105- MAKE_kQuantizeBlockwise(hip_bfloat16, 64 , 2 , 0 , FP4)
3119+ #if WARP_SIZE == 32
3120+ MAKE_kQuantizeBlockwise (hip_bfloat16, 64 , 2 , 0 , FP4)
3121+ #endif
31063122
31073123MAKE_kQuantizeBlockwise (hip_bfloat16, 4096 , 4 , 0 , NF4)
31083124MAKE_kQuantizeBlockwise(hip_bfloat16, 2048 , 4 , 0 , NF4)
31093125MAKE_kQuantizeBlockwise(hip_bfloat16, 1024 , 4 , 0 , NF4)
31103126MAKE_kQuantizeBlockwise(hip_bfloat16, 512 , 2 , 0 , NF4)
31113127MAKE_kQuantizeBlockwise(hip_bfloat16, 256 , 2 , 0 , NF4)
31123128MAKE_kQuantizeBlockwise(hip_bfloat16, 128 , 2 , 0 , NF4)
3113- MAKE_kQuantizeBlockwise(hip_bfloat16, 64 , 2 , 0 , NF4)
3129+ #if WARP_SIZE == 32
3130+ MAKE_kQuantizeBlockwise (hip_bfloat16, 64 , 2 , 0 , NF4)
3131+ #endif
31143132
31153133template __global__ void kDequantizeBlockwise <half, 512 , 64 , 8 , FP4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
31163134template __global__ void kDequantizeBlockwise <half, 512 , 64 , 8 , General8bit>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
0 commit comments