-
Notifications
You must be signed in to change notification settings - Fork 13.7k
Closed
Description
Two static constexpr functions in mmq.cuh, mmq_get_dp4a_tile_x_sizes and mmq_get_mma_tile_x_k, cause the CUDA compiler to run out of heap space because of their long chains of cascaded conditional (?:) operators. I don't know whether this has been fixed in a later compiler release; I am using CUDA 11.4. Rewriting both functions in mmq.cuh with switch statements, as shown below, avoids the problem:
// Maps a ggml_type to the matching MMQ_DP4A_TXS_* tile_x_sizes value.
//
// type:  quantization type to look up.
// mmq_y: not referenced by name in this body; presumably the MMQ_DP4A_TXS_*
//        macros expand to expressions that use it -- TODO confirm against the
//        macro definitions before renaming or removing this parameter.
//
// Returns tile_x_sizes{0, 0, 0} for any type that is not listed.
//
// Implemented as a switch rather than a chain of ?: operators: a long cascade
// of conditional operators was reported to exhaust nvcc's heap (CUDA 11.4).
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
    switch (type) {
        // Types with a one-to-one macro.
        case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
        case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
        case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
        case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
        case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
        case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
        case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
        case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;

        // Types that all resolve to MMQ_DP4A_TXS_Q8_0.
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
            return MMQ_DP4A_TXS_Q8_0;

        // Types that resolve to MMQ_DP4A_TXS_Q8_0_16.
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ2_XS:
            return MMQ_DP4A_TXS_Q8_0_16;

        // Anything else has no entry.
        default:
            return tile_x_sizes{0, 0, 0};
    }
}
// Maps a ggml_type to the matching MMQ_MMA_TILE_X_K_* constant.
//
// type: quantization type to look up.
//
// Returns 0 for any type that is not listed.
//
// Implemented as a switch rather than a chain of ?: operators: a long cascade
// of conditional operators was reported to exhaust nvcc's heap (CUDA 11.4).
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
    switch (type) {
        // Types that resolve to MMQ_MMA_TILE_X_K_Q8_0.
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_IQ4_XS:
            return MMQ_MMA_TILE_X_K_Q8_0;

        // Types that resolve to MMQ_MMA_TILE_X_K_Q8_1.
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
            return MMQ_MMA_TILE_X_K_Q8_1;

        // IQ2_XS and IQ2_S use the same constant as Q3_K.
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
            return MMQ_MMA_TILE_X_K_Q3_K;

        case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
        case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K;

        // Anything else has no entry.
        default:
            return 0;
    }
}
Metadata
Metadata
Assignees
Labels
No labels