
Commit 135132e

Various fixes
Add IQ3_KS to sd
Fixing a few mismerges
Update requirements.txt
Update ggml-cuda.cu
Parent commit: 0e80b11

6 files changed, 13 insertions(+), 9 deletions(-)

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 2 additions & 2 deletions
@@ -1807,8 +1807,8 @@ void ggml_compute_forward_mul_mat(
     int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
     // int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
 #if !GGML_USE_IQK_MULMAT
-    ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
-    int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
+    // ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
+    // int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
 #endif

     GGML_ASSERT(ne0 == ne01);

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
@@ -2214,12 +2214,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);

     // } else if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-    } else if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    } else if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);

-    } else if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);

     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
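
The only functional change in this hunk is the comparison against MMV_MAX_ROWS: a src0 with exactly MMV_MAX_ROWS rows used to fall through to the later cuBLAS/MMQ branches and now also takes the custom mul-mat-vec kernel. A minimal sketch of the boundary case, with a hypothetical value for MMV_MAX_ROWS (the real constant is defined elsewhere in the CUDA backend):

#include <stdbool.h>
#include <stdint.h>

#define MMV_MAX_ROWS 512   /* hypothetical value, for illustration only */

/* ne01 stands for src0->ne[1], the number of rows of src0 */
static bool takes_mmv_old(int64_t ne01, bool any_gpus_without_fp16_mma) {
    return ne01 <  MMV_MAX_ROWS || any_gpus_without_fp16_mma;   /* exactly 512 rows -> false */
}
static bool takes_mmv_new(int64_t ne01, bool any_gpus_without_fp16_mma) {
    return ne01 <= MMV_MAX_ROWS || any_gpus_without_fp16_mma;   /* exactly 512 rows -> true  */
}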

ggml/src/ggml-quants.c

Lines changed: 2 additions & 2 deletions
@@ -4999,12 +4999,12 @@ size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
     return nrow * nblock * sizeof(block_iq1_m);
 }

-void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k) {
+void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k) {
     int nblock = k/QK_K;
     float qw[QK_K];
     for (int j = 0; j < QK_K; ++j) qw[j] = 1;
     for (int ibl = 0; ibl < nblock; ++ibl) {
-        quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+        quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
     }
 }
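
With the block type and the per-block call corrected, the reference path now actually produces IQ1_M blocks. A minimal usage sketch, assuming k is a multiple of QK_K and that ggml-quants.h is on the include path (names other than the ggml ones are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include "ggml-quants.h"   /* block_iq1_m, QK_K, quantize_row_iq1_m_ref */

/* Quantize one row of k floats into IQ1_M blocks; k must be a multiple of QK_K. */
static block_iq1_m * quantize_row_iq1_m_example(const float * x, int64_t k) {
    int64_t nblock = k / QK_K;
    block_iq1_m * y = malloc(nblock * sizeof(block_iq1_m));
    if (y) {
        quantize_row_iq1_m_ref(x, y, k);   /* each QK_K chunk goes through quantize_iq1_m */
    }
    return y;   /* caller frees */
}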

ggml/src/ggml-quants.h

Lines changed: 2 additions & 2 deletions
@@ -38,8 +38,8 @@ GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K6
 GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);

-GGML_API void quantize_row_iq1_s_ref(const float * GGML_RESTRICT x, block_iq2_xxs * GGML_RESTRICT y, int64_t k);
-GGML_API void quantize_row_iq1_m_ref(const float * GGML_RESTRICT x, block_iq2_xs * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq1_s_ref(const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq1_m_ref(const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq2_xxs_ref(const float * GGML_RESTRICT x, block_iq2_xxs * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq2_xs_ref (const float * GGML_RESTRICT x, block_iq2_xs * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
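
The header now matches the definitions in ggml-quants.c; with the old prototypes, the block type in the declaration disagreed with the one in the definition, which is a conflicting-types error in C as soon as both are visible in one translation unit. A reduced, hypothetical example of that failure mode (the names below are stand-ins, not ggml types):

#include <stdint.h>

typedef struct { int a; } block_foo;   /* stand-in block types */
typedef struct { int b; } block_bar;

void quantize_row_foo_ref(const float * x, block_bar * y, int64_t k);      /* declaration with the wrong block type...          */
void quantize_row_foo_ref(const float * x, block_foo * y, int64_t k) { }   /* ...clashes with the definition: conflicting types */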

otherarch/sdcpp/stable-diffusion.h

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@ enum sd_type_t {
     SD_TYPE_IQ2_KT = 153,
     SD_TYPE_IQ3_KT = 154,
     SD_TYPE_IQ4_KT = 155,
+    SD_TYPE_IQ3_KS = 156,

     SD_TYPE_IQ3_KS_V1 = 196,
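
A hedged sketch of where the new enum value typically has to be handled (a hypothetical helper, not something from stable-diffusion.h): any switch over sd_type_t, such as one mapping a type id to a display name, gains a case for the new member.

/* hypothetical helper, for illustration only */
static const char * sd_type_name_sketch(enum sd_type_t t) {
    switch (t) {
        case SD_TYPE_IQ2_KT: return "iq2_kt";
        case SD_TYPE_IQ3_KT: return "iq3_kt";
        case SD_TYPE_IQ4_KT: return "iq4_kt";
        case SD_TYPE_IQ3_KS: return "iq3_ks";   /* new entry from this commit (id 156) */
        default:             return "other";
    }
}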

requirements.txt

Lines changed: 4 additions & 1 deletion
@@ -1,8 +1,11 @@
 numpy>=1.26.4
 sentencepiece>=0.2.0
 transformers>=4.51.3
-gguf~=0.13.0
+gguf~=0.10.0
 customtkinter>=5.2.2
 protobuf>=4.21.12
 psutil>=6.1.1
 darkdetect>=0.8.0
+pdfplumber>=0.11.6
+PyMuPdf>=1.26.0
+tqdm>=4.67.1
