Adapt to the LCPP (llama.cpp) refactor merge and reinstate Q6_0

Nexesenex · Nexesenex · commit 61c1ce9959d2 · 2024-11-18T17:14:04.000+01:00
Also update stable-diffusion.h.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -69,7 +69,7 @@ set(LLAMA_SCHED_MAX_COPIES  "1" CACHE STRING "llama: max input copies for pipeli
 option(LLAMA_CUDA_ENABLE_UNIFIED_MEMORY      "llama: enable to avoid OOM in Full Offload"       OFF)
 
 option(GGML_IQK_MUL_MAT                     "ggml: use optimized iqk matrix multiplications"    OFF)
-option(GGML_USE_LLAMA_CPP_MAINLINE          "ggml: use Llama CPP mainline MatMul           "    ON)
+option(GGML_USE_LLAMA_CPP_MAINLINE          "ggml: use Llama CPP mainline MatMul           "    OFF) # OFF by default: the matching elseif branch further down no longer creates the ggml target
 
 #
 # Compile flags
@@ -85,7 +85,7 @@ find_package(Threads REQUIRED)
 add_compile_definitions(LOG_DISABLE_LOGS)
 
 file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu")
-list(APPEND GGML_SOURCES_CUDA "ggml/src/ggml-cuda.cu")
+list(APPEND GGML_SOURCES_CUDA "ggml/src/ggml-cuda/ggml-cuda.cu")
 file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu")
 list(APPEND GGML_SOURCES_CUDA ${SRCS})
 set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
@@ -329,7 +329,7 @@ if (LLAMA_HIPBLAS)
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
         file(GLOB GGML_SOURCES_ROCM "ggml/src/ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_ROCM "ggml/src/ggml-cuda.cu")
+        list(APPEND GGML_SOURCES_ROCM "ggml/src/ggml-cuda/ggml-cuda.cu")
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu")
         list(APPEND GGML_SOURCES_ROCM ${SRCS})
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu")
@@ -686,65 +686,49 @@ if (GGML_IQK_MUL_MAT)
 	set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_compile_definitions(GGML_USE_IQK_MULMAT)
 elseif (GGML_USE_LLAMA_CPP_MAINLINE)
-	add_library(ggml
-				ggml/src/ggml.c
-				ggml/include/ggml.h
-				ggml/src/ggml-cpu.c
-				ggml/include/ggml-cpu.h
-				ggml/src/ggml-alloc.c
-				ggml/include/ggml-alloc.h
-				ggml/src/ggml-backend.cpp
-				ggml/src/ggml-backend-impl.h
-				ggml/include/ggml-backend.h
-				ggml/include/ggml-cpp.h
-				ggml/src/ggml-quants.c
-				ggml/src/ggml-quants.h
-				ggml/src/llamafile/sgemm.cpp
-				ggml/src/llamafile/sgemm.h
-				ggml/src/ggml-aarch64.c
-				ggml/src/ggml-aarch64.h
-				${GGML_SOURCES_CUDA})
-	target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
-	target_compile_features(ggml PUBLIC c_std_11) # don't bump
-	target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-	set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+	# This branch used to build ggml from a different file layout (ggml/src/ggml-cpu.c,
+	# ggml/src/llamafile/sgemm.cpp, ...) than the one the default branch below compiles.
+	# It defines no `ggml` target, so every target that links `ggml` would fail much later
+	# and far less clearly. Fail at configure time instead; this also catches build trees
+	# whose CMakeCache.txt still holds the previous ON default of this option.
+	message(FATAL_ERROR "GGML_USE_LLAMA_CPP_MAINLINE is currently unsupported: no ggml target is defined for it. Reconfigure with -DGGML_USE_LLAMA_CPP_MAINLINE=OFF.")
     add_compile_definitions(GGML_USE_LLAMA_CPP_MAINLINE)
 else ()
-	add_library(ggml
-				ggml/src/ggml.c
-				ggml/include/ggml.h
-				ggml/src/ggml-cpu/ggml-cpu.c
-				ggml/include/ggml-cpu.h
-				ggml/src/ggml-alloc.c
-				ggml/include/ggml-alloc.h
-				ggml/src/ggml-backend.cpp
-				ggml/src/ggml-backend-impl.h
-				ggml/include/ggml-backend.h
-				ggml/include/ggml-cpp.h
-				ggml/src/ggml-quants.c
-				ggml/src/ggml-quants.h
-				ggml/src/ggml-cpu/llamafile/sgemm.cpp
-				ggml/src/ggml-cpu/llamafile/sgemm.h
-				ggml/src/ggml-aarch64.c
-				ggml/src/ggml-aarch64.h
-				ggml/src/ggml-threading.cpp
-				ggml/src/ggml-cpu/ggml-cpu.cpp
-				ggml/src/ggml-cpu/ggml-cpu-aarch64.c
-				ggml/src/ggml-cpu/ggml-cpu-aarch64.h
-				ggml/src/ggml-cpu/ggml-cpu-quants.c
-				ggml/src/ggml-cpu/ggml-cpu-quants.h
-				ggml/src/ggml-backend-reg.cpp
-				${GGML_SOURCES_CUDA})
-	target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
-	target_compile_features(ggml PUBLIC c_std_11) # don't bump
-	target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-	set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    add_library(ggml # default ggml target, built from the ggml/src/ggml-cpu/ source layout
+                ggml/src/ggml.c
+                ggml/include/ggml.h
+                ggml/src/ggml-cpu/ggml-cpu.c
+                ggml/include/ggml-cpu.h
+                ggml/src/ggml-alloc.c
+                ggml/include/ggml-alloc.h
+                ggml/src/ggml-backend.cpp
+                ggml/src/ggml-backend-impl.h
+                ggml/include/ggml-backend.h
+                ggml/include/ggml-cpp.h
+                ggml/src/ggml-quants.c
+                ggml/src/ggml-quants.h
+                ggml/src/ggml-cpu/llamafile/sgemm.cpp
+                ggml/src/ggml-cpu/llamafile/sgemm.h
+                ggml/src/ggml-aarch64.c
+                ggml/src/ggml-aarch64.h
+                ggml/src/ggml-threading.cpp
+                ggml/src/ggml-cpu/ggml-cpu.cpp
+                ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+                ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+                ggml/src/ggml-cpu/ggml-cpu-quants.c
+                ggml/src/ggml-cpu/ggml-cpu-quants.h
+                ggml/src/ggml-backend-reg.cpp
+                ${GGML_SOURCES_CUDA}) # CUDA sources gathered by the file(GLOB)/list(APPEND) calls earlier in this file
+    target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools) # ./ggml/src/ggml-cpu lets headers in that directory be included by bare name
+    target_compile_features(ggml PUBLIC c_std_11) # don't bump
+    target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) # NOTE(review): presumably needed because ggml ends up in the SHARED koboldcpp_* libraries - confirm
 endif()
 
 add_library(ggml_v1
             otherarch/ggml_v1.c
             otherarch/ggml_v1.h)
-target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -754,7 +754,7 @@ add_library(ggml_v2
             otherarch/ggml_v2.h
             ${GGML_V2_CUDA_SOURCES}
             ${GGML_V2_LEGACY_CUDA_SOURCES})
-target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -763,7 +763,7 @@ add_library(ggml_v3
             otherarch/ggml_v3.c
             otherarch/ggml_v3.h
             ${GGML_V3_CUDA_SOURCES})
-target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -780,36 +780,36 @@ add_library(common2
             src/unicode.h
             src/unicode.cpp
             src/unicode-data.cpp)
-target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 add_library(sdtype_adapter
             otherarch/sdcpp/sdtype_adapter.cpp)
-target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(sdtype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 add_library(whisper_adapter
             otherarch/whispercpp/whisper_adapter.cpp)
-target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
+target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common)
 target_compile_features(whisper_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 if (LLAMA_CUBLAS)
     set(TARGET koboldcpp_cublas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
@@ -821,7 +821,7 @@ endif()
 if (LLAMA_HIPBLAS)
     set(TARGET koboldcpp_hipblas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -707,6 +707,10 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
+void quantize_row_q6_0(const float * restrict x, void * restrict y, int64_t k) { // CPU-backend entry point for Q6_0 quantization
+    quantize_row_q6_0_ref(x, y, k); // no SIMD variant here: forwards all arguments unchanged to the reference routine
+}
+
 void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -3328,6 +3332,44 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     *s = sumf;
 }
 
+// Dot product of n Q6_0 weights (vx) with n quantized activations (vy); the result is stored in *s.
+// With GGML_USE_IQK_MULMAT the work is handed to iqk_mul_mat (bs, bx, by and nrc are only used there);
+// if that path is compiled out or declines, a portable scalar fallback computes one dot product.
+void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+#ifdef __AVX2__
+    // NOTE(review): type_traits_cpu registers GGML_TYPE_Q8_0 as the vec_dot_type of Q6_0, so vy is presumably Q8_0 data even though it is described as Q8_1 here - confirm.
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
+#else
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_0;
+#endif
+    if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
+        return;
+    }
+#endif
+    // Portable fallback (previously a TODO that stored 0, i.e. silently wrong results without IQK).
+    // Assumes a Q6_0 block holds QK8_0 values, i.e. pairs 1:1 with a Q8_0 block - TODO confirm.
+    assert(n % QK8_0 == 0);
+    assert(nrc == 1);
+    (void) bs; (void) bx; (void) by; (void) nrc;
+
+    const block_q6_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+    const int nb = n / QK8_0;
+
+    float tmp[QK8_0];   // one dequantized Q6_0 block
+    float sumf = 0.0f;
+    for (int i = 0; i < nb; ++i) {
+        dequantize_row_q6_0(x + i, tmp, QK8_0);
+        float acc = 0.0f;
+        for (int j = 0; j < QK8_0; ++j) {
+            acc += tmp[j] * (float) y[i].qs[j];
+        }
+        sumf += acc * GGML_FP16_TO_FP32(y[i].d);   // apply the Q8_0 block scale once per block
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -18,6 +18,7 @@ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -38,6 +39,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -301,6 +301,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
     },
     [GGML_TYPE_Q6_0] = {
+        .from_float               = quantize_row_q6_0,
         .vec_dot                  = ggml_vec_dot_q6_0_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
@@ -2130,6 +2130,10 @@ static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restri
 }
 
 size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q6_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
+    }
     size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
     char * qrow = (char *)dst;
     for (int64_t row = 0; row < nrow; ++row) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
@@ -30,7 +30,7 @@ GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K *
 GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
+// GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 
 
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -22,8 +22,9 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "ggml-quants.h"
+#include "ggml-threading.h"
 #include "ggml.h"
+#include "ggml-quants.h"
 #include "ggml-aarch64.h"
 #endif
 
@@ -670,7 +671,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_q6_0),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q6_0,
-        .from_float               = quantize_row_q6_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q6_0_ref,
     },
     [GGML_TYPE_Q8_0] = {
@@ -5065,7 +5065,6 @@ struct ggml_tensor * ggml_opt_step_adamw(
 
 ////////////////////////////////////////////////////////////////////////////////
 
-
 struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
@@ -95,6 +95,8 @@ enum sd_type_t {
     SD_TYPE_Q4_0_8_8 = 33,
     SD_TYPE_TQ1_0   = 34,
     SD_TYPE_TQ2_0   = 35,
+    // Fork-specific type with an explicit value (presumably mirroring GGML_TYPE_Q6_0 - TODO confirm); SD_TYPE_COUNT below therefore evaluates to 134.
+    SD_TYPE_Q6_0    = 133,
     SD_TYPE_COUNT,
 };