Thireus
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎common/CMakeLists.txt‎
Lines changed: 4 additions & 1 deletion b/‎common/CMakeLists.txt‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎ggml/src/ggml-cuda/common.cuh‎
Lines changed: 34 additions & 9 deletions b/‎ggml/src/ggml-cuda/common.cuh‎
Lines changed: 34 additions & 9 deletions
@@ -92,7 +92,7 @@ option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_
 
 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_OPENSSL    "llama: use openssl to download model from an URL" ON)
+option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
 # Required for relocatable CMake package
 
@@ -87,11 +87,13 @@ if (LLAMA_CURL)
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
+endif()
 
-elseif (LLAMA_OPENSSL)
+if (LLAMA_OPENSSL)
     find_package(OpenSSL)
     if (OpenSSL_FOUND)
         include(CheckCSourceCompiles)
+        set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
         set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
         check_c_source_compiles("
         #include <openssl/opensslv.h>
@@ -106,6 +108,7 @@ elseif (LLAMA_OPENSSL)
         #endif
         int main() { return 0; }
         " OPENSSL_VERSION_SUPPORTED)
+        set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
         if (OPENSSL_VERSION_SUPPORTED)
             message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
             target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
 
@@ -586,17 +586,42 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
 #endif // defined(GGML_USE_HIP) && (defined(RDNA2)  || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
 }
 
+static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
+#ifdef FAST_FP16_AVAILABLE
+    acc += v*u;
+#else
+    const float2 tmpv = __half22float2(v);
+    const float2 tmpu = __half22float2(u);
+    float2 tmpacc = __half22float2(acc);
+    tmpacc.x += tmpv.x * tmpu.x;
+    tmpacc.y += tmpv.y * tmpu.y;
+    acc = make_half2(tmpacc.x, tmpacc.y);
+#endif // FAST_FP16_AVAILABLE
+}
+
 // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
-template <int nbytes>
+template <int nbytes, int alignment = 0>
 static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
-    if constexpr (nbytes == 4) {
-        *(int *) dst = *(const int *) src;
-    } else if constexpr (nbytes == 8) {
-        *(int2 *) dst = *(const int2 *) src;
-    } else if constexpr (nbytes == 16) {
-        *(int4 *) dst = *(const int4 *) src;
-    } else {
-        static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
+    if constexpr (alignment != 0) {
+        static_assert(nbytes % alignment == 0, "bad alignment");
+    }
+    constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
+
+#pragma unroll
+    for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
+        if constexpr (nb_per_cpy == 1) {
+            ((char *) dst)[i] = ((const char *) src)[i];
+        } else if constexpr (nb_per_cpy == 2) {
+            ((short *) dst)[i] = ((const short *) src)[i];
+        } else if constexpr (nb_per_cpy == 4) {
+            ((int *) dst)[i] = ((const int *) src)[i];
+        } else if constexpr (nb_per_cpy == 8) {
+            ((int2 *) dst)[i] = ((const int2 *) src)[i];
+        } else if constexpr (nb_per_cpy == 16) {
+            ((int4 *) dst)[i] = ((const int4 *) src)[i];
+        } else {
+            static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
+        }
     }
 }