99#include < stdint.h>
1010#include < stdio.h>
1111#include < vector>
12+ #include < cstring>
1213
1314
1415#if defined(GGML_USE_HIP)
1920// for rocblas_initialize()
2021#include " rocblas/rocblas.h"
2122#endif // __HIP_PLATFORM_AMD__
23+
24+ #if HIP_VERSION >= 60500000
25+ #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
26+ #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
27+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
28+ #define cublasComputeType_t hipblasComputeType_t
29+ #define cudaDataType_t hipDataType
30+ #else
2231#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
2332#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
2433#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
34+ #define cublasComputeType_t hipblasDatatype_t
35+ #define cudaDataType_t hipblasDatatype_t
36+ #endif // HIP_VERSION >= 60500000
37+
2538#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
2639#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
2740#define CUBLAS_OP_N HIPBLAS_OP_N
3144#define CUDA_R_16F HIPBLAS_R_16F
3245#define CUDA_R_32F HIPBLAS_R_32F
3346#define __shfl_xor_sync (mask, var, laneMask, width ) __shfl_xor (var, laneMask, width)
34- #define cublasComputeType_t hipblasDatatype_t // deprecated, new hipblasComputeType_t not in 5.6
3547#define cublasCreate hipblasCreate
3648#define cublasGemmEx hipblasGemmEx
3749#define cublasGemmBatchedEx hipblasGemmBatchedEx
4153#define cublasSetStream hipblasSetStream
4254#define cublasSgemm hipblasSgemm
4355#define cublasStatus_t hipblasStatus_t
44- #define cudaDataType_t hipblasDatatype_t // deprecated, new hipblasDatatype not in 5.6
4556#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
4657#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
4758#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -7857,7 +7868,7 @@ static void ggml_v3_cuda_op_leaky_relu(
78577868 GGML_V3_ASSERT ( dst->type == GGML_V3_TYPE_F32);
78587869
78597870 float negative_slope;
7860- memcpy (&negative_slope, dst->op_params , sizeof (float ));
7871+ std:: memcpy (&negative_slope, dst->op_params , sizeof (float ));
78617872
78627873 leaky_relu_f32_cuda (src0_dd, dst_dd, ggml_v3_nelements (src0), negative_slope, main_stream);
78637874
@@ -7891,7 +7902,7 @@ static void ggml_v3_cuda_op_norm(
78917902 const int64_t nrows = ggml_v3_nrows (src0);
78927903
78937904 float eps;
7894- memcpy (&eps, dst->op_params , sizeof (float ));
7905+ std:: memcpy (&eps, dst->op_params , sizeof (float ));
78957906
78967907 norm_f32_cuda (src0_dd, dst_dd, ne00, nrows, eps, main_stream);
78977908
@@ -7977,7 +7988,7 @@ static void ggml_v3_cuda_op_rms_norm(
79777988 const int64_t nrows = ggml_v3_nrows (src0);
79787989
79797990 float eps;
7980- memcpy (&eps, dst->op_params , sizeof (float ));
7991+ std:: memcpy (&eps, dst->op_params , sizeof (float ));
79817992
79827993 rms_norm_f32_cuda (src0_dd, dst_dd, ne00, nrows, eps, main_stream);
79837994
@@ -8370,12 +8381,12 @@ static void ggml_v3_cuda_op_rope(
83708381
83718382 // RoPE alteration for extended context
83728383 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
8373- memcpy (&freq_base, (int32_t *) dst->op_params + 5 , sizeof (float ));
8374- memcpy (&freq_scale, (int32_t *) dst->op_params + 6 , sizeof (float ));
8375- memcpy (&ext_factor, (int32_t *) dst->op_params + 7 , sizeof (float ));
8376- memcpy (&attn_factor, (int32_t *) dst->op_params + 8 , sizeof (float ));
8377- memcpy (&beta_fast, (int32_t *) dst->op_params + 9 , sizeof (float ));
8378- memcpy (&beta_slow, (int32_t *) dst->op_params + 10 , sizeof (float ));
8384+ std:: memcpy (&freq_base, (int32_t *) dst->op_params + 5 , sizeof (float ));
8385+ std:: memcpy (&freq_scale, (int32_t *) dst->op_params + 6 , sizeof (float ));
8386+ std:: memcpy (&ext_factor, (int32_t *) dst->op_params + 7 , sizeof (float ));
8387+ std:: memcpy (&attn_factor, (int32_t *) dst->op_params + 8 , sizeof (float ));
8388+ std:: memcpy (&beta_fast, (int32_t *) dst->op_params + 9 , sizeof (float ));
8389+ std:: memcpy (&beta_slow, (int32_t *) dst->op_params + 10 , sizeof (float ));
83798390
83808391 const int32_t * pos = nullptr ;
83818392 if ((mode & 1 ) == 0 ) {
@@ -8444,7 +8455,7 @@ static void ggml_v3_cuda_op_alibi(
84448455 // const int n_past = ((int32_t *) dst->op_params)[0];
84458456 const int n_head = ((int32_t *) dst->op_params )[1 ];
84468457 float max_bias;
8447- memcpy (&max_bias, (int32_t *) dst->op_params + 2 , sizeof (float ));
8458+ std:: memcpy (&max_bias, (int32_t *) dst->op_params + 2 , sizeof (float ));
84488459
84498460 // GGML_V3_ASSERT(ne01 + n_past == ne00);
84508461 GGML_V3_ASSERT (n_head == ne02);
@@ -8565,7 +8576,7 @@ static void ggml_v3_cuda_op_soft_max(
85658576 const int64_t nrows_y = src1 ? ggml_v3_nrows (src1) : 1 ;
85668577
85678578 float scale = 1 .0f ;
8568- memcpy (&scale, dst->op_params , sizeof (float ));
8579+ std:: memcpy (&scale, dst->op_params , sizeof (float ));
85698580
85708581#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
85718582#ifdef GGML_V3_CUDA_F16
@@ -8594,7 +8605,7 @@ static void ggml_v3_cuda_op_scale(
85948605 GGML_V3_ASSERT ( dst->type == GGML_V3_TYPE_F32);
85958606
85968607 float scale;
8597- memcpy (&scale, dst->op_params , sizeof (float ));
8608+ std:: memcpy (&scale, dst->op_params , sizeof (float ));
85988609
85998610 scale_f32_cuda (src0_dd, dst_dd, scale, ggml_v3_nelements (src0), main_stream);
86008611 CUDA_CHECK (cudaGetLastError ());
@@ -8613,8 +8624,8 @@ static void ggml_v3_cuda_op_clamp(
86138624
86148625 float min;
86158626 float max;
8616- memcpy (&min, dst->op_params , sizeof (float ));
8617- memcpy (&max, (float *) dst->op_params + 1 , sizeof (float ));
8627+ std:: memcpy (&min, dst->op_params , sizeof (float ));
8628+ std:: memcpy (&max, (float *) dst->op_params + 1 , sizeof (float ));
86188629
86198630 clamp_f32_cuda (src0_dd, dst_dd, min, max, ggml_v3_nelements (src0), main_stream);
86208631 CUDA_CHECK (cudaGetLastError ());
@@ -9643,7 +9654,7 @@ static void ggml_v3_cuda_mul_mat_id(const ggml_v3_tensor * src0, const ggml_v3_t
96439654 CUDA_CHECK (cudaMemcpyAsync (ids_host.data (), ids_dev, ggml_v3_nbytes (ids), cudaMemcpyDeviceToHost, stream));
96449655 CUDA_CHECK (cudaStreamSynchronize (stream));
96459656 } else {
9646- memcpy (ids_host.data (), ids->data , ggml_v3_nbytes (ids));
9657+ std:: memcpy (ids_host.data (), ids->data , ggml_v3_nbytes (ids));
96479658 }
96489659
96499660 const ggml_v3_tensor_extra_gpu * src1_extra = (const ggml_v3_tensor_extra_gpu *) src1->extra ;
@@ -10020,7 +10031,7 @@ static void ggml_v3_cuda_assign_buffers_impl(struct ggml_v3_tensor * tensor, boo
1002010031 char * src0_ddc = (char *) src0_extra->data_device [g_main_device_v3];
1002110032 size_t offset = 0 ;
1002210033 if (tensor->op == GGML_V3_OP_VIEW) {
10023- memcpy (&offset, tensor->op_params , sizeof (size_t ));
10034+ std:: memcpy (&offset, tensor->op_params , sizeof (size_t ));
1002410035 }
1002510036 extra = ggml_v3_cuda_alloc_temp_tensor_extra ();
1002610037 extra->data_device [g_main_device_v3] = src0_ddc + offset;
@@ -10076,7 +10087,7 @@ void ggml_v3_cuda_assign_scratch_offset(struct ggml_v3_tensor * tensor, size_t o
1007610087 char * src0_ddc = (char *) src0_extra->data_device [g_main_device_v3];
1007710088 size_t view_offset = 0 ;
1007810089 if (tensor->op == GGML_V3_OP_VIEW) {
10079- memcpy (&view_offset, tensor->op_params , sizeof (size_t ));
10090+ std:: memcpy (&view_offset, tensor->op_params , sizeof (size_t ));
1008010091 }
1008110092 extra->data_device [g_main_device_v3] = src0_ddc + view_offset;
1008210093 } else {
0 commit comments