
Commit 8018e52

try fix compile issues rocm (+4 squashed commit)
Squashed commit: [9e33a33] try fix compile issues rocm [36e5eb5] try fix compile issues rocm [6ca1881] try fix compile issues rocm [b56ad2e] try fix compile issues rocm
1 parent 3170fc3 commit 8018e52

File tree

otherarch/ggml_v2-cuda-legacy.cu
otherarch/ggml_v2-cuda.cu
otherarch/ggml_v3-cuda.cu

3 files changed: +48, -19 lines


otherarch/ggml_v2-cuda-legacy.cu

Lines changed: 9 additions & 0 deletions
@@ -8,8 +8,17 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#else
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
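Context, not part of the commit: HIP encodes its version as HIP_VERSION = HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH (see hip/hip_version.h), so the 60500000 threshold in the guard above selects HIP/ROCm 6.5.0 and newer, where the HIPBLAS_COMPUTE_* names are expected to be available. A minimal sketch of that encoding, plain C++ with example version numbers, no ROCm install required:

// Sketch only: reproduce the HIP_VERSION encoding that the guard above compares against.
#include <cstdio>

int main() {
    const int major = 6, minor = 5, patch = 0;                       // example: HIP 6.5.0
    const int version = major * 10000000 + minor * 100000 + patch;
    std::printf("HIP %d.%d.%d -> HIP_VERSION %d\n", major, minor, patch, version);   // prints 60500000
    return 0;
}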

otherarch/ggml_v2-cuda.cu

Lines changed: 9 additions & 0 deletions
@@ -8,8 +8,17 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#else
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
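The non-legacy v2 backend gets the identical guard. Pulled out into a standalone translation unit, the same idea looks like the sketch below (not part of the commit, and assuming a ROCm toolchain with the hipBLAS development headers; compile with hipcc). Newer hipBLAS provides a dedicated compute-type enum mirroring cuBLAS, while older releases reuse the data-type enum for both roles:

// check_compute_type.cpp (hypothetical file): show which hipBLAS enum the
// CUDA-named macro resolves to on the installed toolchain.
#include <cstdio>
#include <hip/hip_runtime.h>      // pulls in HIP_VERSION via hip/hip_version.h
#include <hipblas/hipblas.h>

#if HIP_VERSION >= 60500000
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F    // dedicated compute-type enum
#else
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F          // data-type enum doubles as the compute type
#endif

int main() {
    std::printf("HIP_VERSION = %d, CUBLAS_COMPUTE_32F resolves to %d\n",
                (int) HIP_VERSION, (int) CUBLAS_COMPUTE_32F);
    return 0;
}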

otherarch/ggml_v3-cuda.cu

Lines changed: 30 additions & 19 deletions
@@ -9,6 +9,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
+#include <cstring>
 
 
 #if defined(GGML_USE_HIP)
@@ -19,9 +20,21 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#define cublasComputeType_t hipblasComputeType_t
+#define cudaDataType_t hipDataType
+#else
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define cublasComputeType_t hipblasDatatype_t
+#define cudaDataType_t hipblasDatatype_t
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
@@ -31,7 +44,6 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -41,7 +53,6 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -7857,7 +7868,7 @@ static void ggml_v3_cuda_op_leaky_relu(
     GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32);
 
     float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
+    std::memcpy(&negative_slope, dst->op_params, sizeof(float));
 
     leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), negative_slope, main_stream);
 
@@ -7891,7 +7902,7 @@ static void ggml_v3_cuda_op_norm(
     const int64_t nrows = ggml_v3_nrows(src0);
 
     float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
+    std::memcpy(&eps, dst->op_params, sizeof(float));
 
     norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
@@ -7977,7 +7988,7 @@ static void ggml_v3_cuda_op_rms_norm(
     const int64_t nrows = ggml_v3_nrows(src0);
 
     float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
+    std::memcpy(&eps, dst->op_params, sizeof(float));
 
     rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
@@ -8370,12 +8381,12 @@ static void ggml_v3_cuda_op_rope(
 
     // RoPE alteration for extended context
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
-    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
-    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    std::memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+    std::memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+    std::memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    std::memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    std::memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+    std::memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
@@ -8444,7 +8455,7 @@ static void ggml_v3_cuda_op_alibi(
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+    std::memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
     //GGML_V3_ASSERT(ne01 + n_past == ne00);
     GGML_V3_ASSERT(n_head == ne02);
@@ -8565,7 +8576,7 @@ static void ggml_v3_cuda_op_soft_max(
     const int64_t nrows_y = src1 ? ggml_v3_nrows(src1) : 1;
 
     float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    std::memcpy(&scale, dst->op_params, sizeof(float));
 
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 #ifdef GGML_V3_CUDA_F16
@@ -8594,7 +8605,7 @@ static void ggml_v3_cuda_op_scale(
     GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32);
 
     float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    std::memcpy(&scale, dst->op_params, sizeof(float));
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_v3_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -8613,8 +8624,8 @@ static void ggml_v3_cuda_op_clamp(
 
     float min;
     float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+    std::memcpy(&min, dst->op_params, sizeof(float));
+    std::memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_v3_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -9643,7 +9654,7 @@ static void ggml_v3_cuda_mul_mat_id(const ggml_v3_tensor * src0, const ggml_v3_t
         CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_v3_nbytes(ids), cudaMemcpyDeviceToHost, stream));
         CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
-        memcpy(ids_host.data(), ids->data, ggml_v3_nbytes(ids));
+        std::memcpy(ids_host.data(), ids->data, ggml_v3_nbytes(ids));
     }
 
     const ggml_v3_tensor_extra_gpu * src1_extra = (const ggml_v3_tensor_extra_gpu *) src1->extra;
@@ -10020,7 +10031,7 @@ static void ggml_v3_cuda_assign_buffers_impl(struct ggml_v3_tensor * tensor, boo
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_v3];
         size_t offset = 0;
         if (tensor->op == GGML_V3_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
+            std::memcpy(&offset, tensor->op_params, sizeof(size_t));
         }
         extra = ggml_v3_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device_v3] = src0_ddc + offset;
@@ -10076,7 +10087,7 @@ void ggml_v3_cuda_assign_scratch_offset(struct ggml_v3_tensor * tensor, size_t o
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_v3];
         size_t view_offset = 0;
         if (tensor->op == GGML_V3_OP_VIEW) {
-            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+            std::memcpy(&view_offset, tensor->op_params, sizeof(size_t));
         }
         extra->data_device[g_main_device_v3] = src0_ddc + view_offset;
     } else {
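The remaining hunks in this file add <cstring> and qualify the bare memcpy calls as std::memcpy. Presumably the declarations had been arriving through transitive includes before, and a newer HIP/Clang header set no longer exposes ::memcpy that way; the standard only guarantees that <cstring> declares the function in namespace std, so the qualified call compiles either way. A minimal standalone sketch of the same pattern (the bit pattern is just an example value):

// Sketch, not part of the commit: <cstring> plus std::memcpy, used here to read
// a 32-bit pattern into a float the same way the op wrappers read floats out of
// dst->op_params.
#include <cstdint>
#include <cstring>

int main() {
    const std::uint32_t bits = 0x3f800000u;       // IEEE-754 bit pattern of 1.0f
    float value = 0.0f;
    std::memcpy(&value, &bits, sizeof(float));    // well-defined type punning, unlike a pointer cast
    return value == 1.0f ? 0 : 1;                 // exit code 0 on IEEE-754 platforms
}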

0 commit comments
