Skip to content

Commit 46dc1fd

Browse files
authored
Merge b3487
b3487
2 parents 54ef11a + 439b3fc commit 46dc1fd

File tree

10 files changed

+502
-375
lines changed

10 files changed

+502
-375
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,8 @@ jobs:
860860
mkdir build
861861
cd build
862862
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
863-
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
863+
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
864+
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
864865
865866
- name: Determine tag name
866867
id: tag

ggml/src/ggml-cuda/common.cuh

Lines changed: 4 additions & 374 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cuda/vendors/cuda.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
#include <cuda.h>
5+
#include <cublas_v2.h>
6+
#include <cuda_fp16.h>
7+
8+
#if CUDART_VERSION < 11020
9+
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
10+
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
11+
#define CUBLAS_COMPUTE_16F CUDA_R_16F
12+
#define CUBLAS_COMPUTE_32F CUDA_R_32F
13+
#define cublasComputeType_t cudaDataType_t
14+
#endif // CUDART_VERSION < 11020

ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#pragma once
2+
3+
#include <hip/hip_runtime.h>
4+
#include <hipblas/hipblas.h>
5+
#include <hip/hip_fp16.h>
6+
#ifdef __HIP_PLATFORM_AMD__
7+
// for rocblas_initialize()
8+
#include "rocblas/rocblas.h"
9+
#endif // __HIP_PLATFORM_AMD__
10+
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
11+
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
12+
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
13+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
14+
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
15+
#define CUBLAS_OP_N HIPBLAS_OP_N
16+
#define CUBLAS_OP_T HIPBLAS_OP_T
17+
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
18+
#define CUBLAS_TF32_TENSOR_OP_MATH 0
19+
#define CUDA_R_16F HIPBLAS_R_16F
20+
#define CUDA_R_32F HIPBLAS_R_32F
21+
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
22+
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
23+
#define cublasCreate hipblasCreate
24+
#define cublasDestroy hipblasDestroy
25+
#define cublasGemmEx hipblasGemmEx
26+
#define cublasGemmBatchedEx hipblasGemmBatchedEx
27+
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
28+
#define cublasHandle_t hipblasHandle_t
29+
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
30+
#define cublasSetStream hipblasSetStream
31+
#define cublasSgemm hipblasSgemm
32+
#define cublasStatus_t hipblasStatus_t
33+
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
34+
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
35+
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
36+
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
37+
#define cudaDeviceProp hipDeviceProp_t
38+
#define cudaDeviceSynchronize hipDeviceSynchronize
39+
#define cudaError_t hipError_t
40+
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
41+
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
42+
#define cudaEventCreateWithFlags hipEventCreateWithFlags
43+
#define cudaEventDisableTiming hipEventDisableTiming
44+
#define cudaEventRecord hipEventRecord
45+
#define cudaEventSynchronize hipEventSynchronize
46+
#define cudaEvent_t hipEvent_t
47+
#define cudaEventDestroy hipEventDestroy
48+
#define cudaFree hipFree
49+
#define cudaFreeHost hipHostFree
50+
#define cudaGetDevice hipGetDevice
51+
#define cudaGetDeviceCount hipGetDeviceCount
52+
#define cudaGetDeviceProperties hipGetDeviceProperties
53+
#define cudaGetErrorString hipGetErrorString
54+
#define cudaGetLastError hipGetLastError
55+
#define cudaHostRegister hipHostRegister
56+
#define cudaHostRegisterPortable hipHostRegisterPortable
57+
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
58+
#define cudaHostUnregister hipHostUnregister
59+
#define cudaLaunchHostFunc hipLaunchHostFunc
60+
#define cudaMalloc hipMalloc
61+
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
62+
#define cudaMemcpy hipMemcpy
63+
#define cudaMemcpyAsync hipMemcpyAsync
64+
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
65+
#define cudaMemcpy2DAsync hipMemcpy2DAsync
66+
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
67+
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
68+
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
69+
#define cudaMemcpyKind hipMemcpyKind
70+
#define cudaMemset hipMemset
71+
#define cudaMemsetAsync hipMemsetAsync
72+
#define cudaMemGetInfo hipMemGetInfo
73+
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
74+
#define cudaSetDevice hipSetDevice
75+
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
76+
#define cudaStreamDestroy hipStreamDestroy
77+
#define cudaStreamFireAndForget hipStreamFireAndForget
78+
#define cudaStreamNonBlocking hipStreamNonBlocking
79+
#define cudaStreamPerThread hipStreamPerThread
80+
#define cudaStreamSynchronize hipStreamSynchronize
81+
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
82+
#define cudaStream_t hipStream_t
83+
#define cudaSuccess hipSuccess
84+
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
85+
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
86+
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
87+
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
88+
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
89+
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
90+
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
91+
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
92+
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
93+
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
94+
95+
#define __CUDA_ARCH__ 1300
96+
97+
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
98+
defined(__gfx1150__) || defined(__gfx1151__)
99+
#define RDNA3
100+
#endif
101+
102+
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
103+
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
104+
#define RDNA2
105+
#endif
106+
107+
#if defined(__gfx1010__) || defined(__gfx1012__)
108+
#define RDNA1
109+
#endif
110+
111+
#ifndef __has_builtin
112+
#define __has_builtin(x) 0
113+
#endif
114+
115+
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
116+
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
117+
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
118+
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
119+
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
120+
#if __has_builtin(__builtin_elementwise_sub_sat)
121+
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
122+
return reinterpret_cast<const int &>(c);
123+
#else
124+
int8x4_t c;
125+
int16_t tmp;
126+
#pragma unroll
127+
for (int i = 0; i < 4; i++) {
128+
tmp = va[i] - vb[i];
129+
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
130+
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
131+
c[i] = tmp;
132+
}
133+
return reinterpret_cast<int &>(c);
134+
#endif // __has_builtin(__builtin_elementwise_sub_sat)
135+
}
136+
137+
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
138+
return __vsubss4(a, b);
139+
}
140+
141+
static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
142+
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
143+
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
144+
unsigned int c;
145+
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
146+
#pragma unroll
147+
for (int i = 0; i < 4; ++i) {
148+
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
149+
}
150+
return c;
151+
}
152+
153+
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
154+
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
155+
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
156+
unsigned int c;
157+
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
158+
#pragma unroll
159+
for (int i = 0; i < 4; ++i) {
160+
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
161+
}
162+
return c;
163+
}
164+
165+
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
166+
// __shfl_xor() for half2 was added in ROCm 5.6
167+
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
168+
typedef union half2_b32 {
169+
half2 val;
170+
int b32;
171+
} half2_b32_t;
172+
half2_b32_t tmp;
173+
tmp.val = var;
174+
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
175+
return tmp.val;
176+
}
177+
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000

ggml/src/ggml-cuda/vendors/musa.h

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#pragma once
2+
3+
#include <musa_runtime.h>
4+
#include <musa.h>
5+
#include <mublas.h>
6+
#include <musa_fp16.h>
7+
#define CUBLAS_COMPUTE_16F CUDA_R_16F
8+
#define CUBLAS_COMPUTE_32F CUDA_R_32F
9+
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
10+
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
11+
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
12+
#define CUBLAS_OP_N MUBLAS_OP_N
13+
#define CUBLAS_OP_T MUBLAS_OP_T
14+
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
15+
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
16+
#define CUDA_R_16F MUSA_R_16F
17+
#define CUDA_R_32F MUSA_R_32F
18+
#define cublasComputeType_t cudaDataType_t
19+
#define cublasCreate mublasCreate
20+
#define cublasDestroy mublasDestroy
21+
#define cublasGemmEx mublasGemmEx
22+
#define cublasGemmBatchedEx mublasGemmBatchedEx
23+
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
24+
#define cublasHandle_t mublasHandle_t
25+
#define cublasSetMathMode mublasSetMathMode
26+
#define cublasSetStream mublasSetStream
27+
#define cublasSgemm mublasSgemm
28+
#define cublasStatus_t mublasStatus_t
29+
#define cublasGetStatusString mublasStatus_to_string
30+
#define cudaDataType_t musaDataType_t
31+
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
32+
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
33+
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
34+
#define cudaDeviceProp musaDeviceProp
35+
#define cudaDeviceSynchronize musaDeviceSynchronize
36+
#define cudaError_t musaError_t
37+
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
38+
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
39+
#define cudaEventCreateWithFlags musaEventCreateWithFlags
40+
#define cudaEventDisableTiming musaEventDisableTiming
41+
#define cudaEventRecord musaEventRecord
42+
#define cudaEventSynchronize musaEventSynchronize
43+
#define cudaEvent_t musaEvent_t
44+
#define cudaEventDestroy musaEventDestroy
45+
#define cudaFree musaFree
46+
#define cudaFreeHost musaFreeHost
47+
#define cudaGetDevice musaGetDevice
48+
#define cudaGetDeviceCount musaGetDeviceCount
49+
#define cudaGetDeviceProperties musaGetDeviceProperties
50+
#define cudaGetErrorString musaGetErrorString
51+
#define cudaGetLastError musaGetLastError
52+
#define cudaHostRegister musaHostRegister
53+
#define cudaHostRegisterPortable musaHostRegisterPortable
54+
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
55+
#define cudaHostUnregister musaHostUnregister
56+
#define cudaLaunchHostFunc musaLaunchHostFunc
57+
#define cudaMalloc musaMalloc
58+
#define cudaMallocHost musaMallocHost
59+
#define cudaMemcpy musaMemcpy
60+
#define cudaMemcpyAsync musaMemcpyAsync
61+
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
62+
#define cudaMemcpy2DAsync musaMemcpy2DAsync
63+
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
64+
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
65+
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
66+
#define cudaMemcpyKind musaMemcpyKind
67+
#define cudaMemset musaMemset
68+
#define cudaMemsetAsync musaMemsetAsync
69+
#define cudaMemGetInfo musaMemGetInfo
70+
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
71+
#define cudaSetDevice musaSetDevice
72+
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
73+
#define cudaStreamDestroy musaStreamDestroy
74+
#define cudaStreamFireAndForget musaStreamFireAndForget
75+
#define cudaStreamNonBlocking musaStreamNonBlocking
76+
#define cudaStreamPerThread musaStreamPerThread
77+
#define cudaStreamSynchronize musaStreamSynchronize
78+
#define cudaStreamWaitEvent musaStreamWaitEvent
79+
#define cudaStream_t musaStream_t
80+
#define cudaSuccess musaSuccess
81+
82+
// Additional mappings for MUSA virtual memory pool
83+
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
84+
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
85+
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
86+
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
87+
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
88+
#define CUdevice MUdevice
89+
#define CUdeviceptr MUdeviceptr
90+
#define CUmemAccessDesc MUmemAccessDesc
91+
#define CUmemAllocationProp MUmemAllocationProp
92+
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
93+
#define cuDeviceGet muDeviceGet
94+
#define cuDeviceGetAttribute muDeviceGetAttribute
95+
#define cuMemAddressFree muMemAddressFree
96+
#define cuMemAddressReserve muMemAddressReserve
97+
#define cuMemCreate muMemCreate
98+
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
99+
#define cuMemMap muMemMap
100+
#define cuMemRelease muMemRelease
101+
#define cuMemSetAccess muMemSetAccess
102+
#define cuMemUnmap muMemUnmap
103+
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
104+
#define cudaFuncSetAttribute musaFuncSetAttribute
105+
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
106+
#define make_cudaExtent make_musaExtent
107+
#define make_cudaPitchedPtr make_musaPitchedPtr
108+
109+
// Additional mappings for MUSA graphs
110+
#define CUDA_SUCCESS MUSA_SUCCESS
111+
#define CUresult MUresult
112+
#define cuGetErrorString muGetErrorString
113+
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
114+
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
115+
#define cudaGraphDestroy musaGraphDestroy
116+
#define cudaGraphExecDestroy musaGraphExecDestroy
117+
#define cudaGraphExec_t musaGraphExec_t
118+
#define cudaGraphExecUpdate musaGraphExecUpdate
119+
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
120+
#define cudaGraphGetNodes musaGraphGetNodes
121+
#define cudaGraphInstantiate musaGraphInstantiate
122+
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
123+
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
124+
#define cudaGraphLaunch musaGraphLaunch
125+
#define cudaGraphNodeGetType musaGraphNodeGetType
126+
#define cudaGraphNode_t musaGraphNode_t
127+
#define cudaGraphNodeType musaGraphNodeType
128+
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
129+
#define cudaGraph_t musaGraph_t
130+
#define cudaKernelNodeParams musaKernelNodeParams
131+
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
132+
#define cudaStreamEndCapture musaStreamEndCapture
133+
134+
// XXX: Clang builtins mapping
135+
#define __vsub4 __vsub4_musa
136+
#define __vcmpeq4 __vcmpeq4_musa
137+
#define __vcmpne4 __vcmpne4_musa
138+
139+
#ifndef __has_builtin
140+
#define __has_builtin(x) 0
141+
#endif
142+
143+
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
144+
145+
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
146+
return __vsubss4(a, b);
147+
}
148+
149+
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
150+
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
151+
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
152+
unsigned int c;
153+
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
154+
#pragma unroll
155+
for (int i = 0; i < 4; ++i) {
156+
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
157+
}
158+
return c;
159+
}
160+
161+
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
162+
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
163+
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
164+
unsigned int c;
165+
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
166+
#pragma unroll
167+
for (int i = 0; i < 4; ++i) {
168+
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
169+
}
170+
return c;
171+
}

0 commit comments

Comments
 (0)