Skip to content

Commit 19c2ebe

Browse files
committed
issue/632 - adapt to iluvatar core 20
1 parent 4bcaa94 commit 19c2ebe

File tree

5 files changed

+24
-8
lines changed

5 files changed

+24
-8
lines changed

src/infiniop/devices/nvidia/nvidia_kernel_common.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// Possible maximum number of threads per block for CUDA architectures
1515
// Used for picking correct kernel launch configuration
1616
#define CUDA_BLOCK_SIZE_4096 4096
17+
#define CUDA_BLOCK_SIZE_2048 2048
1718
#define CUDA_BLOCK_SIZE_1024 1024
1819
#define CUDA_BLOCK_SIZE_512 512
1920

src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,18 +76,22 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
7676
const void *x,
7777
void *stream_) const {
7878
cudaStream_t stream = (cudaStream_t)stream_;
79-
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
79+
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
80+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
81+
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
82+
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
83+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
84+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
85+
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
86+
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
87+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
8088
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
8189
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
8290
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
8391
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
8492
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
8593
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
8694
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
87-
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
88-
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
89-
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
90-
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
9195
} else {
9296
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
9397
}

src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,14 @@ infiniStatus_t Descriptor::calculate(
117117
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
118118

119119
// launch kernel with different block sizes
120-
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
120+
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
121+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
122+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
123+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
124+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
121125
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
122126
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
123127
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
124-
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
125-
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
126128
} else {
127129
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
128130
}

xmake.lua

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ option("iluvatar-gpu")
114114
set_description("Whether to compile implementations for Iluvatar GPU")
115115
option_end()
116116

117+
option("ivcore-20")
118+
set_default(false)
119+
set_showmenu(true)
120+
set_description("Use ivcore20")
121+
option_end()
122+
117123
if has_config("iluvatar-gpu") then
118124
add_defines("ENABLE_ILUVATAR_API")
119125
includes("xmake/iluvatar.lua")

xmake/iluvatar.lua

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ target("infiniop-iluvatar")
4444
set_warnings("all", "error")
4545
add_cuflags("-Wno-error=unused-private-field")
4646
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
47+
if has_config("ivcore-20") then
48+
add_cuflags("--cuda-gpu-arch=ivcore20", {force = true})
49+
end
4750
add_culdflags("-fPIC")
4851
add_cxflags("-fPIC")
4952

0 commit comments

Comments
 (0)