
Commit 121d7ad

[Inference] Delete duplicated copy_vector (#5716)
1 parent 7806842 commit 121d7ad
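
This commit drops the duplicated `colossalAI::cuda::utils::copy_vector(dst, src)` in favor of the existing `copy(src, dst)`, and renames `copy_zero_vector` to `copy_zero`. The two copy helpers do the same vectorized load/store but take their pointers in opposite order, which is why every call site in this diff swaps its arguments. The snippet below is only a minimal sketch of that call shape, inferred from the updated call sites; the actual `copy` lives in extensions/csrc/kernel/cuda/utils/vec_copy.h and is not shown in this commit.

```cpp
// Minimal sketch, NOT the repository's implementation: a helper with the same
// call shape as colossalAI::cuda::utils::copy (source first, destination
// second), shown to make the argument-order swap explicit.
template <typename T, int VecSize>
__device__ __forceinline__ void copy(const T *src, T *dst) {
  // Reinterpret VecSize consecutive elements as one aligned chunk so the
  // compiler can issue a single wide load/store (e.g. 128-bit for float4).
  struct alignas(sizeof(T) * VecSize) Chunk {
    T data[VecSize];
  };
  *reinterpret_cast<Chunk *>(dst) = *reinterpret_cast<const Chunk *>(src);
}

// Invented demo kernel; assumes src/dst are 16-byte aligned and each thread
// owns one 4-float chunk.
__global__ void copy_demo(const float *src, float *dst) {
  int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
  // Before this commit: copy_vector<float, 4>(dst + i, src + i);
  // After this commit:  copy<float, 4>(src + i, dst + i);
  copy<float, 4>(src + i, dst + i);
}
```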

6 files changed: +28 −47 lines changed


extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 #include "funcs/cast_functor.h"
 #include "common/micros.h"

-using colossalAI::cuda::utils::copy_vector;
 using colossalAI::cuda::utils::get_vec_size;
 using colossalAI::cuda::utils::copy;
 using colossalAI::funcs::CastFunctor;

extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@
 #include "funcs/cast_functor.h"
 #include "funcs/binary_functor.h"

-using colossalAI::cuda::utils::copy_vector;
 using colossalAI::cuda::utils::get_vec_size;
 using colossalAI::cuda::utils::copy;
 using colossalAI::funcs::CastFunctor;

extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@
 #include "utils/vec_copy.h"
 #include "common/micros.h"

-using colossalAI::cuda::utils::copy_vector;
+using colossalAI::cuda::utils::copy;
 using colossalAI::cuda::utils::get_vec_size;


@@ -23,8 +23,8 @@ __device__ void apply_cos_and_sin_memcopy(
   int begin_id = threadIdx.x * VecSize;

   for (; begin_id <= head_dim - VecSize; begin_id += blockDim.x){
-    copy_vector<scalar_t, VecSize>(cos + dest_offset_id + begin_id, cos_cache_ptr + src_offset_id + begin_id);
-    copy_vector<scalar_t, VecSize>(sin + dest_offset_id + begin_id, sin_cache_ptr + src_offset_id + begin_id);
+    copy<scalar_t, VecSize>(cos_cache_ptr + src_offset_id + begin_id, cos + dest_offset_id + begin_id);
+    copy<scalar_t, VecSize>(sin_cache_ptr + src_offset_id + begin_id, sin + dest_offset_id + begin_id);
   }

   if (!Aligned) {

extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu

Lines changed: 11 additions & 11 deletions
@@ -23,7 +23,7 @@ using colossalAI::funcs::UnaryOpFunctor;
 using colossalAI::funcs::UnaryOpType;
 using colossalAI::funcs::warp_reduce;
 using colossalAI::funcs::ReduceType;
-using colossalAI::cuda::utils::copy_vector;
+using colossalAI::cuda::utils::copy;


 /*
@@ -87,8 +87,8 @@ __global__ void scaled_masked_softmax_warp_forward(

       if (element_index < batch_element_count) {
         int itr_idx = i * element_count + it * WARP_SIZE;
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);
-        copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(temp_mask, mask + itr_idx);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(src + itr_idx, temp_data);
+        copy<uint8_t, ELEMENTS_PER_LDG_STG>(mask + itr_idx, temp_mask);

 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
@@ -144,8 +144,8 @@ __global__ void scaled_masked_softmax_warp_forward(
       for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
         out[element] = elements[i][it + element] / sum[i];
       }
-      copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-          dst + i * element_count + it * WARP_SIZE, out);
+      copy<output_t, ELEMENTS_PER_LDG_STG>(
+          out, dst + i * element_count + it * WARP_SIZE);
     } else {
       break;
     }
@@ -200,10 +200,10 @@ __global__ void scaled_masked_softmax_warp_backward(
     for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_grad, grad + i * element_count + it * WARP_SIZE);
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_output, output + i * element_count + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            grad + i * element_count + it * WARP_SIZE, temp_grad);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            output + i * element_count + it * WARP_SIZE, temp_output);

 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
@@ -245,8 +245,8 @@ __global__ void scaled_masked_softmax_warp_backward(
             (output_t)(scale * (grad_reg[i][it + element] -
                                 output_reg[i][it + element] * sum[i]));
       }
-      copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-          gradInput + i * element_count + it * WARP_SIZE, out);
+      copy<output_t, ELEMENTS_PER_LDG_STG>(
+          out, gradInput + i * element_count + it * WARP_SIZE);
     }
   }
 }

extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu

Lines changed: 13 additions & 13 deletions
@@ -23,8 +23,8 @@ using colossalAI::funcs::UnaryOpFunctor;
 using colossalAI::funcs::UnaryOpType;
 using colossalAI::funcs::warp_reduce;
 using colossalAI::funcs::ReduceType;
-using colossalAI::cuda::utils::copy_vector;
-using colossalAI::cuda::utils::copy_zero_vector;
+using colossalAI::cuda::utils::copy;
+using colossalAI::cuda::utils::copy_zero;

 /*
 * Extended softmax (from native aten pytorch) with following additional
@@ -75,8 +75,8 @@ __global__ void scaled_upper_triang_masked_softmax_warp_forward(
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;

       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_data, src + i * element_count * stride + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            src + i * element_count * stride + it * WARP_SIZE, temp_data);

 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
@@ -140,10 +140,10 @@ __global__ void scaled_upper_triang_masked_softmax_warp_forward(
           out[element] = 0;
         }
       }
-      copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-          dst + i * element_count * stride + it * WARP_SIZE, out);
+      copy<output_t, ELEMENTS_PER_LDG_STG>(
+          out, dst + i * element_count * stride + it * WARP_SIZE);
     } else if (element_index < element_count) {
-      copy_zero_vector<output_t, ELEMENTS_PER_LDG_STG>(
+      copy_zero<output_t, ELEMENTS_PER_LDG_STG>(
           dst + i * element_count * stride + it * WARP_SIZE);
     } else {
       break;
@@ -199,10 +199,10 @@ __global__ void scaled_upper_triang_masked_softmax_warp_backward(
     for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_grad, grad + i * element_count * stride + it * WARP_SIZE);
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_output, output + i * element_count * stride + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            grad + i * element_count * stride + it * WARP_SIZE, temp_grad);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            output + i * element_count * stride + it * WARP_SIZE, temp_output);

 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
@@ -248,8 +248,8 @@ __global__ void scaled_upper_triang_masked_softmax_warp_backward(
             (output_t)(scale * (grad_reg[i][it + element] -
                                 output_reg[i][it + element] * sum[i]));
       }
-      copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-          gradInput + i * element_count * stride + it * WARP_SIZE, out);
+      copy<output_t, ELEMENTS_PER_LDG_STG>(
+          out, gradInput + i * element_count * stride + it * WARP_SIZE);
     }
   }
 }

extensions/csrc/kernel/cuda/utils/vec_copy.h

Lines changed: 1 addition & 18 deletions
@@ -8,25 +8,8 @@ namespace colossalAI {
 namespace cuda {
 namespace utils {

-// Note(LiuYang): Depreciated
 template <typename T, int VecSize>
-__device__ __inline__ void copy_vector(T *dst, const T *src) {
-  using VT = typename common::VecTypeTrait<T, VecSize>::Type;
-  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
-}
-
-template <>
-__device__ __inline__ void copy_vector<float, 8>(float *dst, const float *src) {
-  // Since the maximum memory alignment length is 128 bits, we choose float4
-  // here.
-  *(reinterpret_cast<float4 *>(dst)) = *(reinterpret_cast<const float4 *>(src));
-  *(reinterpret_cast<float4 *>(dst + 4)) =
-      *(reinterpret_cast<const float4 *>(src + 4));
-}
-
-// Note(LiuYang): Depreciated
-template <typename T, int VecSize>
-__device__ __inline__ void copy_zero_vector(T *dst) {
+__device__ __inline__ void copy_zero(T *dst) {
   using VT = typename common::VecTypeTrait<T, VecSize>::Type;
   *(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
 }
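
After the cleanup, vec_copy.h keeps one generic `copy` (not part of this hunk) plus the renamed `copy_zero` shown above, which writes a whole vector of zeros in a single store; the upper-triangular softmax kernel uses it to blank out masked columns. Below is a hedged, self-contained sketch of that usage pattern, not code from this repository: the local `Chunk` struct stands in for `VecTypeTrait`/`CastFunctor`, and the demo kernel and its names are invented for illustration.

```cpp
// Hedged sketch: a copy_zero-style helper mirroring the renamed
// colossalAI::cuda::utils::copy_zero, plus an invented demo kernel showing
// how such a helper zero-fills the padded tail of a row.
template <typename T, int VecSize>
__device__ __forceinline__ void copy_zero(T *dst) {
  struct alignas(sizeof(T) * VecSize) Chunk {
    T data[VecSize];
  };
  *reinterpret_cast<Chunk *>(dst) = Chunk{};  // value-initialized to zeros
}

// Zeroes row[valid, padded) four floats at a time; assumes `valid` and
// `padded` are multiples of 4 so every store stays 16-byte aligned.
__global__ void zero_tail_demo(float *row, int valid, int padded) {
  for (int i = valid + threadIdx.x * 4; i < padded; i += blockDim.x * 4) {
    copy_zero<float, 4>(row + i);
  }
}
```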
