10 changes: 9 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -113,7 +113,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -328,6 +328,14 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI API correctness
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
83 changes: 82 additions & 1 deletion csrc/activation_kernels.cu
@@ -4,6 +4,8 @@

#include <cmath>

#include "core/math.hpp"

#include "cuda_compat.h"
#include "dispatch_utils.h"

@@ -31,6 +33,69 @@ __global__ void act_and_mul_kernel(
}
}

// NOTE: temporary vectorized version.

template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel_vectorized(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const int64_t token_idx = blockIdx.x;
const int32_t blocks_per_token = gridDim.y;

const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);

const int32_t tgt_elems_per_block = ceil_div(d, blocks_per_token);
const int32_t elems_per_block =
next_multiple_of(elems_per_128bit_load, tgt_elems_per_block);
const int64_t block_start = blockIdx.y * int64_t(elems_per_block);
int64_t block_end = block_start + elems_per_block;
block_end = block_end > d ? d : block_end;

const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
scalar_t* __restrict__ out_ptr = out + token_idx * d;

// 128-bit vectorized code
const int32_t vec_loop_end =
prev_multiple_of(elems_per_128bit_load, block_end);
const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
const int32_t vec_start_idx = block_start / elems_per_128bit_load;

const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
int4* __restrict__ out_128bit_ptr = reinterpret_cast<int4*>(out_ptr);

#pragma unroll
for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
vec_idx += blockDim.x) {
const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;

scalar_128bit_vec_t out_vec;
const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);

#pragma unroll
for (int i = 0; i < elems_per_128bit_load; i++) {
out_vec[i] = ACT_FN(x_vec[i]) * y_vec[i];
}

out_128bit_ptr[vec_idx] = reinterpret_cast<const int4&>(out_vec);
}

// Scalar cleanup code
if (block_end > vec_loop_end) {
for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&x_ptr[idx]);
const scalar_t y = VLLM_LDG(&y_ptr[idx]);
out_ptr[idx] = ACT_FN(x) * y;
}
}
}
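
Each int4 load or store above moves 128 bits, i.e. 8 fp16/bf16 elements or 4 fp32 elements, and each token's d outputs are split across gridDim.y blocks whose ranges are rounded to the vector width, with the scalar loop cleaning up any ragged tail. A minimal host-side sketch of that partitioning, for illustration only: it is not part of the diff, it just mirrors the ceil_div / next_multiple_of / prev_multiple_of arithmetic used by the kernel, with the fp16 value of elems_per_128bit_load and made-up sizes.

// Host-side mirror of the per-token partitioning in act_and_mul_kernel_vectorized.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t d = 1001;          // gate/up width per token (illustrative)
  const int blocks_per_token = 2;  // gridDim.y (illustrative)
  const int elems_per_load = 8;    // 128-bit load / 16-bit elements

  const int64_t tgt = (d + blocks_per_token - 1) / blocks_per_token;                      // ceil_div
  const int64_t per_block = (tgt + elems_per_load - 1) / elems_per_load * elems_per_load; // next_multiple_of

  for (int by = 0; by < blocks_per_token; ++by) {
    const int64_t start = by * per_block;
    const int64_t end = std::min(start + per_block, d);
    const int64_t vec_end = end / elems_per_load * elems_per_load;                        // prev_multiple_of
    std::printf("block %d: [%lld, %lld), vectorized up to %lld, scalar tail %lld elems\n",
                by, (long long)start, (long long)end, (long long)vec_end,
                (long long)(end - vec_end));
  }
  return 0;
}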

template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
@@ -79,10 +144,26 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
input.data_ptr<scalar_t>(), d); \
});

// Launch activation and gating kernel.
// Vectorized Version
#define LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4); \
dim3 block(std::min(d, 512)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel_vectorized", [&] { \
vllm::act_and_mul_kernel_vectorized<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
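
The dim3 grid line in the macro encodes a simple occupancy heuristic: gridDim.x is one block per token, and gridDim.y splits each token's work across more blocks when there are few tokens, so small batches still fill the GPU; each block then uses up to min(d, 512) threads. A hedged restatement of that ternary as a sketch (this helper does not exist in the codebase):

// Assumption: this just unpacks "num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4".
static inline int blocks_per_token(int64_t num_tokens) {
  if (num_tokens > 32) return 1;  // large batch: one block per token already saturates the GPU
  if (num_tokens > 16) return 2;  // medium batch: split each token across 2 blocks
  return 4;                       // small batch: split across 4 blocks for extra parallelism
}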

void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
+ LAUNCH_ACTIVATION_GATE_KERNEL_VECTORIZED(vllm::silu_kernel);
}

void mul_and_silu(torch::Tensor& out, // [..., d]
14 changes: 13 additions & 1 deletion csrc/core/math.hpp
@@ -11,4 +11,16 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
template <typename T>
inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
return (a + b - 1) / b;
-}
+}

// Compute the next multiple of a that is greater than or equal to b
template <typename A, typename B>
static inline constexpr auto next_multiple_of(A a, B b) {
return ceil_div(b, a) * a;
}

// Compute the largest multiple of a that is less than or equal to b
template <typename A, typename B>
static inline constexpr auto prev_multiple_of(A a, B b) {
return (b / a) * a;
}
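
Both helpers are constexpr, so their behaviour can be pinned down at compile time. A small usage sketch with illustrative values, assuming a translation unit that includes csrc/core/math.hpp and C++17 single-argument static_assert:

static_assert(ceil_div(10, 8) == 2);           // 10 / 8 rounded up
static_assert(next_multiple_of(8, 10) == 16);  // smallest multiple of 8 >= 10
static_assert(next_multiple_of(8, 16) == 16);  // already a multiple
static_assert(prev_multiple_of(8, 10) == 8);   // largest multiple of 8 <= 10
static_assert(prev_multiple_of(8, 16) == 16);  // already a multiple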