diff --git a/examples/cpp/gptj/gptj_config.ini b/examples/cpp/gptj/gptj_config.ini
index b59a464ad..860904673 100644
--- a/examples/cpp/gptj/gptj_config.ini
+++ b/examples/cpp/gptj/gptj_config.ini
@@ -14,6 +14,7 @@ enable_custom_all_reduce=0
 tensor_para_size=1
 pipeline_para_size=1
+int8_mode=0 ;only support 0 or 1 (when fp16)
 model_name=gptj_6B
 model_dir=../models/j6b_ckpt/
diff --git a/examples/cpp/gptj/gptj_example.cc b/examples/cpp/gptj/gptj_example.cc
index 05f285e85..2ae51e84e 100644
--- a/examples/cpp/gptj/gptj_example.cc
+++ b/examples/cpp/gptj/gptj_example.cc
@@ -92,6 +92,7 @@ void gptj_example(const INIReader reader)
     int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
+    int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
     const size_t head_num = reader.GetInteger(model_name, "head_num");
     const size_t size_per_head = reader.GetInteger(model_name, "size_per_head");
@@ -287,6 +288,7 @@ void gptj_example(const INIReader reader)
                                  tensor_para.rank_,
                                  pipeline_para.world_size_,
                                  pipeline_para.rank_,
+                                 int8_mode,
                                  prompt_learning_type,
                                  prefix_prompt_table_pair);  // optional if you don't need prefix prompts
@@ -336,7 +338,11 @@ void gptj_example(const INIReader reader)
                    &allocator,
                    false,
                    &prop,
-                   attention_type);
+                   attention_type,
+                   int8_mode,
+                   nullptr,
+                   0,
+                   1.0f);
     int* d_output_ids;
     int* d_sequence_lengths;
diff --git a/examples/cpp/gptneox/gptneox_config.ini b/examples/cpp/gptneox/gptneox_config.ini
index ab1bf7556..c4c998529 100644
--- a/examples/cpp/gptneox/gptneox_config.ini
+++ b/examples/cpp/gptneox/gptneox_config.ini
@@ -4,7 +4,7 @@ enable_custom_all_reduce=0
 tensor_para_size=2
 pipeline_para_size=1
-
+int8_mode=0 ;only support 0 or 1 (when fp16)
 model_name=gptneox_20B
 model_dir=../models/gptneox
diff --git a/examples/cpp/gptneox/gptneox_example.cc b/examples/cpp/gptneox/gptneox_example.cc
index 6fc2233ee..93ceb5602 100644
--- a/examples/cpp/gptneox/gptneox_example.cc
+++ b/examples/cpp/gptneox/gptneox_example.cc
@@ -47,6 +47,8 @@ int main(int argc, char* argv[])
         ini_name = "../examples/cpp/gptneox/gptneox_config.ini";
     }
+    std::cout << "Ini file name: " << ini_name << std::endl;
+
     INIReader reader = INIReader(ini_name);
     if (reader.ParseError() < 0) {
         std::cout << "[ERROR] Can't load '" << ini_name << "'\n";
@@ -76,6 +78,7 @@ void gptneox_example(const INIReader reader)
     int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
+    int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
     const size_t head_num = reader.GetInteger(model_name, "head_num");
     const size_t size_per_head = reader.GetInteger(model_name, "size_per_head");
@@ -275,6 +278,7 @@ void gptneox_example(const INIReader reader)
                                     pipeline_para.world_size_,
                                     pipeline_para.rank_,
                                     use_gptj_residual,
+                                    int8_mode,
                                     prompt_learning_type,
                                     prefix_prompt_table_pair);
@@ -321,7 +325,11 @@ void gptneox_example(const INIReader reader)
                        &allocator,
                        false,
                        &prop,
-                       attention_type);
+                       attention_type,
+                       int8_mode,
+                       nullptr,
+                       0,
+                       1.0f);
     int* d_output_ids;
     int* d_sequence_lengths;
diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
index
8e7cb92a2..060704912 100644 --- a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp @@ -24,12 +24,12 @@ #include #include -// #define MMHA_USE_HMMA_FOR_REDUCTION +#define MMHA_USE_HMMA_FOR_REDUCTION // Below are knobs to extend FP32 accumulation for higher FP16 accuracy // Does not seem to affect the accuracy that much -// #define MMHA_USE_FP32_ACUM_FOR_FMA +#define MMHA_USE_FP32_ACUM_FOR_FMA // Seems to slightly improve the accuracy #define MMHA_USE_FP32_ACUM_FOR_OUT @@ -389,26 +389,6 @@ struct Qk_vec_acum_fp32_ { using Type = Float8_; }; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float8_; -}; -template<> -struct Qk_vec_acum_fp32_<__nv_bfloat16> { - using Type = float; -}; -template<> -struct Qk_vec_acum_fp32_<__nv_bfloat162> { - using Type = float2; -}; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float4_; -}; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float8_; -}; #ifdef ENABLE_FP8 // template<> // struct Qk_vec_acum_fp32_ { diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu index 89f0d5011..c673a6597 100644 --- a/src/fastertransformer/kernels/decoding_kernels.cu +++ b/src/fastertransformer/kernels/decoding_kernels.cu @@ -496,9 +496,9 @@ __global__ void gatherTree(gatherTreeParam param) int tmp_len = param.max_sequence_lengths[batch * param.beam_width + j] + param.max_sequence_length_final_step; // also remove the length of the soft prompts, p_prompt_tuning - param.max_sequence_lengths[batch * param.beam_width + j] = - tmp_len - param.max_prefix_soft_prompt_length - - (param.max_input_length - param.max_input_without_prompt_length); + param.sequence_lengths_for_output[batch * param.beam_width + j] = + uint32_t(tmp_len - param.max_prefix_soft_prompt_length + - (param.max_input_length - param.max_input_without_prompt_length)); // update the response input length if (update_response_input_length) { param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len; diff --git a/src/fastertransformer/kernels/decoding_kernels.h b/src/fastertransformer/kernels/decoding_kernels.h index 7527d8fc4..0ebf15a0c 100644 --- a/src/fastertransformer/kernels/decoding_kernels.h +++ b/src/fastertransformer/kernels/decoding_kernels.h @@ -123,6 +123,7 @@ void invokeGatherTree(int* beams, struct gatherTreeParam { int* beams = nullptr; int* max_sequence_lengths = nullptr; + uint* sequence_lengths_for_output = nullptr; int max_sequence_length_final_step = 0; const int* input_lengths = nullptr; // response input lengths (used to slice the ids during postprocessing) diff --git a/src/fastertransformer/models/gptj/GptJ.cc b/src/fastertransformer/models/gptj/GptJ.cc index 0382d8863..fb7970418 100644 --- a/src/fastertransformer/models/gptj/GptJ.cc +++ b/src/fastertransformer/models/gptj/GptJ.cc @@ -43,6 +43,7 @@ void GptJ::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -60,6 +61,7 @@ void GptJ::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -153,6 +155,13 @@ void GptJ::allocateBuffer( (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); generation_should_stop_ = 
(bool*)(allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true)); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -205,6 +214,11 @@ void GptJ::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -237,8 +251,10 @@ GptJ::GptJ(size_t max_batch_size, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -252,7 +268,9 @@ GptJ::GptJ(size_t max_batch_size, prompt_learning_type_(prompt_learning_type), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -297,8 +315,10 @@ GptJ::GptJ(size_t max_batch_size, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -315,8 +335,10 @@ GptJ::GptJ(size_t max_batch_size, pipeline_para_(pipeline_para), local_head_num_(head_num / tensor_para.world_size_), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) + enable_custom_all_reduce_(enable_custom_all_reduce), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -345,8 +367,10 @@ GptJ::GptJ(GptJ const& gpt): local_head_num_(gpt.local_head_num_), vocab_size_padded_(gpt.vocab_size_padded_), attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), - enable_custom_all_reduce_(gpt.enable_custom_all_reduce_) + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -584,6 +608,23 @@ void GptJ::forward(std::unordered_map* output_tens cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + 
batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -685,6 +726,14 @@ void GptJ::forward(std::unordered_map* output_tens {batch_size * beam_width}, has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, diff --git a/src/fastertransformer/models/gptj/GptJ.h b/src/fastertransformer/models/gptj/GptJ.h index 63fec7e93..5865540e7 100644 --- a/src/fastertransformer/models/gptj/GptJ.h +++ b/src/fastertransformer/models/gptj/GptJ.h @@ -53,11 +53,14 @@ class GptJ: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + float shared_contexts_ratio_; // Prompt Learning Parameters PromptLearningType prompt_learning_type_; @@ -117,6 +120,11 @@ class GptJ: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -161,8 +169,10 @@ class GptJ: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptJ(size_t max_batch_size, size_t max_seq_len, @@ -193,8 +203,10 @@ class GptJ: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptJ(GptJ const& GptJ); diff --git a/src/fastertransformer/models/gptj/GptJContextDecoder.cc b/src/fastertransformer/models/gptj/GptJContextDecoder.cc index 28560c634..2f69541bc 100644 --- a/src/fastertransformer/models/gptj/GptJContextDecoder.cc +++ b/src/fastertransformer/models/gptj/GptJContextDecoder.cc @@ -40,7 +40,7 @@ void GptJContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -57,7 +57,7 @@ void GptJContextDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -70,7 +70,7 @@ void GptJContextDecoder::allocateBuffer() } template -void GptJContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void GptJContextDecoder::allocateBuffer(size_t 
batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -84,6 +84,18 @@ void GptJContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } is_allocate_buffer_ = true; } @@ -98,6 +110,13 @@ void GptJContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -149,6 +168,7 @@ GptJContextDecoder::GptJContextDecoder(size_t ma bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -166,6 +186,7 @@ GptJContextDecoder::GptJContextDecoder(size_t ma pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -188,6 +209,7 @@ GptJContextDecoder::GptJContextDecoder(GptJContextDecoder const& decoder): pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -241,15 +263,23 @@ void GptJContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
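
GptJContextDecoder::forward() below runs on a compacted batch whenever a "compact_idx" tensor is supplied. The mapping itself is built earlier, in GptJ::forward(), by invokeFindContextDups. As a rough illustration of the contract these index buffers appear to follow (names taken from this diff; the real kernel runs on the GPU and may break ties differently), a CPU sketch:

#include <algorithm>
#include <vector>

// CPU sketch of the compaction mapping produced by invokeFindContextDups.
// compact_idx          : batch indices of the unique prompts (size = compact_size)
// batch_to_compact_idx : for every batch*beam entry, the row of the compacted
//                        batch that holds its context pass.
static int find_context_dups_cpu(const std::vector<int>& input_ids,  // [batch_size, max_input_length]
                                 int                     batch_size,
                                 int                     beam_width,
                                 int                     max_input_length,
                                 std::vector<int>&       compact_idx,
                                 std::vector<int>&       batch_to_compact_idx)
{
    std::vector<int> shared_idx(batch_size);  // first batch index with an identical prompt
    compact_idx.clear();
    batch_to_compact_idx.assign(batch_size * beam_width, 0);

    for (int b = 0; b < batch_size; ++b) {
        int match = b;
        for (int prev = 0; prev < b; ++prev) {
            if (std::equal(input_ids.begin() + prev * max_input_length,
                           input_ids.begin() + (prev + 1) * max_input_length,
                           input_ids.begin() + b * max_input_length)) {
                match = prev;
                break;
            }
        }
        shared_idx[b] = match;
        if (match == b) {
            compact_idx.push_back(b);  // b introduces a new unique prompt
        }
        // row of the representative prompt inside the compacted batch
        const int compact_pos = static_cast<int>(
            std::find(compact_idx.begin(), compact_idx.end(), shared_idx[b]) - compact_idx.begin());
        for (int beam = 0; beam < beam_width; ++beam) {
            batch_to_compact_idx[b * beam_width + beam] = compact_pos;
        }
    }
    return static_cast<int>(compact_idx.size());  // compact_size
}
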
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -257,6 +287,20 @@ void GptJContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -274,6 +318,12 @@ void GptJContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -282,7 +332,8 @@ void GptJContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -298,8 +349,9 @@ void GptJContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -310,11 +362,11 @@ void GptJContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -339,12 +391,14 @@ void GptJContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); const bool is_final = false; // TODO(bhsueh) remove this flag + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -352,7 +406,7 @@ void GptJContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -388,17 +442,43 @@ void GptJContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l).self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
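
When the compact path is taken, the attention layer above has written its K/V into the temporary k_cache_layer_/v_cache_layer_ buffers, so the results still have to be scattered back to the full, per-request caches and outputs. A minimal CPU sketch of that scatter for the decoder output (cf. invokeUnCompactOutputs further down); the K/V variant, invokeUnCompactCaches, does the same thing per layer but also has to respect the cache layout and the local-batch iteration, which this sketch ignores:

#include <algorithm>
#include <cstddef>
#include <vector>

// CPU sketch of the "uncompact" scatter: every request copies the features
// computed for its representative prompt. Assumed layout: row-major
// [request_batch, seq_len * hidden_units].
static void uncompact_outputs_cpu(std::vector<float>&       decoder_output,    // [request_batch, row_size]
                                  const std::vector<float>& compact_features,  // [compact_size, row_size]
                                  const std::vector<int>&   batch_to_compact_idx,
                                  std::size_t               request_batch_size,
                                  std::size_t               row_size)  // seq_len * hidden_units
{
    decoder_output.resize(request_batch_size * row_size);
    for (std::size_t b = 0; b < request_batch_size; ++b) {
        const float* src = compact_features.data() + batch_to_compact_idx[b] * row_size;
        std::copy(src, src + row_size, decoder_output.data() + b * row_size);
    }
}
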
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { TensorMap ffn_input_tensors( {{"ffn_input", @@ -429,7 +509,8 @@ void GptJContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -440,12 +521,22 @@ void GptJContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptj/GptJContextDecoder.h b/src/fastertransformer/models/gptj/GptJContextDecoder.h index 742a12f72..548aa9a77 100644 --- a/src/fastertransformer/models/gptj/GptJContextDecoder.h +++ b/src/fastertransformer/models/gptj/GptJContextDecoder.h @@ -59,6 +59,7 @@ class GptJContextDecoder: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + int int8_mode_ = 0; bool is_qk_buf_float_; @@ -66,7 +67,7 @@ class GptJContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -85,6 +86,12 @@ class GptJContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: GptJContextDecoder(size_t max_batch_size, size_t max_seq_len, @@ -103,6 +110,7 @@ class GptJContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptj/GptJDecoder.cc b/src/fastertransformer/models/gptj/GptJDecoder.cc index fdcbe593f..efc4be19d 100644 --- a/src/fastertransformer/models/gptj/GptJDecoder.cc +++ b/src/fastertransformer/models/gptj/GptJDecoder.cc @@ -35,7 +35,7 @@ 
void GptJDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -52,7 +52,7 @@ void GptJDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -134,6 +134,7 @@ GptJDecoder::GptJDecoder(size_t max_batch_size, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -148,6 +149,7 @@ GptJDecoder::GptJDecoder(size_t max_batch_size, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +169,7 @@ GptJDecoder::GptJDecoder(GptJDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -268,7 +271,7 @@ void GptJDecoder::forward(std::unordered_map* outp local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptj/GptJDecoder.h b/src/fastertransformer/models/gptj/GptJDecoder.h index 019bfb783..be5381e97 100644 --- a/src/fastertransformer/models/gptj/GptJDecoder.h +++ b/src/fastertransformer/models/gptj/GptJDecoder.h @@ -72,6 +72,8 @@ class GptJDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: GptJDecoder(size_t max_batch_size, size_t head_num, @@ -87,6 +89,7 @@ class GptJDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc index f3ef6e689..10ab4521e 100644 --- a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc +++ b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc @@ -23,14 +23,25 @@ template GptJDecoderLayerWeight::GptJDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, - const int tensor_para_rank): + const int tensor_para_rank, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) + tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "GptJ doesn't support int8_model == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +GptJDecoderLayerWeight::GptJDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -51,30 +62,78 @@ GptJDecoderLayerWeight::~GptJDecoderLayerWeight() ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < 
int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -GptJDecoderLayerWeight::GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_) +void GptJDecoderLayerWeight::copyFrom(const GptJDecoderLayerWeight& other) { - mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], hidden_units_); + } + } +} + +template +GptJDecoderLayerWeight::GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + 
tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -85,20 +144,10 @@ GptJDecoderLayerWeight& GptJDecoderLayerWeight::operator=(const GptJDecode inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; mallocWeights(); - - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_); - + copyFrom(other); setWeightPtr(); return *this; } @@ -113,32 +162,64 @@ void GptJDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType m weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", - model_file_type); // GPT-J does not have bias for QKV cudaMemset(weights_ptr[3], 0, sizeof(T) * 3 * hidden_units_ / tensor_para_size_); - loadWeightFromBin(weights_ptr[4], + + loadWeightFromBin(weights_ptr[6], + {(size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin( + weights_ptr[8], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + + // Load weights for GPT + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - - loadWeightFromBin(weights_ptr[5], + + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[6], - {(size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", - model_file_type); - loadWeightFromBin(weights_ptr[7], + + loadWeightFromBin(weights_ptr[7], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", model_file_type); - loadWeightFromBin( - weights_ptr[8], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", + model_file_type); + } } template @@ -155,6 +236,20 @@ void GptJDecoderLayerWeight::setWeightPtr() ffn_weights.output_weight.kernel = weights_ptr[7]; ffn_weights.output_weight.bias = weights_ptr[8]; + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[3]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[3]; + } + } + is_maintain_buffer = true; } @@ -163,14 +258,32 @@ void GptJDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - deviceMalloc(&weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[6], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); deviceMalloc(&weights_ptr[8], hidden_units_); + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); // ffn inter weight + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); // ffn output weight + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / 
tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], hidden_units_); + } + } } template struct GptJDecoderLayerWeight; diff --git a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h index ea1b0f1d7..29402469a 100644 --- a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h +++ b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h @@ -29,10 +29,12 @@ template struct GptJDecoderLayerWeight { public: GptJDecoderLayerWeight() = delete; + GptJDecoderLayerWeight(const int int8_mode); GptJDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, - const int tensor_para_rank = 0); + const int tensor_para_rank = 0, + const int int8_mode = 0); ~GptJDecoderLayerWeight(); GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other); GptJDecoderLayerWeight& operator=(const GptJDecoderLayerWeight& other); @@ -50,9 +52,14 @@ struct GptJDecoderLayerWeight { int tensor_para_rank_; bool is_maintain_buffer = false; T* weights_ptr[9]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(4, nullptr); + std::vector weight_only_scale_ptr = std::vector(4, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const GptJDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/gptj/GptJWeight.cc b/src/fastertransformer/models/gptj/GptJWeight.cc index e64162b1f..0a3a9193b 100644 --- a/src/fastertransformer/models/gptj/GptJWeight.cc +++ b/src/fastertransformer/models/gptj/GptJWeight.cc @@ -28,6 +28,7 @@ GptJWeight::GptJWeight(const int hidden_unit const int tensor_para_rank, const int layer_para_size, const int layer_para_rank, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -39,6 +40,7 @@ GptJWeight::GptJWeight(const int hidden_unit tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -60,7 +62,7 @@ GptJWeight::GptJWeight(const int hidden_unit for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back( - GptJDecoderLayerWeight(hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); + GptJDecoderLayerWeight(hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -101,6 +103,7 @@ GptJWeight::GptJWeight(const GptJWeight& other): tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -148,6 +151,7 @@ GptJWeight& GptJWeight::operator=(const GptJWeight& other) tensor_para_rank_ = other.tensor_para_rank_; layer_para_size_ = other.layer_para_size_; 
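
For int8_mode == 1 the four dense GEMM weights of each layer are kept as int8 with one scale per output channel (weight-only quantization), which is why weight_only_scale_ptr[0..3] above are sized 3*hidden/tp, hidden, inter/tp and hidden respectively. A rough CPU sketch of the symmetric per-column quantization that loadWeightFromBinAndQuantizeForWeightOnly is presumably performing (rounding and layout are assumptions; the real helper may additionally permute the int8 weights into the layout the fused GEMM kernels expect):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU sketch of symmetric per-output-channel weight-only quantization:
// a row-major weight W[rows x cols] is stored as int8 plus one float scale
// per column, so that W[r][c] ~= int8_w[r][c] * scale[c].
static void quantize_weight_only_cpu(const std::vector<float>& w,  // [rows * cols]
                                     std::size_t               rows,
                                     std::size_t               cols,
                                     std::vector<int8_t>&      int8_w,  // [rows * cols]
                                     std::vector<float>&       scale)   // [cols]
{
    int8_w.resize(rows * cols);
    scale.assign(cols, 0.0f);
    for (std::size_t c = 0; c < cols; ++c) {
        float max_abs = 0.0f;
        for (std::size_t r = 0; r < rows; ++r) {
            max_abs = std::max(max_abs, std::fabs(w[r * cols + c]));
        }
        scale[c] = max_abs / 127.0f;
        for (std::size_t r = 0; r < rows; ++r) {
            const float q = (scale[c] > 0.0f) ? std::round(w[r * cols + c] / scale[c]) : 0.0f;
            int8_w[r * cols + c] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
        }
    }
}
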
layer_para_rank_ = other.layer_para_rank_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/gptj/GptJWeight.h b/src/fastertransformer/models/gptj/GptJWeight.h index c75547e65..34e7ce673 100644 --- a/src/fastertransformer/models/gptj/GptJWeight.h +++ b/src/fastertransformer/models/gptj/GptJWeight.h @@ -37,6 +37,7 @@ struct GptJWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -80,6 +81,8 @@ struct GptJWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // prompt learning pair (task_name, (task_name_id, prompt_len)) PromptLearningType prompt_learning_type_; std::map> prompt_learning_pair_; diff --git a/src/fastertransformer/models/gptneox/GptNeoX.cc b/src/fastertransformer/models/gptneox/GptNeoX.cc index 2ce2dae7b..07356bec8 100644 --- a/src/fastertransformer/models/gptneox/GptNeoX.cc +++ b/src/fastertransformer/models/gptneox/GptNeoX.cc @@ -42,6 +42,7 @@ void GptNeoX::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void GptNeoX::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -150,6 +152,13 @@ void GptNeoX::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -201,6 +210,11 @@ void GptNeoX::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -230,8 +244,10 @@ GptNeoX::GptNeoX(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -246,7 +262,9 @@ GptNeoX::GptNeoX(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -288,8 +306,10 @@ GptNeoX::GptNeoX(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr 
custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -308,7 +328,9 @@ GptNeoX::GptNeoX(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -339,7 +361,9 @@ GptNeoX::GptNeoX(GptNeoX const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -561,6 +585,23 @@ void GptNeoX::forward(std::unordered_map* output_t cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -662,6 +703,14 @@ void GptNeoX::forward(std::unordered_map* output_t {batch_size * beam_width}, has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -1137,6 +1186,7 @@ void GptNeoX::setOutputTensors(std::unordered_map* gatherTreeParam param; param.beams = transposed_output_ids_buf_; param.max_sequence_lengths = sequence_lengths_; + param.sequence_lengths_for_output = sequence_lengths; // add sequence_length 1 here because the sequence_length of time step t is t - 1 param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; @@ -1154,8 +1204,8 @@ void GptNeoX::setOutputTensors(std::unordered_map* param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // invokeCudaD2DcpyConvert( + // sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { diff --git a/src/fastertransformer/models/gptneox/GptNeoX.h b/src/fastertransformer/models/gptneox/GptNeoX.h index 9749a2070..1b3a1b1c1 100644 --- a/src/fastertransformer/models/gptneox/GptNeoX.h +++ b/src/fastertransformer/models/gptneox/GptNeoX.h @@ -41,6 +41,7 @@ class GptNeoX: public BaseLayer { static constexpr bool neox_rotary_style_ = true; static constexpr float layernorm_eps_ = 1e-5f; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class GptNeoX: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class GptNeoX: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -161,8 +168,10 @@ class GptNeoX: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptNeoX(size_t head_num, size_t size_per_head, @@ -190,8 +199,10 @@ class GptNeoX: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptNeoX(GptNeoX const& GptNeoX); diff --git a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc index f23d1a977..4b56e271f 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc @@ -40,7 +40,7 
@@ void GptNeoXContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -57,7 +57,7 @@ void GptNeoXContextDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -70,7 +70,7 @@ void GptNeoXContextDecoder::allocateBuffer() } template -void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -84,6 +84,19 @@ void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -98,6 +111,13 @@ void GptNeoXContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -148,6 +168,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -164,6 +185,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -186,6 +208,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -239,15 +262,23 @@ void GptNeoXContextDecoder::forward(std::unordered_map* // For example, the shape of 
decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -255,6 +286,20 @@ void GptNeoXContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -272,6 +317,12 @@ void GptNeoXContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -280,7 +331,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -296,8 +348,9 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -309,11 +362,11 @@ void GptNeoXContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -338,10 +391,12 @@ void GptNeoXContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -349,7 +404,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -384,16 +439,42 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } if (is_final == false) { if (use_gptj_residual_) { @@ -405,7 +486,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { @@ -424,7 +505,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* (float*)nullptr, (float*)nullptr, (float*)nullptr, - 0, + int8_mode_, stream_); } @@ -483,7 +564,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -494,12 +576,22 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h index c81dcfe90..becc24277 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h +++ b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h @@ -56,13 +56,15 @@ class GptNeoXContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class GptNeoXContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: GptNeoXContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class GptNeoXContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int 
enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc b/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc index 7b73ba8ee..50372f68e 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc @@ -35,7 +35,7 @@ void GptNeoXDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -52,7 +52,7 @@ void GptNeoXDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -134,6 +134,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -148,6 +149,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -168,6 +170,7 @@ GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -269,7 +272,7 @@ void GptNeoXDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); @@ -304,7 +307,7 @@ void GptNeoXDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { @@ -323,7 +326,7 @@ void GptNeoXDecoder::forward(std::unordered_map* (float*)nullptr, (float*)nullptr, (float*)nullptr, - 0, + int8_mode_, stream_); } diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoder.h b/src/fastertransformer/models/gptneox/GptNeoXDecoder.h index add736adc..297d22067 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoder.h +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoder.h @@ -70,6 +70,8 @@ class GptNeoXDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: GptNeoXDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class GptNeoXDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc index 3d62df83d..2ee56f4b6 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc @@ -24,15 +24,26 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), 
inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode), use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "GptNeoX doesn't support int8_mode == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -58,34 +69,84 @@ GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void GptNeoXDecoderLayerWeight::copyFrom(const GptNeoXDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { +
cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], hidden_units_); + } + } +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -96,24 +157,11 @@ GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptN inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -128,38 +176,22 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", - model_file_type); loadWeightFromBin(weights_ptr[3], {(size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[4], - {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".attention.dense.weight." 
+ rank_spec + ".bin", - model_file_type); - if (!use_gptj_residual_) { loadWeightFromBin( weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); } - loadWeightFromBin(weights_ptr[6], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", - model_file_type); loadWeightFromBin(weights_ptr[7], {(size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[8], - {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", - model_file_type); + if (use_gptj_residual_) { loadWeightFromBin( weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); @@ -172,6 +204,54 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + // Load weights for GPT + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", + model_file_type); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + model_file_type); + } } template @@ -191,6 +271,21 @@ void GptNeoXDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[10]; post_attention_layernorm_weights.gamma = weights_ptr[11]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[3]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[3]; + } + } + is_maintain_buffer = true; } @@ -199,19 +294,38 @@ void GptNeoXDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); deviceMalloc(&weights_ptr[9], hidden_units_); deviceMalloc(&weights_ptr[10], hidden_units_); deviceMalloc(&weights_ptr[11], hidden_units_); + + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // ffn inter weight + deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); // ffn output weight + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], hidden_units_); + } + } } template struct GptNeoXDecoderLayerWeight; diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h index 2850da466..bca2b2d3a 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct GptNeoXDecoderLayerWeight { public: GptNeoXDecoderLayerWeight() = default; + 
GptNeoXDecoderLayerWeight(const int int8_mode); GptNeoXDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~GptNeoXDecoderLayerWeight(); GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct GptNeoXDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[12]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(4, nullptr); + std::vector weight_only_scale_ptr = std::vector(4, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const GptNeoXDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc index 26995f255..56ece0d47 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc @@ -29,6 +29,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.h b/src/fastertransformer/models/gptneox/GptNeoXWeight.h index 3e868854e..f360d5328 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.h @@ -38,6 +38,7 @@ struct GptNeoXWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = true, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, 
std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct GptNeoXWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_; diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h index 222fdd409..465d80fa8 100755 --- a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h +++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h @@ -198,6 +198,7 @@ class FTGptNeoX: public IFGptNeoX { false, // is_free_buffer_after_forward &prop_, // cuda_device_prop attention_type, // attention_type + 0, // don't support int8 in python for now nullptr, // custom_all_reduce_comm 0); // enable_custom_all_reduce diff --git a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc index ea17bc5dc..4fb499c9e 100644 --- a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc +++ b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc @@ -53,7 +53,7 @@ std::shared_ptr AbstractTransformerModel::createGptJMo prompt_learning_table_pair.insert({task_name, {task_name_id, prompt_length}}); } - if (data_type == "fp16") { + if (data_type == "half" || data_type == "fp16") { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "max_seq_len"), reader.GetInteger(model_name, "head_num"), @@ -71,7 +71,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } else if (data_type == "fp32") { return std::make_shared>( @@ -91,7 +92,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #ifdef ENABLE_BF16 else if (data_type == "bf16") { @@ -112,7 +114,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #endif else { @@ -125,11 +128,13 @@ template GptJTritonModel::GptJTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir): + std::string model_dir, + int int8_mode): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), enable_custom_all_reduce_(enable_custom_all_reduce), shared_weights_(std::vector>>(ft::getDeviceCount())), + int8_mode_(int8_mode), model_dir_(model_dir) { INIReader reader = INIReader(model_dir + "/config.ini"); @@ -202,7 +207,8 @@ GptJTritonModel::GptJTritonModel(size_t m size_t pipeline_para_size, int enable_custom_all_reduce, std::string model_name, - std::string model_dir): + std::string model_dir, + int int8_mode): max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), @@ -216,6 +222,7 @@ GptJTritonModel::GptJTritonModel(size_t m pipeline_para_size_(pipeline_para_size), enable_custom_all_reduce_(enable_custom_all_reduce), shared_weights_(std::vector>>(ft::getDeviceCount())), + int8_mode_(int8_mode), 
model_name_(model_name), model_dir_(model_dir), prompt_learning_start_id_(prompt_learning_start_id), @@ -308,6 +315,7 @@ GptJTritonModel::createModelInstance(int false, cuda_device_prop_ptr.get(), attention_type, + int8_mode_, custom_all_reduce_comm, enable_custom_all_reduce_)); @@ -335,6 +343,7 @@ void GptJTritonModel::createSharedWeights(int device_id, int rank) tensor_para_rank, pipeline_para_size_, pipeline_para_rank, + int8_mode_, prompt_learning_type_, prompt_learning_table_pair_); shared_weights_[device_id]->loadModel(model_dir_); @@ -350,6 +359,7 @@ std::string GptJTritonModel::toString() << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ << "\nend_id: " << end_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nint8_mode: " << int8_mode_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; return ss.str(); } diff --git a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h index c1d270a3b..582cef666 100644 --- a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h +++ b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h @@ -43,12 +43,14 @@ struct GptJTritonModel: public AbstractTransformerModel { size_t pipeline_para_size, int enable_custom_all_reduce, std::string model_name, - std::string model_dir); + std::string model_dir, + int int8_mode); GptJTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir); + std::string model_dir, + int int8_mode); ~GptJTritonModel() = default; @@ -84,6 +86,8 @@ struct GptJTritonModel: public AbstractTransformerModel { bool is_fp16_; int enable_custom_all_reduce_ = 0; + int int8_mode_ = 0; + // shared weights for each device std::vector>> shared_weights_; diff --git a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc index 206f81ad3..e1350fe2b 100644 --- a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc +++ b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc @@ -34,19 +34,21 @@ std::shared_ptr AbstractTransformerModel::createGptNeo int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); - if (data_type == "half") { + if (data_type == "half" || data_type == "fp16") { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } } @@ -54,11 +56,13 @@ template GptNeoXTritonModel::GptNeoXTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir): + std::string model_dir, + int int8_mode): 
tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), shared_weights_(std::vector>>(ft::getDeviceCount())), - enable_custom_all_reduce_(enable_custom_all_reduce) + enable_custom_all_reduce_(enable_custom_all_reduce), + int8_mode_(int8_mode) { model_dir_ = model_dir; const std::string inifile{model_dir + "/config.ini"}; @@ -168,6 +172,7 @@ std::unique_ptr GptNeoXTritonModel::createM false, cuda_device_prop_ptr.get(), attention_type, + int8_mode_, custom_all_reduce_comm, enable_custom_all_reduce_)); @@ -197,6 +202,7 @@ void GptNeoXTritonModel::createSharedWeights(int device_id, int rank) pipeline_para_size_, pipeline_para_rank, use_gptj_residual_, + int8_mode_, prompt_learning_type_, prompt_learning_table_pair_); shared_weights_[device_id]->loadModel(model_dir_); @@ -214,6 +220,7 @@ std::string GptNeoXTritonModel::toString() << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nint8_mode: " << int8_mode_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; return ss.str(); } diff --git a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h index 9fe16e311..3ed113e79 100644 --- a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h +++ b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h @@ -30,7 +30,8 @@ struct GptNeoXTritonModel: public AbstractTransformerModel { GptNeoXTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir); + std::string model_dir, + int int8_mode); ~GptNeoXTritonModel() = default; @@ -68,6 +69,8 @@ struct GptNeoXTritonModel: public AbstractTransformerModel { // residual type bool use_gptj_residual_ = true; + int int8_mode_ = 0; + // number of tasks (for prefix-prompt, p/prompt-tuning) size_t num_tasks_ = 0; int prompt_learning_start_id_ = 0;
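Editor's note on the int8_mode plumbing above: with int8_mode == 1, each attention/FFN GEMM weight is stored as an int8 kernel (int8_weights_ptr) plus one floating-point scale per output channel (weight_only_scale_ptr), wired up to the *_weight.int8_kernel / *_weight.weight_only_quant_scale fields in setWeightPtr(); mode 2 is rejected by the FT_CHECK in GptNeoXDecoderLayerWeight, as is weight-only quant under FP32 compute. The snippet below is only a minimal, hypothetical CPU-side sketch of what per-column symmetric weight-only quantization produces (int8 values plus one scale per output column); it is not FasterTransformer's loadWeightFromBinAndQuantizeForWeightOnly, and the function name quantize_per_column and its rounding choices are assumptions made for illustration.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize a row-major [rows, cols] FP32 weight into int8 values plus one FP32
// scale per output column (symmetric, no zero point). The dequantized value of
// element (r, c) is q[r * cols + c] * scale[c].
void quantize_per_column(const std::vector<float>& w, size_t rows, size_t cols,
                         std::vector<int8_t>& q, std::vector<float>& scale)
{
    q.resize(rows * cols);
    scale.resize(cols);
    for (size_t c = 0; c < cols; ++c) {
        float max_abs = 1e-8f;  // guard against all-zero columns
        for (size_t r = 0; r < rows; ++r) {
            max_abs = std::max(max_abs, std::fabs(w[r * cols + c]));
        }
        scale[c] = max_abs / 127.0f;
        for (size_t r = 0; r < rows; ++r) {
            q[r * cols + c] = static_cast<int8_t>(std::lround(w[r * cols + c] / scale[c]));
        }
    }
}

int main()
{
    // Tiny [2, 3] weight for illustration.
    std::vector<float>  w = {0.5f, -1.0f, 0.25f, -0.5f, 2.0f, 0.125f};
    std::vector<int8_t> q;
    std::vector<float>  s;
    quantize_per_column(w, 2, 3, q, s);
    std::printf("scale[1] = %f, q[0][1] = %d\n", s[1], static_cast<int>(q[1]));
    return 0;
}

At GEMM time such weights would presumably be dequantized on the fly as q * scale[col] against FP16 activations; the per-output-column layout is consistent with the scale sizes allocated in mallocWeights() above (for example, 3 * hidden_units_ / tensor_para_size_ scales for the fused QKV kernel).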