diff --git a/examples/cpp/gptj/gptj_config.ini b/examples/cpp/gptj/gptj_config.ini
index b59a464ad..860904673 100644
--- a/examples/cpp/gptj/gptj_config.ini
+++ b/examples/cpp/gptj/gptj_config.ini
@@ -14,6 +14,7 @@ enable_custom_all_reduce=0
 tensor_para_size=1
 pipeline_para_size=1
+int8_mode=0 ;only support 0 or 1 (when fp16)
 model_name=gptj_6B
 model_dir=../models/j6b_ckpt/
diff --git a/examples/cpp/gptj/gptj_example.cc b/examples/cpp/gptj/gptj_example.cc
index 05f285e85..2ae51e84e 100644
--- a/examples/cpp/gptj/gptj_example.cc
+++ b/examples/cpp/gptj/gptj_example.cc
@@ -92,6 +92,7 @@ void gptj_example(const INIReader reader)
     int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
+    int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
     const size_t head_num = reader.GetInteger(model_name, "head_num");
     const size_t size_per_head = reader.GetInteger(model_name, "size_per_head");
@@ -287,6 +288,7 @@ void gptj_example(const INIReader reader)
                                  tensor_para.rank_,
                                  pipeline_para.world_size_,
                                  pipeline_para.rank_,
+                                 int8_mode,
                                  prompt_learning_type,
                                  prefix_prompt_table_pair);  // optional if you don't need prefix prompts
@@ -336,7 +338,11 @@ void gptj_example(const INIReader reader)
                    &allocator,
                    false,
                    &prop,
-                   attention_type);
+                   attention_type,
+                   int8_mode,
+                   nullptr,
+                   0,
+                   1.0f);
     int* d_output_ids;
     int* d_sequence_lengths;
diff --git a/examples/cpp/gptneox/gptneox_config.ini b/examples/cpp/gptneox/gptneox_config.ini
index ab1bf7556..c4c998529 100644
--- a/examples/cpp/gptneox/gptneox_config.ini
+++ b/examples/cpp/gptneox/gptneox_config.ini
@@ -4,7 +4,7 @@ enable_custom_all_reduce=0
 tensor_para_size=2
 pipeline_para_size=1
-
+int8_mode=0 ;only support 0 or 1 (when fp16)
 model_name=gptneox_20B
 model_dir=../models/gptneox
diff --git a/examples/cpp/gptneox/gptneox_example.cc b/examples/cpp/gptneox/gptneox_example.cc
index 6fc2233ee..93ceb5602 100644
--- a/examples/cpp/gptneox/gptneox_example.cc
+++ b/examples/cpp/gptneox/gptneox_example.cc
@@ -47,6 +47,8 @@ int main(int argc, char* argv[])
         ini_name = "../examples/cpp/gptneox/gptneox_config.ini";
     }
+    std::cout << "Ini file name: " << ini_name << std::endl;
+
     INIReader reader = INIReader(ini_name);
     if (reader.ParseError() < 0) {
         std::cout << "[ERROR] Can't load '" << ini_name << "'\n";
@@ -76,6 +78,7 @@ void gptneox_example(const INIReader reader)
     int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
     int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size");
+    int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
     const size_t head_num = reader.GetInteger(model_name, "head_num");
     const size_t size_per_head = reader.GetInteger(model_name, "size_per_head");
@@ -275,6 +278,7 @@ void gptneox_example(const INIReader reader)
                                     pipeline_para.world_size_,
                                     pipeline_para.rank_,
                                     use_gptj_residual,
+                                    int8_mode,
                                     prompt_learning_type,
                                     prefix_prompt_table_pair);
@@ -321,7 +325,11 @@ void gptneox_example(const INIReader reader)
                        &allocator,
                        false,
                        &prop,
-                       attention_type);
+                       attention_type,
+                       int8_mode,
+                       nullptr,
+                       0,
+                       1.0f);
     int* d_output_ids;
     int* d_sequence_lengths;
diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
index
8e7cb92a2..060704912 100644 --- a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp @@ -24,12 +24,12 @@ #include #include -// #define MMHA_USE_HMMA_FOR_REDUCTION +#define MMHA_USE_HMMA_FOR_REDUCTION // Below are knobs to extend FP32 accumulation for higher FP16 accuracy // Does not seem to affect the accuracy that much -// #define MMHA_USE_FP32_ACUM_FOR_FMA +#define MMHA_USE_FP32_ACUM_FOR_FMA // Seems to slightly improve the accuracy #define MMHA_USE_FP32_ACUM_FOR_OUT @@ -389,26 +389,6 @@ struct Qk_vec_acum_fp32_ { using Type = Float8_; }; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float8_; -}; -template<> -struct Qk_vec_acum_fp32_<__nv_bfloat16> { - using Type = float; -}; -template<> -struct Qk_vec_acum_fp32_<__nv_bfloat162> { - using Type = float2; -}; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float4_; -}; -template<> -struct Qk_vec_acum_fp32_ { - using Type = Float8_; -}; #ifdef ENABLE_FP8 // template<> // struct Qk_vec_acum_fp32_ { diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu index 89f0d5011..c673a6597 100644 --- a/src/fastertransformer/kernels/decoding_kernels.cu +++ b/src/fastertransformer/kernels/decoding_kernels.cu @@ -496,9 +496,9 @@ __global__ void gatherTree(gatherTreeParam param) int tmp_len = param.max_sequence_lengths[batch * param.beam_width + j] + param.max_sequence_length_final_step; // also remove the length of the soft prompts, p_prompt_tuning - param.max_sequence_lengths[batch * param.beam_width + j] = - tmp_len - param.max_prefix_soft_prompt_length - - (param.max_input_length - param.max_input_without_prompt_length); + param.sequence_lengths_for_output[batch * param.beam_width + j] = + uint32_t(tmp_len - param.max_prefix_soft_prompt_length + - (param.max_input_length - param.max_input_without_prompt_length)); // update the response input length if (update_response_input_length) { param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len; diff --git a/src/fastertransformer/kernels/decoding_kernels.h b/src/fastertransformer/kernels/decoding_kernels.h index 7527d8fc4..0ebf15a0c 100644 --- a/src/fastertransformer/kernels/decoding_kernels.h +++ b/src/fastertransformer/kernels/decoding_kernels.h @@ -123,6 +123,7 @@ void invokeGatherTree(int* beams, struct gatherTreeParam { int* beams = nullptr; int* max_sequence_lengths = nullptr; + uint* sequence_lengths_for_output = nullptr; int max_sequence_length_final_step = 0; const int* input_lengths = nullptr; // response input lengths (used to slice the ids during postprocessing) diff --git a/src/fastertransformer/models/gptj/GptJ.cc b/src/fastertransformer/models/gptj/GptJ.cc index 0382d8863..fb7970418 100644 --- a/src/fastertransformer/models/gptj/GptJ.cc +++ b/src/fastertransformer/models/gptj/GptJ.cc @@ -43,6 +43,7 @@ void GptJ::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -60,6 +61,7 @@ void GptJ::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -153,6 +155,13 @@ void GptJ::allocateBuffer( (float*)(allocator_->reMalloc(output_log_probs_buf_, sizeof(float) * batchxbeam * max_seq_len, false)); generation_should_stop_ = 
(bool*)(allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true)); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -205,6 +214,11 @@ void GptJ::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -237,8 +251,10 @@ GptJ::GptJ(size_t max_batch_size, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -252,7 +268,9 @@ GptJ::GptJ(size_t max_batch_size, prompt_learning_type_(prompt_learning_type), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -297,8 +315,10 @@ GptJ::GptJ(size_t max_batch_size, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -315,8 +335,10 @@ GptJ::GptJ(size_t max_batch_size, pipeline_para_(pipeline_para), local_head_num_(head_num / tensor_para.world_size_), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), - enable_custom_all_reduce_(enable_custom_all_reduce) + enable_custom_all_reduce_(enable_custom_all_reduce), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -345,8 +367,10 @@ GptJ::GptJ(GptJ const& gpt): local_head_num_(gpt.local_head_num_), vocab_size_padded_(gpt.vocab_size_padded_), attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), - enable_custom_all_reduce_(gpt.enable_custom_all_reduce_) + enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -584,6 +608,23 @@ void GptJ::forward(std::unordered_map* output_tens cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + 
batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -685,6 +726,14 @@ void GptJ::forward(std::unordered_map* output_tens {batch_size * beam_width}, has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, diff --git a/src/fastertransformer/models/gptj/GptJ.h b/src/fastertransformer/models/gptj/GptJ.h index 63fec7e93..5865540e7 100644 --- a/src/fastertransformer/models/gptj/GptJ.h +++ b/src/fastertransformer/models/gptj/GptJ.h @@ -53,11 +53,14 @@ class GptJ: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = (std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM") == nullptr || std::string(std::getenv("CONTEXT_ATTENTION_BMM1_HALF_ACCUM")) != "ON"); + + float shared_contexts_ratio_; // Prompt Learning Parameters PromptLearningType prompt_learning_type_; @@ -117,6 +120,11 @@ class GptJ: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -161,8 +169,10 @@ class GptJ: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptJ(size_t max_batch_size, size_t max_seq_len, @@ -193,8 +203,10 @@ class GptJ: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptJ(GptJ const& GptJ); diff --git a/src/fastertransformer/models/gptj/GptJContextDecoder.cc b/src/fastertransformer/models/gptj/GptJContextDecoder.cc index 28560c634..2f69541bc 100644 --- a/src/fastertransformer/models/gptj/GptJContextDecoder.cc +++ b/src/fastertransformer/models/gptj/GptJContextDecoder.cc @@ -40,7 +40,7 @@ void GptJContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -57,7 +57,7 @@ void GptJContextDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -70,7 +70,7 @@ void GptJContextDecoder::allocateBuffer() } template -void GptJContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void GptJContextDecoder::allocateBuffer(size_t 
batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -84,6 +84,18 @@ void GptJContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } is_allocate_buffer_ = true; } @@ -98,6 +110,13 @@ void GptJContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -149,6 +168,7 @@ GptJContextDecoder::GptJContextDecoder(size_t ma bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -166,6 +186,7 @@ GptJContextDecoder::GptJContextDecoder(size_t ma pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -188,6 +209,7 @@ GptJContextDecoder::GptJContextDecoder(GptJContextDecoder const& decoder): pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -241,15 +263,23 @@ void GptJContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
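
GptJContextDecoder::forward() below runs on a compacted batch whenever a "compact_idx" tensor is supplied. The mapping itself is built earlier, in GptJ::forward(), by invokeFindContextDups. As a rough illustration of the contract these index buffers appear to follow (names taken from this diff; the real kernel runs on the GPU and may break ties differently), a CPU sketch:

#include <algorithm>
#include <vector>

// CPU sketch of the compaction mapping produced by invokeFindContextDups.
// compact_idx          : batch indices of the unique prompts (size = compact_size)
// batch_to_compact_idx : for every batch*beam entry, the row of the compacted
//                        batch that holds its context pass.
static int find_context_dups_cpu(const std::vector<int>& input_ids,  // [batch_size, max_input_length]
                                 int                     batch_size,
                                 int                     beam_width,
                                 int                     max_input_length,
                                 std::vector<int>&       compact_idx,
                                 std::vector<int>&       batch_to_compact_idx)
{
    std::vector<int> shared_idx(batch_size);  // first batch index with an identical prompt
    compact_idx.clear();
    batch_to_compact_idx.assign(batch_size * beam_width, 0);

    for (int b = 0; b < batch_size; ++b) {
        int match = b;
        for (int prev = 0; prev < b; ++prev) {
            if (std::equal(input_ids.begin() + prev * max_input_length,
                           input_ids.begin() + (prev + 1) * max_input_length,
                           input_ids.begin() + b * max_input_length)) {
                match = prev;
                break;
            }
        }
        shared_idx[b] = match;
        if (match == b) {
            compact_idx.push_back(b);  // b introduces a new unique prompt
        }
        // row of the representative prompt inside the compacted batch
        const int compact_pos = static_cast<int>(
            std::find(compact_idx.begin(), compact_idx.end(), shared_idx[b]) - compact_idx.begin());
        for (int beam = 0; beam < beam_width; ++beam) {
            batch_to_compact_idx[b * beam_width + beam] = compact_pos;
        }
    }
    return static_cast<int>(compact_idx.size());  // compact_size
}
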
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -257,6 +287,20 @@ void GptJContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -274,6 +318,12 @@ void GptJContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -282,7 +332,8 @@ void GptJContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -298,8 +349,9 @@ void GptJContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -310,11 +362,11 @@ void GptJContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -339,12 +391,14 @@ void GptJContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); const bool is_final = false; // TODO(bhsueh) remove this flag + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -352,7 +406,7 @@ void GptJContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -388,17 +442,43 @@ void GptJContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l).self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
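
When the compact path is taken, the attention layer above has written its K/V into the temporary k_cache_layer_/v_cache_layer_ buffers, so the results still have to be scattered back to the full, per-request caches and outputs. A minimal CPU sketch of that scatter for the decoder output (cf. invokeUnCompactOutputs further down); the K/V variant, invokeUnCompactCaches, does the same thing per layer but also has to respect the cache layout and the local-batch iteration, which this sketch ignores:

#include <algorithm>
#include <cstddef>
#include <vector>

// CPU sketch of the "uncompact" scatter: every request copies the features
// computed for its representative prompt. Assumed layout: row-major
// [request_batch, seq_len * hidden_units].
static void uncompact_outputs_cpu(std::vector<float>&       decoder_output,    // [request_batch, row_size]
                                  const std::vector<float>& compact_features,  // [compact_size, row_size]
                                  const std::vector<int>&   batch_to_compact_idx,
                                  std::size_t               request_batch_size,
                                  std::size_t               row_size)  // seq_len * hidden_units
{
    decoder_output.resize(request_batch_size * row_size);
    for (std::size_t b = 0; b < request_batch_size; ++b) {
        const float* src = compact_features.data() + batch_to_compact_idx[b] * row_size;
        std::copy(src, src + row_size, decoder_output.data() + b * row_size);
    }
}
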
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { TensorMap ffn_input_tensors( {{"ffn_input", @@ -429,7 +509,8 @@ void GptJContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -440,12 +521,22 @@ void GptJContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptj/GptJContextDecoder.h b/src/fastertransformer/models/gptj/GptJContextDecoder.h index 742a12f72..548aa9a77 100644 --- a/src/fastertransformer/models/gptj/GptJContextDecoder.h +++ b/src/fastertransformer/models/gptj/GptJContextDecoder.h @@ -59,6 +59,7 @@ class GptJContextDecoder: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + int int8_mode_ = 0; bool is_qk_buf_float_; @@ -66,7 +67,7 @@ class GptJContextDecoder: public BaseLayer { FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -85,6 +86,12 @@ class GptJContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: GptJContextDecoder(size_t max_batch_size, size_t max_seq_len, @@ -103,6 +110,7 @@ class GptJContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptj/GptJDecoder.cc b/src/fastertransformer/models/gptj/GptJDecoder.cc index fdcbe593f..efc4be19d 100644 --- a/src/fastertransformer/models/gptj/GptJDecoder.cc +++ b/src/fastertransformer/models/gptj/GptJDecoder.cc @@ -35,7 +35,7 @@ 
void GptJDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -52,7 +52,7 @@ void GptJDecoder::initialize() true, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -134,6 +134,7 @@ GptJDecoder::GptJDecoder(size_t max_batch_size, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -148,6 +149,7 @@ GptJDecoder::GptJDecoder(size_t max_batch_size, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +169,7 @@ GptJDecoder::GptJDecoder(GptJDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -268,7 +271,7 @@ void GptJDecoder::forward(std::unordered_map* outp local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptj/GptJDecoder.h b/src/fastertransformer/models/gptj/GptJDecoder.h index 019bfb783..be5381e97 100644 --- a/src/fastertransformer/models/gptj/GptJDecoder.h +++ b/src/fastertransformer/models/gptj/GptJDecoder.h @@ -72,6 +72,8 @@ class GptJDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: GptJDecoder(size_t max_batch_size, size_t head_num, @@ -87,6 +89,7 @@ class GptJDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc index f3ef6e689..10ab4521e 100644 --- a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc +++ b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.cc @@ -23,14 +23,25 @@ template GptJDecoderLayerWeight::GptJDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, - const int tensor_para_rank): + const int tensor_para_rank, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) + tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "GptJ doesn't support int8_model == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +GptJDecoderLayerWeight::GptJDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -51,30 +62,78 @@ GptJDecoderLayerWeight::~GptJDecoderLayerWeight() ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < 
int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -GptJDecoderLayerWeight::GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_) +void GptJDecoderLayerWeight::copyFrom(const GptJDecoderLayerWeight& other) { - mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], hidden_units_); + } + } +} + +template +GptJDecoderLayerWeight::GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + 
tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -85,20 +144,10 @@ GptJDecoderLayerWeight& GptJDecoderLayerWeight::operator=(const GptJDecode inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; mallocWeights(); - - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_); - + copyFrom(other); setWeightPtr(); return *this; } @@ -113,32 +162,64 @@ void GptJDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType m weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", - model_file_type); // GPT-J does not have bias for QKV cudaMemset(weights_ptr[3], 0, sizeof(T) * 3 * hidden_units_ / tensor_para_size_); - loadWeightFromBin(weights_ptr[4], + + loadWeightFromBin(weights_ptr[6], + {(size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", + model_file_type); + loadWeightFromBin( + weights_ptr[8], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + + // Load weights for GPT + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - - loadWeightFromBin(weights_ptr[5], + + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[6], - {(size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", - model_file_type); - loadWeightFromBin(weights_ptr[7], + + loadWeightFromBin(weights_ptr[7], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", model_file_type); - loadWeightFromBin( - weights_ptr[8], {(size_t)hidden_units_}, dir_path + ".mlp.dense_4h_to_h.bias.bin", model_file_type); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", + model_file_type); + } } template @@ -155,6 +236,20 @@ void GptJDecoderLayerWeight::setWeightPtr() ffn_weights.output_weight.kernel = weights_ptr[7]; ffn_weights.output_weight.bias = weights_ptr[8]; + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[3]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[3]; + } + } + is_maintain_buffer = true; } @@ -163,14 +258,32 @@ void GptJDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - deviceMalloc(&weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[6], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); deviceMalloc(&weights_ptr[8], hidden_units_); + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[5], hidden_units_ * inter_size_ / tensor_para_size_); // ffn inter weight + deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_ * hidden_units_); // ffn output weight + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / 
tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], hidden_units_); + } + } } template struct GptJDecoderLayerWeight; diff --git a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h index ea1b0f1d7..29402469a 100644 --- a/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h +++ b/src/fastertransformer/models/gptj/GptJDecoderLayerWeight.h @@ -29,10 +29,12 @@ template struct GptJDecoderLayerWeight { public: GptJDecoderLayerWeight() = delete; + GptJDecoderLayerWeight(const int int8_mode); GptJDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, - const int tensor_para_rank = 0); + const int tensor_para_rank = 0, + const int int8_mode = 0); ~GptJDecoderLayerWeight(); GptJDecoderLayerWeight(const GptJDecoderLayerWeight& other); GptJDecoderLayerWeight& operator=(const GptJDecoderLayerWeight& other); @@ -50,9 +52,14 @@ struct GptJDecoderLayerWeight { int tensor_para_rank_; bool is_maintain_buffer = false; T* weights_ptr[9]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(4, nullptr); + std::vector weight_only_scale_ptr = std::vector(4, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const GptJDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/gptj/GptJWeight.cc b/src/fastertransformer/models/gptj/GptJWeight.cc index e64162b1f..0a3a9193b 100644 --- a/src/fastertransformer/models/gptj/GptJWeight.cc +++ b/src/fastertransformer/models/gptj/GptJWeight.cc @@ -28,6 +28,7 @@ GptJWeight::GptJWeight(const int hidden_unit const int tensor_para_rank, const int layer_para_size, const int layer_para_rank, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -39,6 +40,7 @@ GptJWeight::GptJWeight(const int hidden_unit tensor_para_rank_(tensor_para_rank), layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -60,7 +62,7 @@ GptJWeight::GptJWeight(const int hidden_unit for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back( - GptJDecoderLayerWeight(hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_)); + GptJDecoderLayerWeight(hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -101,6 +103,7 @@ GptJWeight::GptJWeight(const GptJWeight& other): tensor_para_rank_(other.tensor_para_rank_), layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -148,6 +151,7 @@ GptJWeight& GptJWeight::operator=(const GptJWeight& other) tensor_para_rank_ = other.tensor_para_rank_; layer_para_size_ = other.layer_para_size_; 
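
For int8_mode == 1 the four dense GEMM weights of each layer are kept as int8 with one scale per output channel (weight-only quantization), which is why weight_only_scale_ptr[0..3] above are sized 3*hidden/tp, hidden, inter/tp and hidden respectively. A rough CPU sketch of the symmetric per-column quantization that loadWeightFromBinAndQuantizeForWeightOnly is presumably performing (rounding and layout are assumptions; the real helper may additionally permute the int8 weights into the layout the fused GEMM kernels expect):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU sketch of symmetric per-output-channel weight-only quantization:
// a row-major weight W[rows x cols] is stored as int8 plus one float scale
// per column, so that W[r][c] ~= int8_w[r][c] * scale[c].
static void quantize_weight_only_cpu(const std::vector<float>& w,  // [rows * cols]
                                     std::size_t               rows,
                                     std::size_t               cols,
                                     std::vector<int8_t>&      int8_w,  // [rows * cols]
                                     std::vector<float>&       scale)   // [cols]
{
    int8_w.resize(rows * cols);
    scale.assign(cols, 0.0f);
    for (std::size_t c = 0; c < cols; ++c) {
        float max_abs = 0.0f;
        for (std::size_t r = 0; r < rows; ++r) {
            max_abs = std::max(max_abs, std::fabs(w[r * cols + c]));
        }
        scale[c] = max_abs / 127.0f;
        for (std::size_t r = 0; r < rows; ++r) {
            const float q = (scale[c] > 0.0f) ? std::round(w[r * cols + c] / scale[c]) : 0.0f;
            int8_w[r * cols + c] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
        }
    }
}
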
layer_para_rank_ = other.layer_para_rank_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/gptj/GptJWeight.h b/src/fastertransformer/models/gptj/GptJWeight.h index c75547e65..34e7ce673 100644 --- a/src/fastertransformer/models/gptj/GptJWeight.h +++ b/src/fastertransformer/models/gptj/GptJWeight.h @@ -37,6 +37,7 @@ struct GptJWeight { const int tensor_para_rank = 0, const int layer_para_size = 1, const int layer_para_rank = 0, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -80,6 +81,8 @@ struct GptJWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // prompt learning pair (task_name, (task_name_id, prompt_len)) PromptLearningType prompt_learning_type_; std::map> prompt_learning_pair_; diff --git a/src/fastertransformer/models/gptneox/GptNeoX.cc b/src/fastertransformer/models/gptneox/GptNeoX.cc index 2ce2dae7b..07356bec8 100644 --- a/src/fastertransformer/models/gptneox/GptNeoX.cc +++ b/src/fastertransformer/models/gptneox/GptNeoX.cc @@ -42,6 +42,7 @@ void GptNeoX::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void GptNeoX::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -150,6 +152,13 @@ void GptNeoX::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -201,6 +210,11 @@ void GptNeoX::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -230,8 +244,10 @@ GptNeoX::GptNeoX(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -246,7 +262,9 @@ GptNeoX::GptNeoX(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -288,8 +306,10 @@ GptNeoX::GptNeoX(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr 
custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -308,7 +328,9 @@ GptNeoX::GptNeoX(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -339,7 +361,9 @@ GptNeoX::GptNeoX(GptNeoX const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -561,6 +585,23 @@ void GptNeoX::forward(std::unordered_map* output_t cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -662,6 +703,14 @@ void GptNeoX::forward(std::unordered_map* output_t {batch_size * beam_width}, has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -1137,6 +1186,7 @@ void GptNeoX::setOutputTensors(std::unordered_map* gatherTreeParam param; param.beams = transposed_output_ids_buf_; param.max_sequence_lengths = sequence_lengths_; + param.sequence_lengths_for_output = sequence_lengths; // add sequence_length 1 here because the sequence_length of time step t is t - 1 param.max_sequence_length_final_step = 1; param.max_time = max_output_seq_len; @@ -1154,8 +1204,8 @@ void GptNeoX::setOutputTensors(std::unordered_map* param.stream = stream_; param.output_ids = output_tensors->at("output_ids").getPtr(); invokeGatherTree(param); - invokeCudaD2DcpyConvert( - sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); + // invokeCudaD2DcpyConvert( + // sequence_lengths, sequence_lengths_, output_tensors->at("sequence_length").size(), stream_); sync_check_cuda_error(); } if ((output_tensors->count("output_log_probs") > 0 && output_tensors->at("output_log_probs").data != nullptr)) { diff --git a/src/fastertransformer/models/gptneox/GptNeoX.h b/src/fastertransformer/models/gptneox/GptNeoX.h index 9749a2070..1b3a1b1c1 100644 --- a/src/fastertransformer/models/gptneox/GptNeoX.h +++ b/src/fastertransformer/models/gptneox/GptNeoX.h @@ -41,6 +41,7 @@ class GptNeoX: public BaseLayer { static constexpr bool neox_rotary_style_ = true; static constexpr float layernorm_eps_ = 1e-5f; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class GptNeoX: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class GptNeoX: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -161,8 +168,10 @@ class GptNeoX: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptNeoX(size_t head_num, size_t size_per_head, @@ -190,8 +199,10 @@ class GptNeoX: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); GptNeoX(GptNeoX const& GptNeoX); diff --git a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc index f23d1a977..4b56e271f 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.cc @@ -40,7 +40,7 
@@ void GptNeoXContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -57,7 +57,7 @@ void GptNeoXContextDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -70,7 +70,7 @@ void GptNeoXContextDecoder::allocateBuffer() } template -void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -84,6 +84,19 @@ void GptNeoXContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -98,6 +111,13 @@ void GptNeoXContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -148,6 +168,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -164,6 +185,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -186,6 +208,7 @@ GptNeoXContextDecoder::GptNeoXContextDecoder(GptNeoXContextDecoder const& pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -239,15 +262,23 @@ void GptNeoXContextDecoder::forward(std::unordered_map* // For example, the shape of 
decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -255,6 +286,20 @@ void GptNeoXContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -272,6 +317,12 @@ void GptNeoXContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -280,7 +331,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -296,8 +348,9 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -309,11 +362,11 @@ void GptNeoXContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -338,10 +391,12 @@ void GptNeoXContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -349,7 +404,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -384,16 +439,42 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } if (is_final == false) { if (use_gptj_residual_) { @@ -405,7 +486,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { @@ -424,7 +505,7 @@ void GptNeoXContextDecoder::forward(std::unordered_map* (float*)nullptr, (float*)nullptr, (float*)nullptr, - 0, + int8_mode_, stream_); } @@ -483,7 +564,8 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -494,12 +576,22 @@ void GptNeoXContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h index c81dcfe90..becc24277 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h +++ b/src/fastertransformer/models/gptneox/GptNeoXContextDecoder.h @@ -56,13 +56,15 @@ class GptNeoXContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class GptNeoXContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: GptNeoXContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class GptNeoXContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int 
enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc b/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc index 7b73ba8ee..50372f68e 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoder.cc @@ -35,7 +35,7 @@ void GptNeoXDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -52,7 +52,7 @@ void GptNeoXDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, false, // use_gated_activation = false; custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -134,6 +134,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -148,6 +149,7 @@ GptNeoXDecoder::GptNeoXDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -168,6 +170,7 @@ GptNeoXDecoder::GptNeoXDecoder(GptNeoXDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -269,7 +272,7 @@ void GptNeoXDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); sync_check_cuda_error(); @@ -304,7 +307,7 @@ void GptNeoXDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { @@ -323,7 +326,7 @@ void GptNeoXDecoder::forward(std::unordered_map* (float*)nullptr, (float*)nullptr, (float*)nullptr, - 0, + int8_mode_, stream_); } diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoder.h b/src/fastertransformer/models/gptneox/GptNeoXDecoder.h index add736adc..297d22067 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoder.h +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoder.h @@ -70,6 +70,8 @@ class GptNeoXDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: GptNeoXDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class GptNeoXDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc index 3d62df83d..2ee56f4b6 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.cc @@ -24,15 +24,26 @@ GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), 
inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode), use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "GptNeoX doesn't support int8_mode == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -58,34 +69,84 @@ GptNeoXDecoderLayerWeight::~GptNeoXDecoderLayerWeight() ffn_weights.intermediate_weight.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void GptNeoXDecoderLayerWeight::copyFrom(const GptNeoXDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { +
cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], hidden_units_); + } + } +} + +template +GptNeoXDecoderLayerWeight::GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -96,24 +157,11 @@ GptNeoXDecoderLayerWeight& GptNeoXDecoderLayerWeight::operator=(const GptN inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -128,38 +176,22 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".input_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], - {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, - dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", - model_file_type); loadWeightFromBin(weights_ptr[3], {(size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.bias." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[4], - {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".attention.dense.weight." 
+ rank_spec + ".bin", - model_file_type); - if (!use_gptj_residual_) { loadWeightFromBin( weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.dense.bias.bin", model_file_type); } - loadWeightFromBin(weights_ptr[6], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, - dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", - model_file_type); loadWeightFromBin(weights_ptr[7], {(size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.dense_h_to_4h.bias." + rank_spec + ".bin", model_file_type); - loadWeightFromBin(weights_ptr[8], - {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, - dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", - model_file_type); + if (use_gptj_residual_) { loadWeightFromBin( weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".mlp.attention.bias.sum.bin", model_file_type); @@ -172,6 +204,54 @@ void GptNeoXDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataTyp weights_ptr[10], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.bias.bin", model_file_type); loadWeightFromBin( weights_ptr[11], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + // Load weights for GPT + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." + rank_spec + ".bin", + model_file_type); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.dense_h_to_4h.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.dense_4h_to_h.weight." 
+ rank_spec + ".bin", + model_file_type); + } } template @@ -191,6 +271,21 @@ void GptNeoXDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[10]; post_attention_layernorm_weights.gamma = weights_ptr[11]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[3]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[3]; + } + } + is_maintain_buffer = true; } @@ -199,19 +294,38 @@ void GptNeoXDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); deviceMalloc(&weights_ptr[1], hidden_units_); - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); - deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); deviceMalloc(&weights_ptr[9], hidden_units_); deviceMalloc(&weights_ptr[10], hidden_units_); deviceMalloc(&weights_ptr[11], hidden_units_); + + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // ffn inter weight + deviceMalloc(&weights_ptr[8], inter_size_ / tensor_para_size_ * hidden_units_); // ffn output weight + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], hidden_units_); + } + } } template struct GptNeoXDecoderLayerWeight; diff --git a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h index 2850da466..bca2b2d3a 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct GptNeoXDecoderLayerWeight { public: GptNeoXDecoderLayerWeight() = default; + 
GptNeoXDecoderLayerWeight(const int int8_mode); GptNeoXDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~GptNeoXDecoderLayerWeight(); GptNeoXDecoderLayerWeight(const GptNeoXDecoderLayerWeight& other); GptNeoXDecoderLayerWeight& operator=(const GptNeoXDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct GptNeoXDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[12]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(4, nullptr); + std::vector weight_only_scale_ptr = std::vector(4, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const GptNeoXDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc index 26995f255..56ece0d47 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXWeight.cc +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.cc @@ -29,6 +29,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ GptNeoXWeight::GptNeoXWeight(const int hidde for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new GptNeoXDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ GptNeoXWeight::GptNeoXWeight(const GptNeoXWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ GptNeoXWeight& GptNeoXWeight::operator=(const GptNeoXWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/gptneox/GptNeoXWeight.h b/src/fastertransformer/models/gptneox/GptNeoXWeight.h index 3e868854e..f360d5328 100644 --- a/src/fastertransformer/models/gptneox/GptNeoXWeight.h +++ b/src/fastertransformer/models/gptneox/GptNeoXWeight.h @@ -38,6 +38,7 @@ struct GptNeoXWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = true, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, 
std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct GptNeoXWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_; diff --git a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h index 222fdd409..465d80fa8 100755 --- a/src/fastertransformer/th_op/gptneox/GptNeoXOp.h +++ b/src/fastertransformer/th_op/gptneox/GptNeoXOp.h @@ -198,6 +198,7 @@ class FTGptNeoX: public IFGptNeoX { false, // is_free_buffer_after_forward &prop_, // cuda_device_prop attention_type, // attention_type + 0, // don't support int8 in python for now nullptr, // custom_all_reduce_comm 0); // enable_custom_all_reduce diff --git a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc index ea17bc5dc..4fb499c9e 100644 --- a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc +++ b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.cc @@ -53,7 +53,7 @@ std::shared_ptr AbstractTransformerModel::createGptJMo prompt_learning_table_pair.insert({task_name, {task_name_id, prompt_length}}); } - if (data_type == "fp16") { + if (data_type == "half" || data_type == "fp16") { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "max_seq_len"), reader.GetInteger(model_name, "head_num"), @@ -71,7 +71,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } else if (data_type == "fp32") { return std::make_shared>( @@ -91,7 +92,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #ifdef ENABLE_BF16 else if (data_type == "bf16") { @@ -112,7 +114,8 @@ std::shared_ptr AbstractTransformerModel::createGptJMo reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_name, - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } #endif else { @@ -125,11 +128,13 @@ template GptJTritonModel::GptJTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir): + std::string model_dir, + int int8_mode): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), enable_custom_all_reduce_(enable_custom_all_reduce), shared_weights_(std::vector>>(ft::getDeviceCount())), + int8_mode_(int8_mode), model_dir_(model_dir) { INIReader reader = INIReader(model_dir + "/config.ini"); @@ -202,7 +207,8 @@ GptJTritonModel::GptJTritonModel(size_t m size_t pipeline_para_size, int enable_custom_all_reduce, std::string model_name, - std::string model_dir): + std::string model_dir, + int int8_mode): max_seq_len_(max_seq_len), head_num_(head_num), size_per_head_(size_per_head), @@ -216,6 +222,7 @@ GptJTritonModel::GptJTritonModel(size_t m pipeline_para_size_(pipeline_para_size), enable_custom_all_reduce_(enable_custom_all_reduce), shared_weights_(std::vector>>(ft::getDeviceCount())), + int8_mode_(int8_mode), 
model_name_(model_name), model_dir_(model_dir), prompt_learning_start_id_(prompt_learning_start_id), @@ -308,6 +315,7 @@ GptJTritonModel::createModelInstance(int false, cuda_device_prop_ptr.get(), attention_type, + int8_mode_, custom_all_reduce_comm, enable_custom_all_reduce_)); @@ -335,6 +343,7 @@ void GptJTritonModel::createSharedWeights(int device_id, int rank) tensor_para_rank, pipeline_para_size_, pipeline_para_rank, + int8_mode_, prompt_learning_type_, prompt_learning_table_pair_); shared_weights_[device_id]->loadModel(model_dir_); @@ -350,6 +359,7 @@ std::string GptJTritonModel::toString() << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nstart_id: " << start_id_ << "\nend_id: " << end_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nint8_mode: " << int8_mode_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; return ss.str(); } diff --git a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h index c1d270a3b..582cef666 100644 --- a/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h +++ b/src/fastertransformer/triton_backend/gptj/GptJTritonModel.h @@ -43,12 +43,14 @@ struct GptJTritonModel: public AbstractTransformerModel { size_t pipeline_para_size, int enable_custom_all_reduce, std::string model_name, - std::string model_dir); + std::string model_dir, + int int8_mode); GptJTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir); + std::string model_dir, + int int8_mode); ~GptJTritonModel() = default; @@ -84,6 +86,8 @@ struct GptJTritonModel: public AbstractTransformerModel { bool is_fp16_; int enable_custom_all_reduce_ = 0; + int int8_mode_ = 0; + // shared weights for each device std::vector>> shared_weights_; diff --git a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc index 206f81ad3..e1350fe2b 100644 --- a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc +++ b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.cc @@ -34,19 +34,21 @@ std::shared_ptr AbstractTransformerModel::createGptNeo int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); - if (data_type == "half") { + if (data_type == "half" || data_type == "fp16") { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } else { return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), - model_dir); + model_dir, + reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0)); } } @@ -54,11 +56,13 @@ template GptNeoXTritonModel::GptNeoXTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir): + std::string model_dir, + int int8_mode): 
tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), shared_weights_(std::vector>>(ft::getDeviceCount())), - enable_custom_all_reduce_(enable_custom_all_reduce) + enable_custom_all_reduce_(enable_custom_all_reduce), + int8_mode_(int8_mode) { model_dir_ = model_dir; const std::string inifile{model_dir + "/config.ini"}; @@ -168,6 +172,7 @@ std::unique_ptr GptNeoXTritonModel::createM false, cuda_device_prop_ptr.get(), attention_type, + int8_mode_, custom_all_reduce_comm, enable_custom_all_reduce_)); @@ -197,6 +202,7 @@ void GptNeoXTritonModel::createSharedWeights(int device_id, int rank) pipeline_para_size_, pipeline_para_rank, use_gptj_residual_, + int8_mode_, prompt_learning_type_, prompt_learning_table_pair_); shared_weights_[device_id]->loadModel(model_dir_); @@ -214,6 +220,7 @@ std::string GptNeoXTritonModel::toString() << "\nprompt_learning_type_: " << static_cast(prompt_learning_type_) << "\nprompt_learning_start_id_: " << prompt_learning_start_id_ << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ + << "\nint8_mode: " << int8_mode_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << std::endl; return ss.str(); } diff --git a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h index 9fe16e311..3ed113e79 100644 --- a/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h +++ b/src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h @@ -30,7 +30,8 @@ struct GptNeoXTritonModel: public AbstractTransformerModel { GptNeoXTritonModel(size_t tensor_para_size, size_t pipeline_para_size, int enable_custom_all_reduce, - std::string model_dir); + std::string model_dir, + int int8_mode); ~GptNeoXTritonModel() = default; @@ -68,6 +69,8 @@ struct GptNeoXTritonModel: public AbstractTransformerModel { // residual type bool use_gptj_residual_ = true; + int int8_mode_ = 0; + // number of tasks (for prefix-prompt, p/prompt-tuning) size_t num_tasks_ = 0; int prompt_learning_start_id_ = 0;
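Editor's note on the int8_mode plumbing above: with int8_mode == 1, each attention/FFN GEMM weight is stored as an int8 kernel (int8_weights_ptr) plus one floating-point scale per output channel (weight_only_scale_ptr), wired up to the *_weight.int8_kernel / *_weight.weight_only_quant_scale fields in setWeightPtr(); mode 2 is rejected by the FT_CHECK in GptNeoXDecoderLayerWeight, as is weight-only quant under FP32 compute. The snippet below is only a minimal, hypothetical CPU-side sketch of what per-column symmetric weight-only quantization produces (int8 values plus one scale per output column); it is not FasterTransformer's loadWeightFromBinAndQuantizeForWeightOnly, and the function name quantize_per_column and its rounding choices are assumptions made for illustration.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize a row-major [rows, cols] FP32 weight into int8 values plus one FP32
// scale per output column (symmetric, no zero point). The dequantized value of
// element (r, c) is q[r * cols + c] * scale[c].
void quantize_per_column(const std::vector<float>& w, size_t rows, size_t cols,
                         std::vector<int8_t>& q, std::vector<float>& scale)
{
    q.resize(rows * cols);
    scale.resize(cols);
    for (size_t c = 0; c < cols; ++c) {
        float max_abs = 1e-8f;  // guard against all-zero columns
        for (size_t r = 0; r < rows; ++r) {
            max_abs = std::max(max_abs, std::fabs(w[r * cols + c]));
        }
        scale[c] = max_abs / 127.0f;
        for (size_t r = 0; r < rows; ++r) {
            q[r * cols + c] = static_cast<int8_t>(std::lround(w[r * cols + c] / scale[c]));
        }
    }
}

int main()
{
    // Tiny [2, 3] weight for illustration.
    std::vector<float>  w = {0.5f, -1.0f, 0.25f, -0.5f, 2.0f, 0.125f};
    std::vector<int8_t> q;
    std::vector<float>  s;
    quantize_per_column(w, 2, 3, q, s);
    std::printf("scale[1] = %f, q[0][1] = %d\n", s[1], static_cast<int>(q[1]));
    return 0;
}

At GEMM time such weights would presumably be dequantized on the fly as q * scale[col] against FP16 activations; the per-output-column layout is consistent with the scale sizes allocated in mallocWeights() above (for example, 3 * hidden_units_ / tensor_para_size_ scales for the fused QKV kernel).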