diff --git a/.gitignore b/.gitignore index 77849f435..5b49d9183 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ __pycache__/ **/.ipynb_checkpoints/ /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ +20B_checkpoints/ +compile_commands.json +model/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..0d879611a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,7 +418,7 @@ add_library(transformer-shared SHARED if (BUILD_MULTI_GPU) target_link_libraries(transformer-shared PUBLIC - -lmpi + -lmpi -lmpi_cxx ${NCCL_LIBRARIES} ) endif() diff --git a/FasterTransformerReadME.md b/FasterTransformerReadME.md new file mode 100644 index 000000000..a00e0d631 --- /dev/null +++ b/FasterTransformerReadME.md @@ -0,0 +1,417 @@ +# FasterTransformer + +This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +## Table Of Contents + +- [FasterTransformer](#fastertransformer) + - [Table Of Contents](#table-of-contents) + - [Model overview](#model-overview) + - [Support matrix](#support-matrix) + - [Advanced](#advanced) + - [Global Environment](#global-environment) + - [Performance](#performance) + - [BERT base performance](#bert-base-performance) + - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) + - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) + - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) + - [Decoding and Decoder performance](#decoding-and-decoder-performance) + - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) + - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) + - [GPT performance](#gpt-performance) + - [Release notes](#release-notes) + - [Changelog](#changelog) + - [Known issues](#known-issues) + +## Model overview + +In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. + +FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. 
+ +### Support matrix + +| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | +| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | +| BERT | TensorFlow | Yes | Yes | - | - | - | - | +| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | +| BERT | Triton backend | Yes | - | - | Yes | Yes | - | +| BERT | C++ | Yes | Yes | - | - | - | Yes | +| XLNet | C++ | Yes | - | - | - | - | - | +| Encoder | TensorFlow | Yes | Yes | - | - | - | - | +| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | +| Decoder | TensorFlow | Yes | - | - | - | - | - | +| Decoder | PyTorch | Yes | - | - | - | - | - | +| Decoding | TensorFlow | Yes | - | - | - | - | - | +| Decoding | PyTorch | Yes | - | - | - | - | - | +| GPT | TensorFlow | Yes | - | - | - | - | - | +| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | +| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | +| Longformer | PyTorch | Yes | - | - | - | - | - | +| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | +| T5 | TensorFlow 2 | Yes | - | - | - | - | - | +| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | +| T5 | TensorRT | Yes | - | - | Yes | Yes | - | +| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | +| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | +| ViT | PyTorch | Yes | Yes | - | - | - | - | +| ViT | TensorRT | Yes | Yes | - | - | - | - | +| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | +| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | +| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | +| WeNet | C++ | Yes | - | - | - | - | - | +| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | +| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | + +* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. + +More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. + +## Advanced + +The following code lists the directory structure of FasterTransformer: + +``` +/src/fastertransformer: source code of FasterTransformer + |--/cutlass_extensions: Implementation of cutlass gemm/kernels. + |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. + |--/layers: Implementation of layer modules, like attention layer, ffn layer. + |--/models: Implementation of different models, like BERT, GPT. + |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
+ |--/tf_op: custom TensorFlow OP implementation + |--/th_op: custom PyTorch OP implementation + |--/triton_backend: custom Triton backend implementation + |--/utils: common CUDA utilities, like cublasMMWrapper, memory_utils +/examples: C++, TensorFlow and PyTorch interface examples + |--/cpp: C++ interface examples + |--/pytorch: PyTorch OP examples + |--/tensorflow: TensorFlow OP examples + |--/tensorrt: TensorRT examples +/docs: documents that explain the implementation details of the different models and show the benchmarks +/benchmark: scripts to run the benchmarks of the different models +/tests: unit tests +/templates: documents that explain how to add a new model/example into the FasterTransformer repo +``` + +Note that many folders contain sub-folders for the different models. The quantization tools have been moved to `examples`, e.g. `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. + + +### Global Environment + +FasterTransformer provides some convenient environment variables for debugging and testing. + +1. `FT_LOG_LEVEL`: controls the log level of debug messages. More details are in `src/fastertransformer/utils/logger.h`. Note that the program prints many messages and becomes very slow when the level is lower than `DEBUG`. +2. `FT_NVTX`: if it is set to `ON`, as in `FT_NVTX=ON ./bin/gpt_example`, the program inserts NVTX tags to help with profiling. +3. `FT_DEBUG_LEVEL`: if it is set to `DEBUG`, the program runs `cudaDeviceSynchronize()` after every kernel launch; otherwise, kernels are executed asynchronously by default. This helps to locate the failing kernel during debugging, but it affects performance significantly, so it should be used only for debugging. + +## Performance + +Hardware settings: + +* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor +* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz + +In order to run the following benchmarks, we need to install the Unix calculator tool "bc" by + +```bash +apt-get install bc +``` + +### BERT base performance + +The FP16 results of TensorFlow were obtained by running `benchmarks/bert/tf_benchmark.sh`. + +The INT8 results of TensorFlow were obtained by running `benchmarks/bert/tf_int8_benchmark.sh`. + +The FP16 results of PyTorch were obtained by running `benchmarks/bert/pyt_benchmark.sh`. + +The INT8 results of PyTorch were obtained by running `benchmarks/bert/pyt_int8_benchmark.sh`. + +More benchmarks are listed in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). + +#### BERT base performances of FasterTransformer new features + +The following figure compares the performance of the different FasterTransformer features against the FasterTransformer FP16 baseline on T4. + +For large batch sizes and sequence lengths, both EFF-FT and FT-INT8-v2 bring about a 2x speedup. Using Effective FasterTransformer and INT8-v2 at the same time brings about a 3.5x speedup compared to FasterTransformer FP16 for large cases.
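+
+As a concrete illustration, the per-framework numbers in the following subsections come from the benchmark scripts listed above, and the debug variables from the Global Environment section can be combined with any example binary. The snippet below is only a sketch: the script paths and `FT_*` variables are the ones documented in this file, while the working directory (repository root, after building) is an assumption.
+
+```bash
+# BERT base benchmarks (FP16 and INT8 on PyTorch), as referenced above.
+bash benchmarks/bert/pyt_benchmark.sh
+bash benchmarks/bert/pyt_int8_benchmark.sh
+
+# Debug/profiling variants of an example binary using the Global Environment variables.
+FT_NVTX=ON ./bin/gpt_example                                # insert NVTX ranges for profiling
+FT_LOG_LEVEL=DEBUG FT_DEBUG_LEVEL=DEBUG ./bin/gpt_example   # verbose logs + synchronous kernels (slow)
+```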
+ +#### BERT base performance on TensorFlow + +The following figure compares the performance of the different FasterTransformer features against TensorFlow XLA under FP16 on T4. + +For small batch sizes and sequence lengths, FasterTransformer brings about a 3x speedup. + +For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+ +#### BERT base performance on PyTorch + +The following figure compares the performance of the different FasterTransformer features against PyTorch TorchScript under FP16 on T4. + +For small batch sizes and sequence lengths, the FasterTransformer CustomExt brings about a 4x ~ 6x speedup. + +For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+ +### Decoding and Decoder performance + +The results of TensorFlow were obtained by running `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh`. + +The results of PyTorch were obtained by running `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. + +The decoding experiments use the following parameters: + +* head_num = 8 +* size_per_head = 64 +* num_layers = 6 for both encoder and decoder +* vocabulary_size = 32001 for the TensorFlow sample codes, 31538 for the PyTorch sample codes +* memory_hidden_dim = 512 +* max sequence length = 128 + +More benchmarks are listed in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). + +#### Decoder and Decoding end-to-end translation performance on TensorFlow + +The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to TensorFlow under FP16 on T4. Here, we report the throughput of translating a whole test set, since the total number of tokens generated by each method may differ. Compared to TensorFlow, FT-Decoder provides a 1.5x ~ 3x speedup, while FT-Decoding provides a 4x ~ 18x speedup.
+ +#### Decoder and Decoding end-to-end translation performance on PyTorch + +The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to PyTorch under FP16 on T4. Here, we report the throughput of translating a whole test set, since the total number of tokens generated by each method may differ. Compared to PyTorch, FT-Decoder provides a 1.2x ~ 3x speedup, while FT-Decoding provides a 3.8x ~ 13x speedup.
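+
+To make the speedup definition above concrete: since every method translates the same test set, the speedup is the ratio of end-to-end throughputs (tokens per second), which reduces to the ratio of wall-clock times. A minimal sketch with `bc`; the token count and times below are made-up numbers, not measured values:
+
+```bash
+total_tokens=300000      # assumed size of the translated test set, in tokens
+baseline_sec=250         # assumed PyTorch decoding time for the whole set
+ft_decoding_sec=20       # assumed FT-Decoding time for the whole set
+# Throughput ratio; the token count cancels out, leaving the time ratio.
+echo "scale=2; ($total_tokens / $ft_decoding_sec) / ($total_tokens / $baseline_sec)" | bc
+# prints 12.50, i.e. a 12.5x speedup
+```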
+ +### GPT performance + +The following figure compares the performance of Megatron and FasterTransformer under FP16 on A100. + +The decoding experiments use the following parameters: + +* head_num = 96 +* size_per_head = 128 +* num_layers = 48 for the GPT-89B model, 96 for the GPT-175B model +* data_type = FP16 +* vocab_size = 51200 +* top_p = 0.9 +* tensor parallel size = 8 +* input sequence length = 512 +* output sequence length = 32
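+
+As a sanity check on the model labels, the sizes follow from the hyper-parameters above using the usual rough estimate of `12 * num_layers * hidden_size^2` weights in the transformer blocks plus the vocabulary embedding. This is only a back-of-the-envelope calculation, not an exact parameter count:
+
+```bash
+hidden=$((96 * 128))   # hidden_size = head_num * size_per_head = 12288
+for layers in 48 96; do
+    echo "num_layers=$layers: $(echo "12 * $layers * $hidden * $hidden + 51200 * $hidden" | bc) parameters"
+done
+# num_layers=48: 87602233344  (~88B,  the GPT-89B configuration)
+# num_layers=96: 174575321088 (~175B, the GPT-175B configuration)
+```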
+ +## Release notes + +### Changelog + +May 2023 +- Fix bugs of generation early stopping + +January 2023 +- Support GPT MoE +- Support FP8 for Bert and GPT (**Experimental**) +- Support DeBERTa on TensorFlow 2 and PyTorch + +Dec 2022 +- **Release the FasterTransformer 5.2** +- Support min length penalty + +Nov 2022 +- Support T5 Tensorflow 2 custom op. +- Support T5 MoE +- Support WeNet +- Support BART & mBART +- Support SwinV2 +- Initial support for w8a8 int8 mode with GPT (preview) +- Support fused mha in GPT + +Oct 2022 +- Support BLOOM + +Sep 2022 +- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt +- Support for IA3 adapting scheme in T5 + +Aug 2022 +- Support returning context tokens embeddings in GPT +- **Release the FasterTransformer 5.1** +- Support for interactive generation +- Support for attention time-limited memory +- Support mt5 and t5-v1.1 + +July 2022 +- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) + - Fix bug of T5 under bfloat16. +- Add ViT INT8 TensorRT Plugin +- Support batch sampling +- Support shared context optimization in GPT model + +June 2022 +- Support streaming generation for triton backend. +- Support OPT. +- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. + +May 2022 +- Support bfloat16 on most models. +- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. +- Support GPT-NeoX. + - epsilon value used in layernorm is now a parameter + - rotary embedding GPT-NeoX style (only GPT-J was implemented) + - load per-GPU layernorm and bias parameters + - weight conversion from EleutherAI checkpoint + +April 2022 +- **Release the FasterTransformer 5.0** + - Change the default accumulation type of all gemm to FP32. + - Support bfloat16 inference in GPT model. + - Support Nemo Megatron T5 and Megatron-LM T5 model. + - Support ViT. + +March 2022 +- Support `stop_ids` and `ban_bad_ids` in GPT-J. +- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. + +February 2022 +- Support Swin Transformer. +- Optimize the k/v cache update of beam search by in-direction buffer. +- Support runtime input for GPT-J, T5 and GPT. +- Support soft prompt in GPT and GPT-J. +- Support custom all reduce kernel. + - Limitation: + 1. Only support tensor parallel size = 8 on DGX-A100. + 2. Only support CUDA with cudaMallocAsync. + +December 2021 +- Add TensorRT plugin of T5 model. +- Change some hyper-parameters of GPT model to runtime query. +- Optimize the memory allocator under C++ code. +- Fix bug of CUB including when using CUDA 11.5 or newer version. + +November 2021 +- **Update the FasterTransformer 5.0 beta** +- Add GPT-3 INT8 weight only qauntization for batch size <= 2. +- Support multi-node multi-gpu support on T5. +- Enhance the multi-node multi-gpu supporting in GPT-3. + +August 2021 +- **Release the FasterTransformer 5.0 beta** + - Refactor the repo and codes + - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. + - Bugs fix + - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. + - Fix memory leak that occurs every forward because of reused allocator. + - Fix race condition that occurs in repetition penalty kernel. + - Enhancement + - Add random seed setting. + - Fix GEMM buffer overflow on FP16 of GPT. + - Change to invalidate finished buffer for every completion. + - Introduce stop_before for early stop. + - Support Longformer. + - Rename `layer_para` to `pipeline_para`. 
+ - Optimize the sorting of top p sampling. + - Support sparsity for Ampere GPUs on BERT. + - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. + - Support multi-node inference for GPT Triton backend. + +June 2021 +- Support XLNet + +April 2021 +- **Release the FasterTransformer 4.0** + - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. + - Support single node, multi-gpus inference for GPT model on triton. + - Add the int8 fused multi-head attention kernel for bert. + - Add the FP16 fused multi-head attention kernel of V100 for bert. + - Optimize the kernel of decoder. + - Move to independent repo. + - Eager mode PyTorch extension is deprecated. + +Dec 2020 +- **Release the FasterTransformer 3.1** + - Optimize the decoding by adding the finisehd mask to prevent useless computing. + - Support opennmt encoder. + - Remove the TensorRT plugin supporting. + - TorchScript custom op is deprecated. + +Nov 2020 +- Optimize the INT8 inference. +- Support PyTorch INT8 inference. +- Provide PyTorch INT8 quantiztion tools. +- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. +- Add unit test of SQuAD. +- Update the missed NGC checkpoints. + +Sep 2020 +- Support GPT2 +- **Release the FasterTransformer 3.0** + - Support INT8 quantization of encoder of cpp and TensorFlow op. + - Add bert-tf-quantization tool. + - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. + +Aug 2020 +- Fix the bug of trt plugin. + +June 2020 +- **Release the FasterTransformer 2.1** + - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. + - Optimize the beam search kernels. + - Add PyTorch op supporting + +May 2020 +- Fix the bug that seq_len of encoder must be larger than 3. +- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. +- Modifying the method of loading model in `translate_sample.py`. + +April 2020 +- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` +- Add DiverseSiblingsSearch for decoding. +- Add sampling into Decoding + - The implementation is in the `decoding_sampling.h` + - Add top_k sampling, top_p sampling for decoding. +- Refactor the tensorflow custom op codes. + - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` + - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` + - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` +- Fix the bugs of finalize function decoding.py. +- Fix the bug of tf DiverseSiblingSearch. +- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. +- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. +- Add dynamic batch size and dynamic sequence length features into all ops. + +March 2020 +- Add feature in FasterTransformer 2.0 + - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. +- Fix bugs of Fastertransformer 2.0 + - Fix the bug of maximum sequence length of decoder cannot be larger than 128. + - Fix the bug that decoding does not check finish or not after each step. + - Fix the bug of decoder about max_seq_len. + - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+ - Add a layer normalization layer after decoder. + - Add a normalization for inputs of decoder + +February 2020 +- **Release the FasterTransformer 2.0** + - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. + - Refine the sample codes of encoder. + - Add dynamic batch size feature into encoder op. + +July 2019 +- **Release the FasterTransformer 1.0** + - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. + +### Known issues + +- Cannot compile on tensorflow 2.10 due to undefined symbol issue. +- Undefined symbol errors when import the extension + - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. +- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. +- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/README.md b/README.md index a00e0d631..72735e507 100644 --- a/README.md +++ b/README.md @@ -1,417 +1,15 @@ -# FasterTransformer +# FasterTransformer for SaumsungCEChallenge -This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. +Check out FasterTransformer [README.md](FasterTransformerReadME.md) -## Table Of Contents +## Installation -- [FasterTransformer](#fastertransformer) - - [Table Of Contents](#table-of-contents) - - [Model overview](#model-overview) - - [Support matrix](#support-matrix) - - [Advanced](#advanced) - - [Global Environment](#global-environment) - - [Performance](#performance) - - [BERT base performance](#bert-base-performance) - - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) - - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) - - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) - - [Decoding and Decoder performance](#decoding-and-decoder-performance) - - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) - - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) - - [GPT performance](#gpt-performance) - - [Release notes](#release-notes) - - [Changelog](#changelog) - - [Known issues](#known-issues) - -## Model overview - -In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. - -FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. 
For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. - -### Support matrix - -| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | -| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | -| BERT | TensorFlow | Yes | Yes | - | - | - | - | -| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | -| BERT | Triton backend | Yes | - | - | Yes | Yes | - | -| BERT | C++ | Yes | Yes | - | - | - | Yes | -| XLNet | C++ | Yes | - | - | - | - | - | -| Encoder | TensorFlow | Yes | Yes | - | - | - | - | -| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | -| Decoder | TensorFlow | Yes | - | - | - | - | - | -| Decoder | PyTorch | Yes | - | - | - | - | - | -| Decoding | TensorFlow | Yes | - | - | - | - | - | -| Decoding | PyTorch | Yes | - | - | - | - | - | -| GPT | TensorFlow | Yes | - | - | - | - | - | -| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | -| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | -| Longformer | PyTorch | Yes | - | - | - | - | - | -| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | -| T5 | TensorFlow 2 | Yes | - | - | - | - | - | -| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | -| T5 | TensorRT | Yes | - | - | Yes | Yes | - | -| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | -| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | -| ViT | PyTorch | Yes | Yes | - | - | - | - | -| ViT | TensorRT | Yes | Yes | - | - | - | - | -| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | -| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | -| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | -| WeNet | C++ | Yes | - | - | - | - | - | -| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | -| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | - -* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. - -More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. - -## Advanced - -The following code lists the directory structure of FasterTransformer: ``` -/src/fastertransformer: source code of FasterTransformer - |--/cutlass_extensions: Implementation of cutlass gemm/kernels. - |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. - |--/layers: Implementation of layer modules, like attention layer, ffn layer. - |--/models: Implementation of different models, like BERT, GPT. - |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
- |--/tf_op: custom Tensorflow OP implementation - |--/th_op: custom PyTorch OP implementation - |--/triton_backend: custom triton backend implementation - |--/utils: Contains common cuda utils, like cublasMMWrapper, memory_utils -/examples: C++, tensorflow and pytorch interface examples - |--/cpp: C++ interface examples - |--/pytorch: PyTorch OP examples - |--/tensorflow: TensorFlow OP examples - |--/tensorrt: TensorRT examples -/docs: Documents to explain the details of implementation of different models, and show the benchmark -/benchmark: Contains the scripts to run the benchmarks of different models -/tests: Unit tests -/templates: Documents to explain how to add a new model/example into FasterTransformer repo -``` - -Note that many folders contains many sub-folders to split different models. Quantization tools are move to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. - - -### Global Environment - -FasterTransformer provides some convenient environment variables for debuging and testing. - -1. `FT_LOG_LEVEL`: This environment controls the log level of debug messae. More details are in `src/fastertransformer/utils/logger.h`. Note that the program will print lots of message when the level is lower than `DEBUG` and the program would become very slow. -2. `FT_NVTX`: If it is set to be `ON` like `FT_NVTX=ON ./bin/gpt_example`, the program will insert tha tag of nvtx to help profiling the program. -3. `FT_DEBUG_LEVEL`: If it is set to be `DEBUG`, then the program will run `cudaDeviceSynchronize()` after every kernels. Otherwise, the kernel is executued asynchronously by default. It is helpful to locate the error point during debuging. But this flag affects the performance of program significantly. So, it should be used only for debuging. - -## Performance - -Hardware settings: - -* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor -* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz - -In order to run the following benchmark, we need to install the unix computing tool "bc" by - -```bash -apt-get install bc +mkdir -p FasterTransformer/build +cd FasterTransformer/build +git submodule init && git submodule update +cmake -DSM=70 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. +make -j32 ``` -### BERT base performance - -The FP16 results of TensorFlow were obtained by running the `benchmarks/bert/tf_benchmark.sh`. - -The INT8 results of TensorFlow were obtained by running the `benchmarks/bert/tf_int8_benchmark.sh`. - -The FP16 results of PyTorch were obtained by running the `benchmarks/bert/pyt_benchmark.sh`. - -The INT8 results of PyTorch were obtained by running the `benchmarks/bert/pyt_int8_benchmark.sh`. - -More benchmarks are put in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). - -#### BERT base performances of FasterTransformer new features - -The following figure compares the performances of different features of FasterTransformer and FasterTransformer under FP16 on T4. - -For large batch size and sequence length, both EFF-FT and FT-INT8-v2 bring about 2x speedup. Using Effective FasterTransformer and int8v2 at the same time can bring about 3.5x speedup compared to FasterTransformer FP16 for large case. - -
- -#### BERT base performance on TensorFlow - -The following figure compares the performances of different features of FasterTransformer and TensorFlow XLA under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer can bring about 3x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -#### BERT base performance on PyTorch - -The following figure compares the performances of different features of FasterTransformer and PyTorch TorchScript under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer CustomExt can bring about 4x ~ 6x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -### Decoding and Decoder performance - -The results of TensorFlow were obtained by running the `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh` - -The results of PyTorch were obtained by running the `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 8 -* size_per_head = 64 -* num_layers = 6 for both encoder and decoder -* vocabulary_size = 32001 for TensorFlow sample codes, 31538 for PyTorch sample codes -* memory_hidden_dim = 512 -* max sequenc elength = 128 - -More benchmarks are put in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). - -#### Decoder and Decoding end-to-end translation performance on TensorFlow - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to TensorFlow under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to TensorFlow, FT-Decoder provides 1.5x ~ 3x speedup; while FT-Decoding provides 4x ~ 18x speedup. - -
- -#### Decoder and Decoding end-to-end translation performance on PyTorch - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to PyTorch under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to PyTorch, FT-Decoder provides 1.2x ~ 3x speedup; while FT-Decoding provides 3.8x ~ 13x speedup. - -
- -### GPT performance - -The following figure compares the performances of Megatron and FasterTransformer under FP16 on A100. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 96 -* size_per_head = 128 -* num_layers = 48 for GPT-89B model, 96 for GPT-175B model -* data_type = FP16 -* vocab_size = 51200 -* top_p = 0.9 -* tensor parallel size = 8 -* input sequence length = 512 -* output sequence length = 32 - -
- -## Release notes - -### Changelog - -May 2023 -- Fix bugs of generation early stopping - -January 2023 -- Support GPT MoE -- Support FP8 for Bert and GPT (**Experimental**) -- Support DeBERTa on TensorFlow 2 and PyTorch - -Dec 2022 -- **Release the FasterTransformer 5.2** -- Support min length penalty - -Nov 2022 -- Support T5 Tensorflow 2 custom op. -- Support T5 MoE -- Support WeNet -- Support BART & mBART -- Support SwinV2 -- Initial support for w8a8 int8 mode with GPT (preview) -- Support fused mha in GPT - -Oct 2022 -- Support BLOOM - -Sep 2022 -- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt -- Support for IA3 adapting scheme in T5 - -Aug 2022 -- Support returning context tokens embeddings in GPT -- **Release the FasterTransformer 5.1** -- Support for interactive generation -- Support for attention time-limited memory -- Support mt5 and t5-v1.1 - -July 2022 -- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) - - Fix bug of T5 under bfloat16. -- Add ViT INT8 TensorRT Plugin -- Support batch sampling -- Support shared context optimization in GPT model - -June 2022 -- Support streaming generation for triton backend. -- Support OPT. -- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. - -May 2022 -- Support bfloat16 on most models. -- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. -- Support GPT-NeoX. - - epsilon value used in layernorm is now a parameter - - rotary embedding GPT-NeoX style (only GPT-J was implemented) - - load per-GPU layernorm and bias parameters - - weight conversion from EleutherAI checkpoint - -April 2022 -- **Release the FasterTransformer 5.0** - - Change the default accumulation type of all gemm to FP32. - - Support bfloat16 inference in GPT model. - - Support Nemo Megatron T5 and Megatron-LM T5 model. - - Support ViT. - -March 2022 -- Support `stop_ids` and `ban_bad_ids` in GPT-J. -- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. - -February 2022 -- Support Swin Transformer. -- Optimize the k/v cache update of beam search by in-direction buffer. -- Support runtime input for GPT-J, T5 and GPT. -- Support soft prompt in GPT and GPT-J. -- Support custom all reduce kernel. - - Limitation: - 1. Only support tensor parallel size = 8 on DGX-A100. - 2. Only support CUDA with cudaMallocAsync. - -December 2021 -- Add TensorRT plugin of T5 model. -- Change some hyper-parameters of GPT model to runtime query. -- Optimize the memory allocator under C++ code. -- Fix bug of CUB including when using CUDA 11.5 or newer version. - -November 2021 -- **Update the FasterTransformer 5.0 beta** -- Add GPT-3 INT8 weight only qauntization for batch size <= 2. -- Support multi-node multi-gpu support on T5. -- Enhance the multi-node multi-gpu supporting in GPT-3. - -August 2021 -- **Release the FasterTransformer 5.0 beta** - - Refactor the repo and codes - - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. - - Bugs fix - - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. - - Fix memory leak that occurs every forward because of reused allocator. - - Fix race condition that occurs in repetition penalty kernel. - - Enhancement - - Add random seed setting. - - Fix GEMM buffer overflow on FP16 of GPT. - - Change to invalidate finished buffer for every completion. - - Introduce stop_before for early stop. - - Support Longformer. - - Rename `layer_para` to `pipeline_para`. 
- - Optimize the sorting of top p sampling. - - Support sparsity for Ampere GPUs on BERT. - - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. - - Support multi-node inference for GPT Triton backend. - -June 2021 -- Support XLNet - -April 2021 -- **Release the FasterTransformer 4.0** - - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. - - Support single node, multi-gpus inference for GPT model on triton. - - Add the int8 fused multi-head attention kernel for bert. - - Add the FP16 fused multi-head attention kernel of V100 for bert. - - Optimize the kernel of decoder. - - Move to independent repo. - - Eager mode PyTorch extension is deprecated. - -Dec 2020 -- **Release the FasterTransformer 3.1** - - Optimize the decoding by adding the finisehd mask to prevent useless computing. - - Support opennmt encoder. - - Remove the TensorRT plugin supporting. - - TorchScript custom op is deprecated. - -Nov 2020 -- Optimize the INT8 inference. -- Support PyTorch INT8 inference. -- Provide PyTorch INT8 quantiztion tools. -- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. -- Add unit test of SQuAD. -- Update the missed NGC checkpoints. - -Sep 2020 -- Support GPT2 -- **Release the FasterTransformer 3.0** - - Support INT8 quantization of encoder of cpp and TensorFlow op. - - Add bert-tf-quantization tool. - - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. - -Aug 2020 -- Fix the bug of trt plugin. - -June 2020 -- **Release the FasterTransformer 2.1** - - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. - - Optimize the beam search kernels. - - Add PyTorch op supporting - -May 2020 -- Fix the bug that seq_len of encoder must be larger than 3. -- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. -- Modifying the method of loading model in `translate_sample.py`. - -April 2020 -- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` -- Add DiverseSiblingsSearch for decoding. -- Add sampling into Decoding - - The implementation is in the `decoding_sampling.h` - - Add top_k sampling, top_p sampling for decoding. -- Refactor the tensorflow custom op codes. - - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` - - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` - - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` -- Fix the bugs of finalize function decoding.py. -- Fix the bug of tf DiverseSiblingSearch. -- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. -- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. -- Add dynamic batch size and dynamic sequence length features into all ops. - -March 2020 -- Add feature in FasterTransformer 2.0 - - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. -- Fix bugs of Fastertransformer 2.0 - - Fix the bug of maximum sequence length of decoder cannot be larger than 128. - - Fix the bug that decoding does not check finish or not after each step. - - Fix the bug of decoder about max_seq_len. - - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
- - Add a layer normalization layer after decoder. - - Add a normalization for inputs of decoder - -February 2020 -- **Release the FasterTransformer 2.0** - - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. - - Refine the sample codes of encoder. - - Add dynamic batch size feature into encoder op. - -July 2019 -- **Release the FasterTransformer 1.0** - - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. - -### Known issues - -- Cannot compile on tensorflow 2.10 due to undefined symbol issue. -- Undefined symbol errors when import the extension - - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. -- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. -- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..800dfdd7f 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) add_subdirectory(multi_gpu_gpt) +#add_subdirectory(llama) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..19fb6e7fc --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +add_library(llama_example_utils STATIC llama_example_utils.cc) +target_link_libraries(llama_example_utils PUBLIC -lcublas -lcublasLt -lcudart + nvtx_utils mpi_utils nccl_utils) + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + LLaMA mpi_utils nccl_utils nvtx_utils + llama_example_utils word_list) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file mode 100644 index 000000000..3df66269f --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,21 @@ +[ft_instance_hyperparameter] +model_name=llama_33B +model_dir=../models/llama +data_type=fp16 +pipeline_para_size=4 + + +[request] +request_batch_size=32 +start_pos=2 + +[llama_33B] +head_num=52 +size_per_head=128 +vocab_size=32000 +decoder_layers=60 +rotary_embedding=128 +multiple_of=256 +max_seq_len=1024 +padding_id=0 +random_seed=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc new file mode 100644 index 000000000..3065d4873 --- /dev/null +++ b/examples/cpp/llama/llama_example.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/llama/llama_example_utils.h" +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const size_t max_seq_len = reader.GetInteger(model_name, "max_seq_len"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); + + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const int padding_id = reader.GetInteger(model_name, "padding_id"); + int start_pos = reader.GetInteger("request", "start_pos", 0); + unsigned long long random_seed = reader.GetInteger("request", "random_seed", 0); + + FT_CHECK(decoder_layers % pipeline_para_size == 0); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (pipeline_para_size != world_size) { + printf("[ERROR] pipeline_para_size should equal to world_size \n"); + exit(-1); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != (int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal 
to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + padding_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. + deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + + const int total_output_len = max_input_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + fastertransformer::LLaMAWeight llama_weights( + hidden_units, inter_size, vocab_size, decoder_layers, pipeline_para.world_size_, pipeline_para.rank_); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + llama_weights.loadModel(model_dir); + + if (world_size > 1) { + mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = + getAttentionType(size_per_head, + getSMVersion(), + !((std::getenv("SHONG_PADDING") != nullptr) + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), // true, // remove_padding + 0, // llama supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + switch (attention_type) { + case AttentionType::UNFUSED_MHA: + std::cout << "UNFUSED_MHA\n"; + break; + case AttentionType::UNFUSED_PADDED_MHA: + std::cout << "UNFUSED_PADDED_MHA\n"; + break; + case AttentionType::FUSED_MHA: + std::cout << "FUSED_MHA\n"; + break; + case AttentionType::FUSED_PADDED_MHA: + std::cout << "FUSED_PADDED_MHA\n"; + break; + } + + LLaMA llama = LLaMA(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + random_seed, + max_seq_len, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop, + attention_type); + + float* d_output_logits; + deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, 
std::vector{request_batch_size}, d_input_lengths}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &start_pos}}}; + + std::unordered_map output_tensors = std::unordered_map{ + {"output_logits", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, + d_output_logits}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + /* + if (rank == world_size - 1) { + float* out = (float*)malloc(sizeof(float) * request_batch_size * total_output_len * vocab_size); + cudaMemcpy(out, + d_output_logits, + sizeof(float) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost + ); + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < total_output_len; ++s) { + std::cout << "["; + for (int v = vocab_size - 8; v < vocab_size; ++v) { + std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + free(out); + } + */ + + // test time + cudaProfilerStart(); + struct timeval start, end; + cudaDeviceSynchronize(); + mpi::barrier(); + + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + // warm up + ite = 10; + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_logits != nullptr) { + deviceFree(d_output_logits); + } + + return; +} diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc new file mode 100644 index 000000000..d6db80856 --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "examples/cpp/llama/llama_example_utils.h" + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int padding_id, + const int beam_width, + std::string file_name) +{ + std::vector> tmp_start_ids; + std::vector tmp_start_lengths; + + std::ifstream start_id_file(file_name, std::ios::in); + int line_num = 0; + if (start_id_file.is_open()) { + std::string line; + while (std::getline(start_id_file, line)) { + std::stringstream lineStream(line); + std::string vals; + int i1 = 0; + std::vector tmp_vec; + while (std::getline(lineStream, vals, ',')) { + tmp_vec.push_back(std::stoi(vals)); + i1++; + } + tmp_start_ids.push_back(tmp_vec); + tmp_start_lengths.push_back(i1); + line_num++; + } + if (batch_size == 0) { + batch_size = line_num; + } + } + else { + printf("[WARNING] Cannot open the file '%s'. \n", file_name.c_str()); + max_input_len = 0; + return 0; + } + + max_input_len = tmp_start_lengths.data()[0]; + for (uint i = 1; i < (uint)tmp_start_lengths.size(); i++) { + max_input_len = max_input_len > tmp_start_lengths.data()[i] ? max_input_len : tmp_start_lengths.data()[i]; + } + + while ((int)tmp_start_lengths.size() < batch_size) { + std::vector padding_ids; + for (int i = 0; i < max_input_len; i++) { + padding_ids.push_back(padding_id); + } + tmp_start_ids.push_back(padding_ids); + tmp_start_lengths.push_back(max_input_len); + } + + // Add padding + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { + tmp_start_ids[i].push_back(padding_id); + } + } + + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int b = 0; b < beam_width; b++) { + for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) { + v_start_ids->push_back(tmp_start_ids[i][j]); + } + v_start_lengths->push_back(tmp_start_lengths[i]); + } + } + return batch_size; +} + +} // namespace fastertransformer diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h new file mode 100644 index 000000000..1e5d0b9ab --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name); + +} // namespace fastertransformer diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..58bc4b4f6 --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,32 @@ +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 432, 809, 295, 719, 27372, 29889, 7605, 263, 286, 789, 5941, 292, 10823, 363, 596, 3632, 331, 1943, 432, 809, 295, 719, 27372, 9522, 412, 29889, 360, 728, 29559, 411, 1395, 559, 29899, 7582, 1259, 4426, 508, 367, 1304, 29892, 541, 14383, 270, 728, 29559, 411, 4023, 845, 9418, 29899, 29890, 5761, 616, 4426, 408, 445, 508, 17820, 278, 8341, 1283, 432, 809, 295, 719, 28001 , 29889 +1, 3201, 955, 29901, 1128, 304, 679, 304, 413, 4442, 340, 29889, 315, 905, 263, 1513, 16286, 304, 413, 4442, 2165, 4799, 637, 313, 29926, 6547, 3300, 352, 13607, 6121, 4799, 637, 29897, 515, 29129, 1450, 470, 4655, 14721, 273, 14368, 29892, 1316, 408, 286, 348, 436, 29892, 301, 898, 265, 29892, 1226, 23559, 29892, 10395, 2429, 29892, 282, 1431, 434, 29892, 610, 275, 470, 7655, 1915, 29889, 1704, 26536, 3160, 3287, 1248, 728, 4799, 9012 +1, 8778, 322, 19906, 29901, 1128, 304, 12566, 330, 2390, 267, 515, 17564, 29879, 29889, 26428, 596, 2646, 412, 325, 1475, 411, 2691, 27716, 7787, 1259, 304, 12566, 278, 330, 2390, 267, 29889, 450, 27716, 881, 367, 1546, 29871, 29900, 29889, 29945, 304, 29871, 29900, 29889, 29947, 3533, 17528, 690, 313, 29900, 29889, 29900, 29906, 29900, 304, 29871, 29900, 29889, 29900, 29941, 29896, 297, 29897, 304, 12566, 278, 330, 2390, 267, 515, 285, 3687, 29892, 25550 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6548, 5520, 321, 29891, 295, 1161, 267, 322, 2989, 261, 321, 29891, 774, 798, 18180, 29889, 15154, 1207, 786, 4646, 368, 29889, 341, 6151, 2518, 322, 321, 29891, 774, 798, 9127, 29892, 2175, 373, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 975, 11147, 674, 18658, 1438, 15409, 2578, 25414, 322, 674, 5557, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 515, 15678, 636 +1, 26040, 29901, 1128, 304, 1207, 3632, 331, 1943, 6635, 1634, 514, 296, 29889, 3462, 278, 18853, 17182, 304, 263, 805, 764, 18046, 280, 29889, 1152, 278, 1634, 514, 296, 29892, 366, 29915, 645, 817, 263, 29871, 29906, 29899, 21543, 313, 29945, 29929, 286, 29880, 29897, 12917, 805, 764, 18046, 280, 29889, 317, 802, 29872, 911, 29871, 29906, 4441, 567, 310, 454, 3712, 18853, 17182, 1919, 29871, 29906, 4441, 567, 310, 8775, 24841, 18853, 17182, 29892 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 7375, 322, 269, 473, 22300, 29889, 323, 2209, 278, 282, 548, 658, 262, 411, 278, 6501, 577, 29891, 12507, 346, 29889, 3462, 29871, 29945, 29871, 1309, 778, 313, 29896, 29946, 29906, 330, 29897, 310, 10814, 6393, 282, 548, 658, 262, 393, 29915, 29879, 1063, 5700, 297, 29871, 30515, 29899, 22466, 313, 29953, 29899, 4317, 29897, 12003, 10076, 567, 322, 29871, 29906, 734, 294, 1129, 787, 313, 29896 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 19417, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 373, 18230, 388, 29889, 29301, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 10754, 2909, 322, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 1856, 3268, 366, 508, 1284, 29889, 6280, 4447, 675, 7535, 411, 3785, 11949, 29892, 278, 664, 310, 
1532, 2998, 2874, 414, 29892, 12713, 22848, 29892, 1539, 1338, 29892, 25702 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 260, 4227, 29880, 1600, 384, 7901, 10412, 29889, 14542, 852, 263, 19875, 29891, 470, 1045, 3594, 260, 4227, 29880, 1600, 384, 363, 263, 901, 3209, 950, 1106, 29889, 319, 12003, 29892, 1302, 1537, 260, 4227, 29880, 1600, 384, 7901, 1008, 322, 263, 5101, 310, 1320, 1253, 287, 1444, 550, 338, 278, 4922, 982, 304, 7952, 14294, 373, 263, 11220, 4723, 355, 29889, 7357, 523, 2814, 470, 260 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 17545, 901, 330, 2390, 267, 29889, 3462, 330, 2390, 267, 304, 596, 4497, 328, 29889, 319, 5972, 322, 4780, 982, 304, 7910, 278, 5253, 310, 330, 2390, 267, 297, 596, 14218, 652, 300, 338, 304, 28189, 263, 2846, 8870, 1490, 330, 2390, 267, 373, 2246, 310, 263, 301, 3322, 29899, 272, 270, 2559, 814, 603, 4497, 328, 29889, 450, 14225, 21054, 272, 322, 7990, 18459, 310, 278, 330 +1, 15202, 29901, 1128, 304, 11039, 403, 18655, 1849, 964, 263, 9045, 29891, 26044, 29889, 8561, 263, 18655, 519, 885, 2572, 569, 363, 26044, 29889, 319, 18655, 519, 885, 2572, 569, 338, 263, 2560, 270, 728, 1754, 491, 872, 329, 29948, 292, 18655, 1849, 297, 263, 4091, 340, 7243, 322, 769, 4417, 367, 2579, 29808, 975, 963, 304, 4808, 278, 270, 728, 4208, 29889, 14893, 491, 4417, 738, 18655, 1849, 366, 763, 29892, 3704, 373, 1080 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 3632, 331, 1943, 9045, 29891, 3700, 471, 29882, 29889, 422, 26062, 599, 310, 278, 2348, 1127, 10070, 29889, 512, 263, 18350, 29899, 29879, 1891, 12580, 29880, 29892, 6837, 4208, 29871, 30226, 18002, 313, 29946, 29945, 330, 29897, 310, 29081, 288, 1446, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897, 10849, 454, 3712, 3623, 625, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 18655, 13956, 286, 1878, 8345, 8310, 29891, 29889, 5701, 1082, 278, 373, 291, 322, 286, 1878, 18901, 29889, 940, 271, 29871, 30226, 18002, 313, 29896, 29906, 29900, 286, 29880, 29897, 310, 4805, 29899, 2405, 5359, 288, 9258, 17182, 297, 263, 2919, 12507, 346, 8357, 975, 18350, 29899, 9812, 12871, 29889, 9038, 278, 17182, 528, 6727, 414, 29892, 788, 29871, 30226, 310, 263, 2319, 373, 291, 393 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6958, 10992, 2963, 29889, 15484, 263, 1246, 363, 278, 19075, 982, 304, 6159, 10992, 2963, 29889, 960, 366, 723, 763, 304, 505, 263, 2022, 29899, 517, 29899, 10532, 14983, 29892, 270, 616, 278, 2498, 297, 6578, 2722, 1196, 472, 29871, 29896, 29899, 29947, 29900, 29900, 29899, 29953, 29953, 29947, 29899, 29953, 29955, 29953, 29945, 29889, 2688, 29915, 276, 1722, 7398, 388, 304, 1424, 22394, 29871, 29955, 263 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 8267, 596, 321, 29891, 774, 5727, 411, 263, 8006, 272, 29889, 349, 27574, 263, 8006, 272, 1754, 10816, 363, 321, 29891, 774, 5727, 29889, 319, 3918, 8006, 272, 674, 451, 2367, 366, 13173, 3347, 29879, 29892, 322, 508, 367, 18215, 304, 671, 2978, 596, 5076, 29889, 8669, 20590, 385, 321, 29891, 774, 798, 8006, 272, 29892, 5069, 2319, 12995, 311, 674, 2367, 366, 278, 3347, 29879, 366, 13521 +1, 15202, 29901, 1128, 304, 5040, 23023, 1848, 321, 1218, 29889, 360, 8349, 7268, 403, 1546, 9128, 18757, 261, 322, 23023, 1848, 18757, 261, 29889, 1763, 18720, 278, 9946, 310, 596, 23023, 1848, 321, 1218, 29892, 372, 1122, 367, 5407, 304, 937, 2274, 746, 366, 526, 11223, 4824, 1711, 9074, 14793, 322, 746, 366, 526, 11223, 953, 8194, 635, 9074, 14793, 
29889, 26991, 29892, 23023, 1848, 18757, 261, 5304, 373, 11584, 322, 23880, 5065, 5362, 29889 +1, 4231, 749, 322, 15197, 29901, 1128, 304, 289, 5790, 1044, 4856, 29889, 7519, 29883, 403, 7535, 1048, 278, 289, 5790, 1044, 29889, 450, 289, 5790, 1044, 29892, 884, 2998, 408, 278, 23729, 1458, 19119, 9045, 1044, 313, 1579, 272, 1458, 1002, 1082, 16385, 29871, 29941, 29929, 29946, 29892, 760, 474, 29897, 338, 263, 4307, 393, 471, 4502, 304, 9801, 322, 1072, 5987, 11176, 14703, 19119, 9045, 14502, 5786, 363, 1906, 23164, 515, 263, 19119, 4486 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 5613, 15774, 23895, 2017, 29889, 4007, 6967, 278, 2348, 1127, 10070, 29889, 1152, 445, 9522, 412, 29892, 366, 674, 817, 278, 1494, 4452, 584, 29871, 29896, 2894, 293, 274, 2559, 314, 265, 12070, 1919, 29871, 29906, 29945, 2894, 293, 5881, 314, 290, 2532, 29879, 1919, 29871, 29896, 29945, 2894, 293, 17184, 1960, 1919, 29871, 29896, 2894, 293, 1109, 2911, 17796, 1919, 29871, 29896, 10849, 2894, 293 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 471, 29882, 4105, 4841, 29889, 14542, 852, 278, 1492, 528, 314, 1129, 29877, 322, 4195, 261, 29889, 5806, 738, 528, 314, 1129, 29877, 470, 4195, 261, 674, 664, 29892, 372, 338, 2253, 304, 671, 2730, 391, 332, 5281, 528, 314, 1129, 359, 322, 4195, 414, 29892, 7148, 565, 596, 8716, 29886, 338, 15589, 322, 372, 23766, 29889, 3834, 9316, 1316, 408, 1183, 29874, 2730, 391, 545, 263, 1341 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 263, 19408, 413, 3780, 12343, 7539, 282, 5863, 29889, 10306, 278, 11994, 363, 3907, 413, 295, 1188, 29887, 29915, 29879, 19408, 413, 3780, 29886, 583, 2578, 1446, 2441, 29889, 1670, 674, 3117, 367, 1048, 4203, 263, 9853, 310, 19408, 413, 3780, 29886, 583, 7539, 2175, 29889, 313, 697, 310, 278, 2625, 23633, 310, 1641, 278, 7984, 29892, 338, 366, 679, 304, 17545, 738, 29915, 454, 29888 +1, 349, 1691, 322, 24980, 1338, 29901, 1128, 304, 260, 4003, 8343, 263, 2653, 23717, 29889, 402, 1624, 596, 28075, 29889, 887, 674, 817, 263, 29871, 29896, 29906, 21759, 269, 4316, 19144, 29892, 263, 4964, 14051, 495, 8343, 292, 260, 4003, 29892, 322, 263, 29871, 29896, 29953, 29899, 22466, 318, 276, 386, 1705, 274, 493, 1308, 411, 263, 24235, 310, 29871, 29945, 285, 4615, 313, 1454, 2319, 26361, 29897, 322, 29871, 29947, 285, 4615, 313, 1454 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 2867, 297, 2373, 296, 454, 1624, 17394, 267, 29889, 5373, 29891, 17394, 267, 393, 6216, 1532, 29889, 3080, 326, 675, 278, 817, 363, 16116, 292, 470, 16679, 297, 491, 2805, 2373, 296, 454, 1624, 17394, 267, 393, 526, 2307, 263, 1781, 6216, 363, 366, 29889, 4001, 2373, 296, 454, 1624, 338, 380, 2593, 322, 29395, 990, 4357, 29892, 366, 29915, 276, 451, 2675, 304, 367, 2221, 304, 1735 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 9563, 470, 2329, 263, 528, 10511, 1283, 321, 29891, 774, 798, 29889, 14542, 852, 385, 321, 29891, 774, 798, 282, 3977, 309, 322, 4764, 672, 393, 338, 2788, 304, 596, 5613, 2927, 292, 29889, 960, 366, 1603, 505, 697, 310, 596, 321, 29891, 774, 5727, 29892, 445, 1795, 367, 263, 2217, 6775, 489, 5143, 1993, 278, 282, 3977, 309, 304, 278, 528, 1943, 310, 596, 321, 29891, 774 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 540, 5031, 23448, 263, 29889, 19530, 29876, 278, 25828, 4835, 29889, 940, 5031, 23448, 263, 756, 263, 1353, 310, 25828, 4835, 393, 12234, 2615, 1546, 1023, 322, 4832, 11405, 515, 278, 2635, 310, 14060, 545, 29889, 3834, 310, 1438, 25828, 4835, 526, 10035, 29892, 763, 263, 1238, 369, 29892, 1550, 4045, 
29892, 763, 432, 585, 299, 625, 29892, 526, 2649, 29873, 744, 18906, 310, 540 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 4529, 596, 923, 1416, 29890, 2873, 29889, 23868, 278, 1492, 11955, 29889, 887, 674, 817, 472, 3203, 1023, 1422, 528, 3076, 310, 1207, 786, 29901, 697, 393, 338, 16951, 6501, 261, 1135, 596, 5613, 19309, 16225, 363, 278, 528, 23626, 322, 697, 393, 338, 925, 263, 2217, 301, 14643, 1135, 596, 19309, 363, 278, 12141, 29879, 29889, 28277, 373, 596, 19309, 16225, 322, 24583, 29892, 1438, 508, 367 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 671, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 29889, 317, 802, 29872, 911, 263, 12616, 29899, 29879, 1891, 8828, 4757, 310, 528, 314, 1129, 29877, 304, 596, 5112, 29885, 29889, 319, 2217, 2586, 310, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 5771, 263, 1472, 982, 29889, 2860, 7990, 1259, 596, 11315, 297, 278, 1510, 261, 408, 366, 12891, 723, 29892, 269, 802, 29872, 911, 1048, 263 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 1207, 521, 332, 307, 26163, 5036, 12580, 3137, 29889, 4721, 354, 271, 278, 288, 854, 304, 29871, 29946, 29945, 29900, 6719, 285, 21446, 6884, 470, 29871, 29906, 29906, 29945, 6719, 6432, 1039, 375, 636, 4122, 559, 263, 286, 3096, 262, 260, 764, 491, 285, 492, 3262, 372, 373, 967, 2625, 29889, 8669, 310, 805, 764, 292, 1661, 29899, 303, 860, 1395, 5832, 805, 764, 297, 278, 4251, 310 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6483, 285, 719, 541, 725, 329, 10674, 1161, 29889, 14542, 852, 385, 17182, 29889, 1932, 23906, 385, 17182, 304, 6483, 285, 719, 596, 541, 725, 329, 10674, 1161, 297, 29892, 372, 29915, 29879, 4100, 304, 5839, 697, 393, 756, 263, 6133, 25158, 1298, 1135, 278, 7984, 292, 10430, 29889, 1152, 1342, 29892, 565, 366, 505, 263, 9687, 393, 4225, 304, 367, 7984, 287, 472, 29871, 29941, 29945 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 385, 288, 9258, 17182, 2730, 391, 332, 3950, 29889, 349, 27574, 953, 7273, 9215, 281, 1165, 515, 263, 3240, 737, 261, 393, 4266, 7093, 297, 28075, 363, 3907, 3632, 331, 1943, 6776, 2527, 1199, 29889, 1670, 526, 1784, 5376, 414, 393, 508, 367, 1476, 7395, 1058, 19417, 953, 7273, 9215, 281, 1165, 636, 5373, 29891, 777, 18853, 288, 2719, 393, 366, 723, 763, 304, 671, 297, 596 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 14919, 21549, 29889, 1260, 8332, 403, 14919, 21549, 29899, 513, 1682, 292, 9687, 322, 29914, 272, 13748, 515, 596, 652, 300, 29889, 739, 10083, 2560, 29892, 541, 6480, 825, 366, 2348, 342, 14218, 508, 505, 263, 12176, 10879, 373, 596, 14919, 21549, 11174, 29889, 960, 366, 8369, 7535, 11223, 24937, 29892, 7243, 18219, 29892, 470, 851, 11517, 1432, 2462, 29892, 3814, 304, 2334, 472, 3203 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 3708, 552, 17441, 303, 860, 29889, 14542, 852, 12528, 301, 309, 562, 470, 22181, 1581, 17441, 303, 860, 363, 6534, 19309, 260, 2873, 29889, 7419, 363, 301, 14643, 29899, 2780, 287, 3708, 552, 17441, 303, 7358, 411, 7254, 22332, 2873, 29892, 1316, 408, 540, 1624, 470, 3805, 275, 528, 3076, 29892, 304, 1035, 296, 27240, 278, 7254, 22332, 2873, 297, 596, 15509, 19309, 29889, 4525, 674, 19595 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 596, 269, 484, 21079, 1106, 716, 1449, 29889, 399, 1161, 10508, 269, 484, 21079, 297, 278, 471, 2790, 4933, 29889, 960, 366, 505, 777, 26616, 10508, 269, 484, 21079, 29892, 366, 508, 5948, 679, 963, 5941, 491, 17452, 963, 297, 278, 471, 2790, 4933, 29892, 925, 408, 366, 723, 738, 916, 26616, 7171, 358, 29889, 887, 1122, 884, 367, 2221, 
304, 471, 29882, 777, 1661, 29899, 15257 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6755, 263, 18870, 262, 398, 9228, 29889, 14542, 852, 263, 9228, 411, 1880, 3708, 537, 29889, 1094, 411, 599, 758, 8802, 1539, 1338, 29892, 18870, 262, 398, 1818, 367, 394, 2376, 287, 411, 916, 1539, 1338, 297, 1797, 304, 6176, 278, 2898, 2264, 3734, 363, 432, 809, 295, 719, 29889, 739, 338, 4049, 394, 2376, 287, 411, 1661, 29899, 1457, 8802, 1539, 1338, 763, 1302, 2496, 470, 274 diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 diff --git a/src/fastertransformer/kernels/CMakeLists.txt b/src/fastertransformer/kernels/CMakeLists.txt index fd2a1b494..c5cc14c8e 100644 --- a/src/fastertransformer/kernels/CMakeLists.txt +++ b/src/fastertransformer/kernels/CMakeLists.txt @@ -233,3 +233,7 @@ add_library(moe_kernels STATIC moe_kernels.cu) set_property(TARGET moe_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET moe_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(moe_kernels PRIVATE moe_gemm_kernels) + +add_library(llama_kernels STATIC llama_kernels.cu) +set_property(TARGET llama_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET llama_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index a57161c85..8179c3368 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding +__global__ void start_id_embedding_lookups_kernel(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int word_index_row = word_index / length; // batch_id + const int word_index_col = word_index % length; + const int real_word_index = word_index_row * length + word_index_col; + const int col_index = index % hidden_units; + const int input_id = input_ids == nullptr ? 
real_word_index : input_ids[real_word_index]; + + from_tensor[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(batch_size * length, 65536)); + dim3 block(min(hidden_units, 512)); + start_id_embedding_lookups_kernel + <<>>(from_tensor, embedding_table, input_ids, length, batch_size, hidden_units); +} + +template void invokeInputIdsEmbeddingLookup(float* from_tensor, + const float* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +template void invokeInputIdsEmbeddingLookup(half* from_tensor, + const half* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + +#ifdef ENABLE_BF16 +template void invokeInputIdsEmbeddingLookup(__nv_bfloat16* from_tensor, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +#endif template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, @@ -203,27 +266,89 @@ template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* template __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLookupPosEncodingSoftPromptParam param) { - // 1. Copy the input ids to output ids and transpose output ids to [seq_len, batch_size, beam_width]. - // 2. Embedding lookup by input ids and concat with soft prompt. The axis of concatenation is on axis of seq_len. - - // Assume batch size is 2 and prompts are [[t1, t2], [t3], [t4, t5]], input_ids are [[s1, s2], [s3], [s4]] - // then the order of output_ids is - // [ [?, ?, s1, s2] - // [?, s3, padding, padding] - // [?, ?, s4, padding] ] - // and the order of embedding is - // [ [t1, t2, s1, s2] - // [t3, s3, padding, padding] - // [t4, t5, s4, padding] ] - // where "?" means undefined values and we should attach it. + // 1. Copy the + // input ids to + // output ids + // and + // transpose + // output ids + // to [seq_len, + // batch_size, + // beam_width]. + // 2. Embedding + // lookup by + // input ids + // and concat + // with soft + // prompt. The + // axis of + // concatenation + // is on axis + // of seq_len. + + // Assume batch + // size is 2 + // and prompts + // are [[t1, + // t2], [t3], + // [t4, t5]], + // input_ids + // are [[s1, + // s2], [s3], + // [s4]] then + // the order of + // output_ids + // is [ [?, ?, + // s1, s2] + // [?, s3, + // padding, + // padding] + // [?, ?, s4, + // padding] ] + // and the + // order of + // embedding is + // [ [t1, t2, + // s1, s2] + // [t3, s3, + // padding, + // padding] + // [t4, t5, + // s4, + // padding] ] + // where "?" + // means + // undefined + // values and + // we should + // attach it. for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < param.batch_size * param.beam_width * (param.max_prefix_soft_prompt_length + param.max_input_length) * param.hidden_units; index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, beam, max_input_length]) to - // output_ids [length, batch, beam]. - // ouptut_ids need to add padding in the beginning for soft prompting. 
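// Illustrative sketch, not part of the patch: a CPU reference for the flat-index
// decomposition used by start_id_embedding_lookups_kernel / invokeInputIdsEmbeddingLookup
// above. Shapes assumed here: input_ids is [batch * length], embedding_table is
// [vocab, hidden], from_tensor is [batch * length, hidden]; when input_ids is
// nullptr the position itself is used as the word id, as in the kernel.
#include <cstddef>
#include <vector>

void input_ids_embedding_lookup_cpu(std::vector<float>&       from_tensor,
                                    const std::vector<float>& embedding_table,
                                    const int*                input_ids,
                                    int                       batch_size,
                                    int                       length,
                                    int                       hidden_units)
{
    from_tensor.resize((size_t)batch_size * length * hidden_units);
    for (size_t index = 0; index < from_tensor.size(); index++) {
        const int word_index = (int)(index / hidden_units);  // token position in [0, batch * length)
        const int col_index  = (int)(index % hidden_units);  // hidden dimension
        const int input_id   = (input_ids == nullptr) ? word_index : input_ids[word_index];
        from_tensor[index]   = embedding_table[(size_t)input_id * hidden_units + col_index];
    }
}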
+ // transpose + // the + // input_ids + // [batch, + // length] + // (part of + // [batch, + // beam, + // max_input_length]) + // to + // output_ids + // [length, + // batch, + // beam]. + // ouptut_ids + // need to + // add + // padding + // in the + // beginning + // for soft + // prompting. if (index < param.batch_size * param.beam_width * param.max_input_length) { int tmp_index = index; @@ -239,21 +364,43 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo } } - // embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab, - // hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length + - // max_prefix_soft_prompt_length, hidden] - int tmp_index = index; - const int hidden_id = tmp_index % param.hidden_units; - tmp_index = (tmp_index - hidden_id) / param.hidden_units; - const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); - tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); - const int beam_id = tmp_index % param.beam_width; - tmp_index = (tmp_index - beam_id) / param.beam_width; - const int batch_id = tmp_index % param.batch_size; + // embedding + // lookup + // from + // word ids + // [batch, + // beam, + // length] + // (part of + // [batch, + // beam, + // max_input_length]), + // [vocab, + // hidden] + // and + // [batch, + // max_prefix_soft_prompt_length, + // hidden] + // to + // generate + // embedding + // [batch, + // beam, + // length + + // max_prefix_soft_prompt_length, + // hidden] + int tmp_index = index; + const int hidden_id = tmp_index % param.hidden_units; + tmp_index = (tmp_index - hidden_id) / param.hidden_units; + const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); + tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); + const int beam_id = tmp_index % param.beam_width; + tmp_index = (tmp_index - beam_id) / param.beam_width; + const int batch_id = tmp_index % param.batch_size; const int64_t hidden_units = param.hidden_units; - T embedding = + T embedding = (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ? 
- (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + seq_id * hidden_units + hidden_id] : param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + beam_id * param.max_input_length @@ -292,7 +439,8 @@ template void invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt( inputIdsEmbeddingLookupPosEncodingSoftPromptParam<__nv_bfloat16> param); #endif -// TODO Add half2 implementation +// TODO Add half2 +// implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) { @@ -329,9 +477,11 @@ invokeTransposeAxis01(int* out, int* in, const int dim0, const int dim1, const i template __global__ void transposeAxis01(T* out, T* in, const int* in_skipping_dim1, const int dim0, const int dim1) { - // out: [dim1, dim0] - // in: [dim0, dim1] - // in_skipping_dim1: [dim1] + // out: [dim1, + // dim0] in: + // [dim0, dim1] + // in_skipping_dim1: + // [dim1] int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < dim0 * dim1) { @@ -363,8 +513,15 @@ __global__ void buildDecoderAttentionMaskKernel(T* attention_mask, const int max_seq_len, const int max_prompt_length) { - // sequence_lengths: [batch_size] - // attention_mask: [batch_size, 1, max_seq_len, max_seq_len + max_prompt_length] + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, + // 1, + // max_seq_len, + // max_seq_len + // + + // max_prompt_length] const int max_prompt_seq_length = max_seq_len + max_prompt_length; const int mask_size_per_seq = max_seq_len * max_prompt_seq_length; attention_mask += blockIdx.x * mask_size_per_seq; @@ -581,29 +738,100 @@ template __global__ void find_context_dups(int* shared_contexts, const int* input_ids, const size_t batch_size, const size_t input_seq_len) { - /* We compare all context pairs (i, j), with i (tgt) < j (src) , to detect duplicate - * inputs. If there's a match between i and j, we store i at the - * j-th position of shared_context. So that we know that j can be - * represented by i. shared_contexts is initialized like shared_contexts[i] = i - * and when there's a match, we actually use shared_contexts[j] = min(shared_contexts[j], i) - * so that in the end, shared_contexts effectively contains an index - * to the match with the lowest index context. - * Note that shared_contexts[i] <= i, a property that will be used when uncompacting + /* We compare + * all context + * pairs (i, + * j), with i + * (tgt) < j + * (src) , to + * detect + * duplicate + * inputs. If + * there's a + * match + * between i + * and j, we + * store i at + * the j-th + * position of + * shared_context. + * So that we + * know that j + * can be + * represented + * by i. + * shared_contexts + * is + * initialized + * like + * shared_contexts[i] + * = i and when + * there's a + * match, we + * actually use + * shared_contexts[j] + * = + * min(shared_contexts[j], + * i) so that + * in the end, + * shared_contexts + * effectively + * contains an + * index to the + * match with + * the lowest + * index + * context. + * Note that + * shared_contexts[i] + * <= i, a + * property + * that will be + * used when + * uncompacting * inputs. */ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ bool match; - /* Each block is responsible for a (i, j) pair. 
To map the block space to - * the i < j space, we need to convert a linear addressing to a triangle, of - * size (batch_size * (batch_size - 1)) / 2 - * For more information, check https://en.wikipedia.org/wiki/Triangular_number + /* Each block + * is + * responsible + * for a (i, j) + * pair. To map + * the block + * space to the + * i < j space, + * we need to + * convert a + * linear + * addressing + * to a + * triangle, of + * size + * (batch_size + * * (batch_size - 1)) / 2 + * For more + * information, + * check + * https://en.wikipedia.org/wiki/Triangular_number */ - // blockIdx = [0, 1, 2, ... n(n-1)/2] -> base_index = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, ..., n - 2] + // blockIdx = + // [0, 1, 2, + // ... + // n(n-1)/2] -> + // base_index = + // [0, 1, 1, 2, + // 2, 2, 3, 3, + // 3, 3, ..., n + // - 2] const int base_index = floorf(0.5f * (sqrtf(1 + 8 * blockIdx.x) - 1)); - const int src_idx = base_index + 1; // base_index \in [1, batch_size) + const int src_idx = base_index + 1; // base_index + // \in + // [1, + // batch_size) const int rev_base_index = base_index * (base_index + 1) / 2; const int tgt_idx = blockIdx.x - rev_base_index; // tgt_idx \in [0, src_idx) @@ -659,9 +887,19 @@ __global__ void generate_dups_indices(int* batch_to_compact, if (!masked && is_first_occur) { int compact_idx = scan + (first_iter ? 0 : scan_offset); - // Context rep. writes initial index + // Context + // rep. + // writes + // initial + // index batch_to_compact[seq_idx * beam_width] = compact_idx; - // input ids are tiled in context part + // input + // ids + // are + // tiled + // in + // context + // part compact_to_batch[compact_idx] = seq_idx * beam_width; } @@ -674,13 +912,27 @@ __global__ void generate_dups_indices(int* batch_to_compact, __syncthreads(); if (!masked && !is_first_occur) { - // Fill the rest of batch_to_compact based on what rep. wrote + // Fill + // the + // rest + // of + // batch_to_compact + // based + // on + // what + // rep. 
+ // wrote const int src_idx = batch_to_compact[shared_contexts[seq_idx] * beam_width]; batch_to_compact[seq_idx * beam_width] = src_idx; } if (!masked) { - // set same compact idx for beams + // set + // same + // compact + // idx + // for + // beams for (int beam_id = 1; beam_id < beam_width; ++beam_id) { batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width]; } @@ -713,11 +965,20 @@ void invokeFindContextDups(int* shared_contexts, { dim3 block{512}; dim3 grid{((int)batch_size + block.x - 1) / block.x}; - // set shared_context[i] = i + // set + // shared_context[i] = + // i init_shared_contexts<<>>(shared_contexts, batch_size); grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2}; - // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]} + // set + // shared_contexts[i] + // = j, where j + // = min{k, + // such that + // input_ids[k] + // == + // input_ids[i]} if (input_seq_len <= 128) { block = 128; find_context_dups<128><<>>(shared_contexts, input_ids, batch_size, input_seq_len); @@ -727,8 +988,21 @@ void invokeFindContextDups(int* shared_contexts, find_context_dups<256><<>>(shared_contexts, input_ids, batch_size, input_seq_len); } - // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch - // set compact_to_batch[i] = j, where j is such that compact_to_batch[i] = input_ids[j] + // set + // batch_to_compact[i] + // = j, where j + // is the + // position of + // input_ids[i] + // in the + // compact_batch + // set + // compact_to_batch[i] + // = j, where j + // is such that + // compact_to_batch[i] + // = + // input_ids[j] generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>( batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len); } @@ -782,10 +1056,29 @@ void invokeCompactInputs(T* compact_input, size_t hidden_dimension, cudaStream_t stream) { - /* Compact relevant decoder_layer inputs based on the identical contexts. - * For example, decoder_input is [batch_size, seq_len, H]. It's compacted - * into compact_input [compact_size, seq_len, H] such that - * compact_input[i, ...] = decoder_input[compact_idx[i], ...] */ + /* Compact + * relevant + * decoder_layer + * inputs based + * on the + * identical + * contexts. + * For example, + * decoder_input + * is + * [batch_size, + * seq_len, H]. + * It's + * compacted + * into + * compact_input + * [compact_size, + * seq_len, H] + * such that + * compact_input[i, + * ...] = + * decoder_input[compact_idx[i], + * ...] 
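// Illustrative sketch, not part of the patch: a serial version of the mappings
// built by find_context_dups / generate_dups_indices and consumed by
// invokeCompactInputs above, for a hypothetical batch of 4 contexts with
// beam_width = 1. shared_contexts[j] ends up pointing at the lowest-index
// duplicate of j, batch_to_compact maps every batch entry to its compact slot,
// and compact_to_batch maps each compact slot back to its representative entry.
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<std::vector<int>> input_ids = {{5, 6}, {7, 8}, {5, 6}, {7, 8}};
    const int batch_size = (int)input_ids.size();

    std::vector<int> shared_contexts(batch_size);
    for (int i = 0; i < batch_size; i++) {
        shared_contexts[i] = i;
    }
    for (int tgt = 0; tgt < batch_size; tgt++) {
        for (int src = tgt + 1; src < batch_size; src++) {
            if (input_ids[src] == input_ids[tgt] && tgt < shared_contexts[src]) {
                shared_contexts[src] = tgt;  // keep the lowest matching index
            }
        }
    }
    // shared_contexts = [0, 1, 0, 1]

    std::vector<int> batch_to_compact(batch_size);
    std::vector<int> compact_to_batch;
    for (int i = 0; i < batch_size; i++) {
        if (shared_contexts[i] == i) {  // first occurrence: open a new compact slot
            batch_to_compact[i] = (int)compact_to_batch.size();
            compact_to_batch.push_back(i);
        }
        else {  // duplicate: reuse the representative's slot
            batch_to_compact[i] = batch_to_compact[shared_contexts[i]];
        }
    }
    // batch_to_compact = [0, 1, 0, 1], compact_to_batch = [0, 1], compact_size = 2
    printf("compact_size = %d\n", (int)compact_to_batch.size());
    return 0;
}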
*/ const size_t elems_n = compact_size * seq_len * max(hidden_dimension, seq_len); const dim3 blockDim(512); const dim3 gridDim((elems_n + 512 - 1) / 512); @@ -828,8 +1121,19 @@ __global__ void uncompact_outputs(T* uncompact_buffer, size_t batch_size, size_t buffer_stride) { - /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride] - * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :] + /* Uncompact a + * buffer IN of + * size + * [Compact, + * Stride] into + * OUT of size + * [Batch, + * Stride] so + * that \forall + * i, OUT[i, :] + * = + * IN[batch_to_compact_idx[i], + * :] */ const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1124,4 +1428,5 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16); #endif #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION -} // namespace fastertransformer +} // namespace + // fastertransformer diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h index d78224e0a..bf4963231 100644 --- a/src/fastertransformer/kernels/gpt_kernels.h +++ b/src/fastertransformer/kernels/gpt_kernels.h @@ -59,6 +59,15 @@ struct pPromptTuningParam { const T* request_prompt_embedding = nullptr; }; +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, int* output_ids, diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..80a656cf7 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -19,6 +19,237 @@ #include "src/fastertransformer/utils/cuda_type_utils.cuh" namespace fastertransformer { +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; // Align on largest type + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float variance = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + T local_sum = cuda_cast(0.0f); + + const Float_Packed_T scale_from_int = cuda_cast(0.0f); + const Float_Packed_T scale_to_int = cuda_cast(0.0f); + +#pragma unroll + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = cuda_cast(0.0f); + + if (IS_BIAS) { + val = hadd2(val, ldg(&bias[i])); + } + if (RESIDUAL_NUM == 1) { + val = hadd2(val, ldg(&residual1[index])); + } + else if (RESIDUAL_NUM == 2) { + val = hadd2(hadd2(val, ldg(&residual1[index])), ldg(&residual2[index])); + } + + if (IS_OUTPUT) { + T in_val; + in_val = input[index]; + val = hadd2(val, in_val); + } + shmem[i] = val; + output[index] = val; + local_sum = hadd2(local_sum, val); + } + + float local_var_sum = 0.0f; +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < 
n; i += blockDim.x) { + T val = input[blockIdx.x * n + i]; + float diff_1 = (float)(val.x); + float diff_2 = (float)(val.y); + local_var_sum += (diff_1 * diff_1 + diff_2 * diff_2); + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +// * Note that typename T is half2 or bfloat2 type +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float x2_sum = 0.0f; + const int b_offset = blockIdx.x * n; + + using T1 = typename TypeConverter::Type; + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + const Float_Packed_T scale_vec_in = cuda_cast(0.0f); + const Float_Packed_T scale_vec = cuda_cast(0.0f); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + float val_1 = 0.0f; + float val_2 = 0.0f; + T tmp; + + if (IS_BIAS) { + tmp = ldg(&bias[i]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + if (RESIDUAL_NUM == 1) { + tmp = ldg(&residual1[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + else if (RESIDUAL_NUM == 2) { + tmp = ldg(&residual1[index]); + T tmp2 = ldg(&residual2[index]); + val_1 += (static_cast(tmp.x) + static_cast(tmp2.x)); + val_2 += (static_cast(tmp.y) + static_cast(tmp2.y)); + } + + if (IS_OUTPUT) { + tmp = ldg(&input[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + tmp.x = cuda_cast(val_1); + tmp.y = cuda_cast(val_2); + shmem[i] = tmp; + output[index] = tmp; + x2_sum += val_1 * val_1 + val_2 * val_2; + } + float sum_sq = blockReduceSum(x2_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(sum_sq / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +template +__global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const T* __restrict bias, + T* output, + T* norm_output, + const float layernorm_eps, + int m, + int n) +{ + int tid = threadIdx.x; + + // NOTE: float shmem may exceed the shared memory limit + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + __shared__ float s_variance; + float variance = 0.0f; + float local_sum = 0.0f; + for (int i = tid; i < n; i += 
blockDim.x) { + float local_out = 0.0f; + if (RESIDUAL_NUM == 1) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])); + } + else if (RESIDUAL_NUM == 2) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])) + float(ldg(&residual2[blockIdx.x * n + i])); + } + local_out += (float)(input[blockIdx.x * n + i]); + + if (bias != nullptr) { + local_out += (float)(ldg(&bias[i])); + } + shmem[i] = (T)local_out; + output[blockIdx.x * n + i] = (T)local_out; + local_sum += local_out; + } + + float local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float diff = (float)(output[blockIdx.x * n + i]); + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + float beta_val = (beta == nullptr) ? 0.0f : (float)(ldg(&beta[i])); + const float val = (((float)shmem[i] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); + + norm_output[blockIdx.x * n + i] = (T)val; + } +} // * Note that typename T is half2 or bfloat2 type template @@ -841,6 +1072,51 @@ __global__ void generalAddBiasResidualLayerNorm(const T* __restrict input, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version) +{ + size_t maxbytes = half_n * sizeof(T); + if (opt_version == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else if (opt_version == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt2 + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else { + FT_CHECK_WITH_INFO(false, "opt_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_output, T* output, @@ -919,6 +1195,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_o } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (is_output) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_output, T* output, @@ -990,6 
+1322,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_out } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (bias != nullptr) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, T* output, @@ -1061,6 +1449,66 @@ void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num) +{ + if (residual_num == 1) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else if (residual_num == 2) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + FT_CHECK_WITH_INFO(false, "residual_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_output, T* output, @@ -1136,6 +1584,108 @@ void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_ } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num, + int unroll_factor) +{ + switch (unroll_factor) { + case 1: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 2: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 4: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 8: + 
dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + default: + FT_CHECK_WITH_INFO(false, "unroll_factor must be 1, 2, 4 or 8"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, T* output, @@ -1263,6 +1813,105 @@ void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm } } +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version) +{ + const int residual_num = 1; + if (opt_version > 0 && sizeof(T) == 2 && n % 2 == 0) { + dim3 grid(m); + int half_n = n / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + + using T2 = typename TypeConverter::Type; + + /* we launch (and instantiate) the kernel by specializing for unroll_factor -> residual_num -> is_bias -> + * opt_version */ + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor((T2*)norm_output, + (T2*)output, + (const T2*)input, + (const T2*)bias, + (const T2*)residual1, + (const T2*)nullptr, + (const T2*)gamma, + (const T2*)beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + true, // is_output + residual_num, + unroll_factor); + } + else { + + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
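// Illustrative sketch, not part of the patch: the dispatch chain above follows a
// common runtime-to-compile-time pattern in which each wrapper switches on one
// runtime value and forwards the rest until the kernel is fully specialized.
// The stub names below are generic placeholders, not FasterTransformer APIs.
#include <cstdio>

template<int UNROLL_FACTOR, bool IS_BIAS>
void kernel_stub()
{
    printf("unroll=%d bias=%d\n", UNROLL_FACTOR, (int)IS_BIAS);
}

template<int UNROLL_FACTOR>
void dispatch_bias(bool has_bias)
{
    if (has_bias) {
        kernel_stub<UNROLL_FACTOR, true>();
    }
    else {
        kernel_stub<UNROLL_FACTOR, false>();
    }
}

void dispatch_unroll_factor(int unroll_factor, bool has_bias)
{
    switch (unroll_factor) {
        case 1: dispatch_bias<1>(has_bias); break;
        case 2: dispatch_bias<2>(has_bias); break;
        case 4: dispatch_bias<4>(has_bias); break;
        case 8: dispatch_bias<8>(has_bias); break;
        default: printf("unroll_factor must be 1, 2, 4 or 8\n");
    }
}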
+ */ + block.x = (block.x + 31) / 32 * 32; + + size_t maxbytes = n * sizeof(T); + if (residual_num == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + else if (residual_num == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + } +} + +#define INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(T) \ + template void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, \ + T* norm_output, \ + const T* input, \ + const T* residual1, \ + const T* gamma, \ + const T* beta, \ + const T* bias, \ + const float layernorm_eps, \ + int m, \ + int n, \ + cudaStream_t stream, \ + int opt_version) +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(float); +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(half); +#ifdef ENABLE_BF16 +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(__nv_bfloat16); +#endif +#undef INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM + /* output <- output + bias + residual_1 + residual_2 * output_norm <- LN(output) */ template @@ -1859,6 +2508,77 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, cudaStream_t stream); #endif +/******************* invokeGeneralLLaMALayerNorm ***********************/ + +template +__global__ void generalLLaMALayerNorm( + const T* __restrict input, const T* __restrict gamma, T* normed_output, const float layernorm_eps, int m, int n) +{ + const int tid = threadIdx.x; + + float local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float val = (float)(ldg(&input[blockIdx.x * n + i])); + local_var_sum += val * val; + } + + float variance = 0.0f; + variance = blockReduceSum(local_var_sum); + + __shared__ float s_variance; + if (threadIdx.x == 0) { + s_variance = rsqrtf((variance / (float)n) + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = (T) (((float)ldg(&input[index])) * s_variance); + normed_output[index] = val * ldg(&gamma[i]); + } +} + +template +void invokeGeneralLLaMALayerNorm( + T* out, const T* input, const T* gamma, const float layernorm_eps, const int m, const int n, cudaStream_t stream) +{ + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
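// Illustrative sketch, not part of the patch: a CPU reference for what
// generalLLaMALayerNorm above computes per row. Note there is no mean
// subtraction and no beta term: each row is scaled by rsqrt(mean(x^2) + eps)
// and then by gamma, i.e. the RMS-style normalization used by LLaMA.
#include <cmath>
#include <cstddef>
#include <vector>

void llama_layer_norm_cpu(std::vector<float>&       out,
                          const std::vector<float>& input,  // [m, n], row-major
                          const std::vector<float>& gamma,  // [n]
                          float                     layernorm_eps,
                          int                       m,
                          int                       n)
{
    out.resize((size_t)m * n);
    for (int row = 0; row < m; row++) {
        double sum_sq = 0.0;
        for (int i = 0; i < n; i++) {
            const float v = input[(size_t)row * n + i];
            sum_sq += (double)v * v;
        }
        const float inv_rms = 1.0f / std::sqrt((float)(sum_sq / n) + layernorm_eps);
        for (int i = 0; i < n; i++) {
            out[(size_t)row * n + i] = input[(size_t)row * n + i] * inv_rms * gamma[i];
        }
    }
}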
+ */ + if (n % 32 != 0) { + block.x = 1024; + } + + generalLLaMALayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); +} + +template void invokeGeneralLLaMALayerNorm(float* out, + const float* input, + const float* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +template void invokeGeneralLLaMALayerNorm(half* out, + const half* input, + const half* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#endif + /******************* invokeLayernormShiftPartition ***********************/ // applied to half2 and bfloat162 diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index d8ac09234..c7b31e874 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -24,7 +24,8 @@ namespace fastertransformer { -enum class LayerNormType { +enum class LayerNormType +{ pre_layernorm, post_layernorm, InvalidType @@ -61,6 +62,20 @@ void invokeAddBiasResidualLayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version = 2); + template void invokeGeneralAddBiasResidualPreLayerNorm(T* output, T* norm_output, @@ -161,6 +176,15 @@ void invokeGeneralT5LayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); + template void invokeGeneralAddResidualT5PreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu new file mode 100644 index 000000000..4b02602d9 --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -0,0 +1,389 @@ +#include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" + +#include + +#include +#include +#include + +using namespace std; +namespace fastertransformer { + +template +__global__ void LLaMA_get_last_tokens(T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size) +{ + // in [num_tokens, hidden_size] + // out [batch_size, hidden_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + out[batch_idx * hidden_size + idx] = in[pos * hidden_size + idx]; + } +} + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_get_last_tokens<<>>(out, in, cu_seqlens, batch_size, hidden_size); +} + +template void invokeLLaMAGetLastTokens( + float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +template void invokeLLaMAGetLastTokens( + half* out, half* in, const int* cu_seqlens, int batch_size, int 
hidden_size, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAGetLastTokens( + __nv_bfloat16* out, __nv_bfloat16* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +#endif + +__global__ void LLaMA_extract_targets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens) +{ + // in [batch_size, vocab_size] + // target_ids [ beam_width, num_tokens ] + // out [beam_width, batch_size] + int batch_idx = blockIdx.x * blockDim.x + threadIdx.x; + int beam_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (batch_idx >= batch_size || beam_idx >= beam_width) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + int target_idx = target_ids[beam_idx * num_tokens + pos]; + out[beam_idx * batch_size + batch_idx] = in[batch_idx * vocab_size + target_idx]; +} + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream) +{ + dim3 block(32, 4); + dim3 grid((batch_size + block.x - 1) / block.x, (beam_width + block.y - 1) / block.y); + LLaMA_extract_targets<<>>( + out, in, target_ids, cu_seqlens, beam_width, batch_size, vocab_size, num_tokens); +} + +__global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) +{ + // logits [T, V] + // out [T, V] + const int64_t ti = blockIdx.x; + __shared__ float s_sum, s_max; + + if (ti >= num_tokens) + return; + + float local_max = -1e20f; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_max = fmax(logit_val, local_max); + } + + float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); + if (threadIdx.x == 0) { + s_max = max_val; + } + __syncthreads(); + + float local_sum = 0; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_sum += __expf(logit_val - s_max); + } + float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum(local_sum); + if (threadIdx.x == 0) { + // s_sum = sum_val + 1e-6f; + s_sum = sum_val; + } + __syncthreads(); + + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + out[ti * vocab_size + i] = (logit_val - s_max) - __logf(s_sum); + } +} + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream) +{ + dim3 grid(num_tokens); + dim3 block(min(1024, vocab_size)); + LLaMA_log_softmax<<>>(out, logits, num_tokens, vocab_size); +} + +__global__ void LLaMA_gather_tokens_kernel(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens) +{ + // probs: [T, V] + // target_ids: [T] + // out: [batch_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + for (int i = threadIdx.x; i < input_lengths[batch_idx]; i += blockDim.x) { + int pos = cu_seqlens[batch_idx] + i; + int target_pos = target_ids[pos]; + val += (target_pos > 0) ? 
probs[pos * vocab_size + target_pos] : 0.f; + } + float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = sum; +} + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens, + cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_gather_tokens_kernel<<>>( + out, probs, input_lengths, target_ids, cu_seqlens, batch_size, vocab_size, num_tokens); +} + +template +__global__ void LLaMAstart_id_embedding_lookups_kernel( + T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_tokens * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int col_index = index % hidden_units; + const int input_id = input_ids[word_index]; + + out[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeLLaMAInputIdsEmbeddingLookup(T* out, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(num_tokens, 65536)); + dim3 block(min(hidden_units, 512)); + LLaMAstart_id_embedding_lookups_kernel + <<>>(out, embedding_table, input_ids, num_tokens, hidden_units); +} + +template void invokeLLaMAInputIdsEmbeddingLookup(float* out, + const float* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +template void invokeLLaMAInputIdsEmbeddingLookup(half* out, + const half* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAInputIdsEmbeddingLookup(__nv_bfloat16* out, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#endif + +__global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( + int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) +{ + // do cumulated sum + int total_seq_len = 0; + int cum_offset = 0; + int index = 0; + for (int i = 0; i < batch_size; i++) { + const int num_tokens = sequence_length[i]; + cu_seqlens[i] = total_seq_len; + for (int j = 0; j < num_tokens; j++) { + padding_offset[index] = cum_offset; + index++; + } + cum_offset += seq_len - num_tokens; + total_seq_len += num_tokens; + } + cu_seqlens[batch_size] = total_seq_len; +} + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream) +{ + LLaMAgetPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>( + padding_offset, cu_seqlens, input_lengths, batch_size, seq_len); +} + +template +__global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, + const int* sequence_lengths, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len) +{ + // attention_mask: + // [batch_size, 1, seq_len, attn_len] + const int batch_idx = blockIdx.x; + const int mask_size_per_seq = seq_len * attn_len; + attention_mask += batch_idx * mask_size_per_seq; + 
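// Illustrative sketch, not part of the patch: a host-side version of what
// LLaMAgetPaddingOffsetAndCuSeqLensKernel above computes. The input_lengths and
// seq_len values are made up; cu_seqlens holds the running sum of real tokens
// per batch entry, and padding_offset holds, for every real token, how many
// padding slots precede it in the [batch_size, seq_len] padded layout.
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> input_lengths = {3, 1, 2};
    const int              seq_len       = 4;

    std::vector<int> cu_seqlens(input_lengths.size() + 1, 0);
    std::vector<int> padding_offset;

    int total_seq_len = 0;
    int cum_offset    = 0;
    for (size_t i = 0; i < input_lengths.size(); i++) {
        cu_seqlens[i] = total_seq_len;
        for (int j = 0; j < input_lengths[i]; j++) {
            padding_offset.push_back(cum_offset);
        }
        cum_offset += seq_len - input_lengths[i];
        total_seq_len += input_lengths[i];
    }
    cu_seqlens[input_lengths.size()] = total_seq_len;

    // Expected: cu_seqlens = [0, 3, 4, 6], padding_offset = [0, 0, 0, 1, 4, 4]
    for (size_t i = 0; i < cu_seqlens.size(); i++) {
        printf("%d ", cu_seqlens[i]);
    }
    printf("\n");
    return 0;
}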
const int context_length = context_lengths[batch_idx]; + const int length = sequence_lengths[batch_idx]; + + for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { + int row_id = i / attn_len; + int col_id = i % attn_len; + if (row_id < length && col_id <= (row_id + context_length)) { + attention_mask[i] = (T)(1.0f); + } + else { + attention_mask[i] = (T)(0.0f); + } + } +} + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream) +{ + LLaMAbuildDecoderAttentionMaskKernel<<>>( + attention_mask, sequence_length, context_lengths, batch_size, seq_len, attn_len); +} + +template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); + +template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMABuildDecoderAttentionMask(__nv_bfloat16* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#endif + +template +__global__ void LLaMACopyKernel(T* dst, T* src, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + auto v_src = reinterpret_cast(src); + v_dst[idx] = v_src[idx]; +} + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMACopyKernel<<>>(dst, src, count); +} + +template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); +template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMACopyKernel(__nv_bfloat16* dst, __nv_bfloat16* src, const int count, cudaStream_t stream); +#endif + +template +__global__ void LLaMAMemset0Kernel(T* dst, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + v_dst[idx] = {0}; +} + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMAMemset0Kernel<<>>(dst, count); +} + +template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); +template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAMemset0(__nv_bfloat16* dst, const int count, cudaStream_t stream); +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h new file mode 100644 index 000000000..01e3bbf7a --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -0,0 +1,62 @@ +#pragma once + +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +namespace fastertransformer { + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream); + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int max_length, + cudaStream_t stream); +template +void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens, + cudaStream_t stream); + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index d0fb0a197..a513c1b47 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -278,14 +278,14 @@ __global__ void softmax_kernel(T* attn_score, // Loop along with Q dimension. for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) { - float data[ITEMS_PER_THREAD]; - int64_t qk_offset; - float local_max = -1e20f; + float data[ITEMS_PER_THREAD]; + int64_t qk_offset; + float local_max = -1e20f; // Loop along with K dimension. for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { - int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. - qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; + int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. 
+ qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; float qk_val = static_cast(qk[qk_offset]); float qk_bias = 0.0f; @@ -297,8 +297,8 @@ __global__ void softmax_kernel(T* attn_score, qk_bias += static_cast(linear_bias_slope * (ki - qi)); } - int64_t mask_offset = (bi * q_length + qi) * k_length + ki; - float mask_val = static_cast(ldg(&attn_mask[mask_offset])); + int64_t mask_offset = (bi * q_length + qi) * k_length + ki; + float mask_val = static_cast(ldg(&attn_mask[mask_offset])); qk_bias += (1.0f - mask_val) * -10000.0f; data[i] = qk_scale * qk_val + qk_bias; @@ -1363,9 +1363,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int head_idx = blockIdx.y; const int tidx = threadIdx.x; - const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const bool is_masked = tidx * vec_size >= size_per_head; - const bool is_masked = tidx * vec_size >= size_per_head; // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { const int prompt_batch_idx = blockIdx.x / param.max_prefix_prompt_length; @@ -1581,6 +1581,135 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #endif #undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSE +template +__global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths) +{ + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = (padding_offset == nullptr || token_idx < 0) ? 
0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; + + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + + const bool is_masked = tidx * vec_size >= size_per_head; + + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; + const int n = head_num * size_per_head; + + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + + Vec_t q, k, v; + if (!is_masked) { + q = *reinterpret_cast(&QKV[src_q_idx]); + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } + + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, context_lengths[batch_idx] + seq_idx); + + const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + const int dest_kv_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + if (!is_masked) { + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + *reinterpret_cast(&k_buf[dest_kv_idx]) = k; + *reinterpret_cast(&v_buf[dest_kv_idx]) = v; + } +} + +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream) +{ + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + head_num, + size_per_head, + rotary_embedding_dim, + context_lengths); +} + +template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, + float* k_buf, + float* v_buf, + float* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); + +template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, + half* k_buf, + half* v_buf, + half* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + __nv_bfloat16* k_buf, + __nv_bfloat16* v_buf, + __nv_bfloat16* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); +#endif + template __global__ void transpose_4d(T* dst, T* src, @@ -1760,6 +1889,7 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int size_per_head, \ const int local_head_num, \ cudaStream_t stream) + INSTANTIATETRANSPOSE4DBATCHMAJOR(float); INSTANTIATETRANSPOSE4DBATCHMAJOR(half); #ifdef ENABLE_BF16 @@ -1767,6 +1897,166 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #endif #undef INSTANTIATETRANSPOSE4DBATCHMAJOR +template +__global__ void 
transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int* context_lengths, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len) +{ + // [batch_size, head_num, seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto key_dst = + reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto val_dst = + reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); + + transpose_4d_save_to_cache<<>>( + k_dst, k_src, v_dst, v_src, context_lengths, head_num, size_per_head, seq_len, max_seq_len); +} + +#define INSTANTIATESAVETOCACHE(T) \ + template void invokeLLaMASaveToCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int* context_lengths, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ + const int seq_len, \ + const int max_seq_len, \ + cudaStream_t stream) +INSTANTIATESAVETOCACHE(float); +INSTANTIATESAVETOCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATESAVETOCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATESAVETOCACHE + +template +__global__ void transpose_4d_load_from_cache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len) +{ + // [batch_size, head_num, attn_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 
4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * attn_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); + + transpose_4d_load_from_cache<<>>( + k_dst, v_dst, k_src, v_src, head_num, size_per_head, seq_len, attn_len, max_seq_len); +} + +#define INSTANTIATELOADFROMCACHE(T) \ + template void invokeLLaMALoadFromCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ + const int seq_len, \ + const int attn_len, \ + const int max_seq_len, \ + cudaStream_t stream) +INSTANTIATELOADFROMCACHE(float); +INSTANTIATELOADFROMCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATELOADFROMCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATELOADFROMCACHE + template __global__ void addRelativeAttentionBias( T* qk_buf, const T* relative_attention_bias, const int batch_size, const int head_num, const int seq_len) @@ -1827,8 +2117,8 @@ INSTANTIATEADDRELATIVEATTENTIONBIAS(__nv_bfloat16); // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template __global__ void add_head3Size_QKV_bias(const T* mm_qkv, @@ -1878,8 +2168,8 @@ __global__ void add_head3Size_QKV_bias(const T* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, @@ -1931,8 +2221,8 @@ __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const half2* mm_qkv, @@ -2122,7 +2412,8 @@ INSTANTIATEADDHEAD3SIZEQKVBIAS(__nv_bfloat16); #endif #undef INSTANTIATEADDHEAD3SIZEQKVBIAS -/******************* invokeMaskedSoftMaxWithRelPosBias ***********************/ +/******************* invokeMaskedSoftMaxWithRelPosBias + * ***********************/ // grid = (window_len/word_per_thread, window_num*num_head, batch_size) // block.x = max(32, (window_len + 
31)/32*32) @@ -2471,7 +2762,8 @@ __global__ void transpose_attentions( // attentions_in shape [B, H, S, S] // attentions_out shape [B, L, H, S, S]. // Note that we write the L dimension as if it was index 0. - // In reality, the pointer has already been shifted to point to the correct layer. + // In reality, the pointer has already been shifted to point to the + // correct layer. const auto batch_idx = blockIdx.x; const auto head_idx = blockIdx.y; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 7ac7604d4..4f55af19e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -113,6 +113,21 @@ struct PrefixPromptBatchWeightsParam { const size_t prefix_prompt_layer_offset_per_seq = 0; }; +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* start_pos, + cudaStream_t stream); + template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, @@ -189,6 +204,31 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int local_head_num, cudaStream_t stream); +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + cudaStream_t stream); +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len, + cudaStream_t stream); + template void invokeAddRelativeAttentionBias(T* qk_buf, const T* relative_attention_bias, diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt index 1f0e93b1b..13821892d 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -44,6 +44,11 @@ set_property(TARGET GptContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) +add_library(LLaMAContextAttentionLayer STATIC LLaMAContextAttentionLayer.cc) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) + add_library(DisentangledAttentionLayer STATIC DisentangledAttentionLayer.cc) set_property(TARGET DisentangledAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DisentangledAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc new file 
mode 100644
index 000000000..28c0f6f55
--- /dev/null
+++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h"
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
+#include "src/fastertransformer/kernels/llama_kernels.h"
+#include "src/fastertransformer/utils/llama_utils.h"
+#include "src/fastertransformer/utils/nvtx_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+void LLaMAContextAttentionLayer<T>::forward(TensorMap*                output_tensors,
+                                            TensorMap*                input_tensors,
+                                            const AttentionWeight<T>* attention_weights)
+{
+    // input_tensors:
+    //      input_query [num_tokens, hidden_dimension]
+    //      attention_mask [batch_size, 1, seq_len, attn_len]
+    //      context_lengths, int, [batch_size]
+    //      attention_type [1]
+    //      padding_offset [num_tokens] (optional)
+    //      cu_seqlens [batch_size+1] (optional)
+
+    // output_tensors:
+    //      hidden_features [num_tokens, hidden_dimension]
+    //      key_cache [batch, local_head_num, max_seq_len, size_per_head]
+    //      value_cache [batch, local_head_num, max_seq_len, size_per_head]
+
+    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    FT_CHECK(output_tensors->at("key_cache").shape.size() == 4);
+    FT_CHECK(output_tensors->at("value_cache").shape.size() == 4);
+    const int batch_size  = input_tensors->at("attention_mask").shape[0];
+    const int seq_len     = input_tensors->at("attention_mask").shape[2];
+    const int attn_len    = input_tensors->at("attention_mask").shape[3];
+    const int max_seq_len = output_tensors->at("key_cache").shape[2];
+
+    T*         attention_input = input_tensors->at("input_query").getPtr<T>();
+    T*         attention_mask  = input_tensors->at("attention_mask").getPtr<T>();
+    const int* context_lengths = input_tensors->at("context_lengths").getPtr<int>();
+    const int* padding_offset  = input_tensors->getPtr<int>("padding_offset", nullptr);
+    const int* cu_seqlens      = input_tensors->getPtr<int>("cu_seqlens", nullptr);
+    const AttentionType attention_type = input_tensors->getVal<AttentionType>("attention_type");
+    T*         attention_out   = output_tensors->at("hidden_features").getPtr<T>();
+    T*         key_cache       = output_tensors->getPtr<T>("key_cache");
+    T*         value_cache     = output_tensors->getPtr<T>("value_cache");
+
+    FT_CHECK_WITH_INFO(seq_len <= attn_len, "attn_len must be larger than or equal to seq_len");
+    FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA,
+                       "LLaMA Context FUSED_PADDED_MHA is not supported !");
+
+    PUSH_RANGE("attention buffer alloc");
+    allocateBuffer(batch_size, seq_len, attn_len);
+    POP_RANGE;
+    sync_check_cuda_error();
+
+    const int num_tokens = input_tensors->at("input_query").shape[0];
+
+    PUSH_RANGE("qkv_gemm");
+
+    cublas_wrapper_->Gemm(CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          3 * hidden_units_,  // n
+                          num_tokens,
hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * hidden_units_, // n + attention_input, + hidden_units_, // k + qkv_buf_, + 3 * hidden_units_ /* n */); + sync_check_cuda_error(); + + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + //cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + invokeLLaMAMemset0(q_buf_2_, batch_size * (seq_len + 2 * attn_len) * hidden_units_, stream_); + sync_check_cuda_error(); + } + + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + qkv_buf_, + padding_offset, + batch_size, + seq_len, + num_tokens, + head_num_, + size_per_head_, + rotary_embedding_dim_, + context_lengths, + stream_); + sync_check_cuda_error(); + + invokeLLaMASaveToCache(key_cache, + value_cache, + k_buf_2_, + v_buf_2_, + context_lengths, + batch_size, + head_num_, + size_per_head_, + seq_len, + max_seq_len, + stream_); + sync_check_cuda_error(); + + invokeLLaMALoadFromCache(k_buf_2_, + v_buf_2_, + key_cache, + value_cache, + batch_size, + head_num_, + size_per_head_, + seq_len, + attn_len, + max_seq_len, + stream_); + sync_check_cuda_error(); + + POP_RANGE; + + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length + const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + FT_CHECK(gemm_data_type != CUDA_R_32F); + + // + // softmax(Q*K^T) + // + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + + batch_size * head_num_); + sync_check_cuda_error(); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + 
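+        // Padded case: transpose (batch_size, head_num, seq_len, size_per_head) back to
+        // (num_tokens, head_num * size_per_head) and drop the padding rows via padding_offset.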
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + num_tokens, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + POP_RANGE; + sync_check_cuda_error(); + + PUSH_RANGE("proj gemm"); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + num_tokens, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); + sync_check_cuda_error(); + POP_RANGE; + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } + sync_check_cuda_error(); + FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + rotary_embedding_dim_(rotary_embedding_dim) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer): + BaseAttentionLayer(attention_layer.stream_, + attention_layer.cublas_wrapper_, + attention_layer.allocator_, + attention_layer.is_free_buffer_after_forward_), + head_num_(attention_layer.head_num_), + size_per_head_(attention_layer.size_per_head_), + hidden_units_(attention_layer.hidden_units_), + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_) +{ +} + +template +LLaMAContextAttentionLayer::~LLaMAContextAttentionLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = + (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; + + // save memory usage when using fmha + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + + qk_buf_float_ = + (float*)allocator_->reMalloc(qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); + + is_allocate_buffer_ = true; +} + +template +void LLaMAContextAttentionLayer::freeBuffer() +{ + if (is_allocate_buffer_) { + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + allocator_->free((void**)(&qkv_buf_)); + allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&k_buf_2_)); + allocator_->free((void**)(&v_buf_2_)); + allocator_->free((void**)(&qk_buf_)); + allocator_->free((void**)(&qkv_buf_2_)); + allocator_->free((void**)(&qkv_buf_3_)); + 
allocator_->free((void**)(&qk_buf_float_)); + + is_allocate_buffer_ = false; + } +} + +template class LLaMAContextAttentionLayer; +template class LLaMAContextAttentionLayer; +#ifdef ENABLE_BF16 +template class LLaMAContextAttentionLayer<__nv_bfloat16>; +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h new file mode 100644 index 000000000..504cc8aba --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "3rdparty/trt_fused_multihead_attention/qkvToContext.h" +#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" +#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" + +namespace fastertransformer { + +template +class LLaMAContextAttentionLayer: public BaseAttentionLayer { +private: + // metadata + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; + + // fmha runner + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len); + void freeBuffer() override; + + using BaseAttentionLayer::is_free_buffer_after_forward_; + using BaseAttentionLayer::is_allocate_buffer_; + using BaseAttentionLayer::cublas_wrapper_; + +protected: + using BaseAttentionLayer::allocator_; + using BaseAttentionLayer::stream_; + T* qkv_buf_ = nullptr; + T* q_buf_2_ = nullptr; + T* k_buf_2_ = nullptr; + T* v_buf_2_ = nullptr; + T* qk_buf_ = nullptr; + float* qk_buf_float_ = nullptr; + T* qkv_buf_2_ = nullptr; + T* qkv_buf_3_ = nullptr; + +public: + LLaMAContextAttentionLayer(size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward); + + LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); + + virtual ~LLaMAContextAttentionLayer(); + + void + forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) override; +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..afc4f8b7b 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git 
a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..287a350da --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,51 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.8) + +add_library(LLaMADecoderLayerWeight STATIC LLaMADecoderLayerWeight.cc) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) +set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper + LLaMAContextAttentionLayer + FfnLayer + layernorm_kernels + add_residual_kernels + llama_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAWeight STATIC LLaMAWeight.cc) +set_property(TARGET LLaMAWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAWeight PUBLIC LLaMADecoderLayerWeight cuda_utils logger) + +add_library(LLaMA STATIC LLaMA.cc) +set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMA PUBLIC -lcudart + LLaMAContextDecoder + decoding_kernels + llama_kernels + tensor + LLaMAWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc new file mode 100644 index 000000000..1cc8a95c4 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/utils/llama_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include +#include + +namespace fastertransformer { + +template +void LLaMA::initialize() +{ + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + layernorm_eps_, + rank_, + world_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + attention_type_); +} + +template +void LLaMA::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + input_attention_mask_ = + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * attn_len, false)); + + if (is_context) { + const size_t self_cache_size = + (num_layer_ / world_size_) * batch_size * max_seq_len_ * hidden_units_; + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); + value_cache_ = key_cache_ + self_cache_size; + } + + context_decoder_input_buf_ = + (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + + context_output_buf_ = + (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + logits_buf_ = + (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + log_likelihood_buf_ = + (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + + is_allocate_buffer_ = true; +} + +template +void LLaMA::freeBuffer() +{ + if (is_allocate_buffer_) { + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&key_cache_)); + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&context_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&log_likelihood_buf_)); + is_allocate_buffer_ = false; + } +} + +template +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + 
size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), + hidden_units_(head_num * size_per_head), + rank_(rank), + world_size_(world_size), + attention_type_(attention_type) +{ + initialize(); +} + +template +LLaMA::LLaMA(LLaMA const& llama): + BaseLayer(llama), + head_num_(llama.head_num_), + size_per_head_(llama.size_per_head_), + inter_size_(llama.inter_size_), + num_layer_(llama.num_layer_), + vocab_size_(llama.vocab_size_), + rotary_embedding_dim_(llama.rotary_embedding_dim_), + random_seed_(llama.random_seed_), + max_seq_len_(llama.max_seq_len_), + hidden_units_(llama.hidden_units_), + rank_(llama.rank_), + world_size_(llama.world_size_), + attention_type_(llama.attention_type_) +{ + initialize(); +} + +template +LLaMA::~LLaMA() +{ + delete llama_context_decoder_; + freeBuffer(); +} + +template +void LLaMA::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LLaMAWeight* llama_weights) +{ + FT_CHECK(false); +} + +template +void LLaMA::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights) +{ + // + // input_tensors: + // input_ids [num_tokens] + // input_lengths [batch_size] + // target_ids [beam_width, num_tokens] + // context_lengths [batch_size] + // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu + + // output_tensors: + // hidden_vector [num_tokens, hidden_size] + // cum_probs [beam_width, batch_size] + + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 1); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->at("target_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("context_lengths").shape.size() == 1); + + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("input_ids").shape[0]; + const size_t beam_width = input_tensors->at("target_ids").shape[0]; + + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* target_ids = input_tensors->at("target_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); + + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); + + allocateBuffer(batch_size, seq_len, attn_len, is_context); + sync_check_cuda_error(); + + if (is_unpadded_mha) { + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + } + + invokeLLaMABuildDecoderAttentionMask( + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); + sync_check_cuda_error(); + + if (rank_ == 0) { + invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, + 
llama_weights->pre_decoder_embedding_table, + input_ids, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {num_tokens, hidden_units_}, + rank_ == 0 ? context_decoder_input_buf_ : hidden_vector + }}, + {"attention_mask", + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, + {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; + + if (is_unpadded_mha) { + decoder_input_tensors.insert({"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); + decoder_input_tensors.insert({"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_seqlens_}}); + } + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {num_tokens, hidden_units_}, + (rank_ == world_size_ - 1) ? context_decoder_output_buf_ : hidden_vector + }}, + {"key_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + key_cache_}}, + {"value_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + value_cache_}}}; + + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); + sync_check_cuda_error(); + + if (is_context) { + invokeLLaMAGetLastTokens( + context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); + sync_check_cuda_error(); + + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAExtractTargets( + cum_probs, log_likelihood_buf_, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } + else { + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_decoder_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + num_tokens, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + 
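+        // The logits projection above ran with FP16 inputs/outputs and FP32 accumulation
+        // (see setGemmConfig above); the call below restores the default FP16 GEMM configuration.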
cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, num_tokens, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAGatherTokens( + cum_probs, log_likelihood_buf_, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } +} + +template class LLaMA; +template class LLaMA; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h new file mode 100644 index 000000000..117f87341 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" + +namespace fastertransformer { + +template +class LLaMA: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; + size_t hidden_units_; + size_t rank_; + size_t world_size_; + static constexpr float layernorm_eps_ = 1e-6f; + AttentionType attention_type_; + + LLaMAContextDecoder* llama_context_decoder_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context); + void freeBuffer() override; + + void initialize(); + +protected: + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; + T* context_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; + + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(LLaMA const& LLaMA); + + ~LLaMA(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LLaMAWeight* llama_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights); + +}; + +} // namespace fastertransformer diff --git 
a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc
new file mode 100644
index 000000000..7406c838f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h"
+#include "src/fastertransformer/kernels/llama_kernels.h"
+#include "src/fastertransformer/layers/FfnLayer.h"
+#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h"
+#include "src/fastertransformer/utils/llama_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+void LLaMAContextDecoder<T>::initialize()
+{
+    self_attention_layer_ = new LLaMAContextAttentionLayer<T>(head_num_,
+                                                              size_per_head_,
+                                                              head_num_,
+                                                              rotary_embedding_dim_,
+                                                              stream_,
+                                                              cublas_wrapper_,
+                                                              allocator_,
+                                                              is_free_buffer_after_forward_);
+
+    ffn_layer_ = new SiluFfnLayer<T>(0,  // max_batch_size
+                                     0,  // max_seq_len
+                                     head_num_,
+                                     size_per_head_,
+                                     0,  // expert_num
+                                     inter_size_,
+                                     stream_,
+                                     cublas_wrapper_,
+                                     allocator_,
+                                     is_free_buffer_after_forward_,
+                                     false,
+                                     true  // use_gated_activation
+    );
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::allocateBuffer()
+{
+    FT_CHECK(false);
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::allocateBuffer(size_t batch_size, size_t seq_len)
+{
+    decoder_normed_input_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    self_attn_output_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    decoder_layer_output_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    is_allocate_buffer_ = true;
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::freeBuffer()
+{
+    if (is_allocate_buffer_ == true) {
+        allocator_->free((void**)(&decoder_normed_input_));
+        allocator_->free((void**)(&self_attn_output_));
+        allocator_->free((void**)(&decoder_layer_output_));
+        is_allocate_buffer_ = false;
+    }
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isValidLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l >= local_num_layer * rank_) && (l < local_num_layer * (rank_ + 1));
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isFirstLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l == local_num_layer * rank_);
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isLastLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l == local_num_layer * (rank_ + 1) - 1);
+}
+
+template<typename T>
+int LLaMAContextDecoder<T>::getFirstLayerParallelId()
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return local_num_layer
* rank_; +} + +template +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + AttentionType attention_type): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + rank_(rank), + world_size_(world_size), + attention_type_(attention_type) +{ + initialize(); +} + +template +LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + rank_(decoder.rank_), + world_size_(decoder.world_size_), + attention_type_(decoder.attention_type_) +{ + initialize(); +} + +template +LLaMAContextDecoder::~LLaMAContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LLaMAContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}, + {"context_lengths", input_tensors->at(3)}, + {"seq_len", input_tensors->at(4)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}}; + + forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); +} + +template +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) +{ + // input tensors: + // decoder_input [num_tokens, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, attn_len] + // input_lengths [batch_size] + // context_lengths [batch_size] + // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu + // padding_offset [batch_size] int on cpu + // cu_seqlens [batch_size+1] int on cpu + + // output tensors: + // decoder_output [num_tokens, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + + FT_CHECK(input_tensors->size() >= 5); + FT_CHECK(output_tensors->size() == 3); + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; + + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + 
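+    // seq_len above is the query length of the current chunk; attn_len below is the
+    // key/value length it attends to (typically context_length + seq_len).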
const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int is_context = input_tensors->at("is_context").getVal(); + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; + if (is_unpadded_mha) { + padding_offset = input_tensors->at("padding_offset").getPtr(); + cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); + } + allocateBuffer(batch_size, seq_len); + sync_check_cuda_error(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + const bool is_final = false; + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (isFirstLayerParallelId(l)) { + layer_input = decoder_input; + } + if (isLastLayerParallelId(l)) { + layer_output = decoder_output; + } + + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, + attention_mask}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset}); + self_attention_input_tensors.insert("cu_seqlens", + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + 
llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + num_tokens, + hidden_units_, + stream_); + + sync_check_cuda_error(); + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LLaMAContextDecoder; +template class LLaMAContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h new file mode 100644 index 000000000..3e2aeb0c0 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "src/fastertransformer/kernels/add_residual_kernels.h"
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/layers/BaseLayer.h"
+#include "src/fastertransformer/layers/FfnLayer.h"
+#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
+#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h"
+#include "src/fastertransformer/utils/Tensor.h"
+#include "src/fastertransformer/utils/allocator.h"
+#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/fastertransformer/utils/nccl_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+class LLaMAContextDecoder: public BaseLayer {
+private:
+    // meta data
+    size_t head_num_;
+    size_t size_per_head_;
+    size_t inter_size_;
+    size_t num_layer_;
+    size_t rotary_embedding_dim_;
+    float layernorm_eps_;
+    size_t hidden_units_;
+    size_t rank_;
+    size_t world_size_;
+    AttentionType attention_type_;
+
+    BaseAttentionLayer<T>* self_attention_layer_;
+    FfnLayer<T>* ffn_layer_;
+
+    void allocateBuffer() override;
+    void allocateBuffer(size_t batch_size, size_t seq_len);
+    void freeBuffer() override;
+
+    bool isValidLayerParallelId(uint l);
+    bool isFirstLayerParallelId(uint l);
+    bool isLastLayerParallelId(uint l);
+    int getFirstLayerParallelId();
+
+    void initialize();
+
+protected:
+    T* decoder_normed_input_ = nullptr;
+    T* self_attn_output_ = nullptr;
+    T* decoder_layer_output_ = nullptr;
+
+public:
+    LLaMAContextDecoder(size_t head_num,
+                        size_t size_per_head,
+                        size_t inter_size,
+                        size_t num_layer,
+                        size_t rotary_embedding_dim,
+                        float layernorm_eps,
+                        size_t rank,
+                        size_t world_size,
+                        cudaStream_t stream,
+                        cublasMMWrapper* cublas_wrapper,
+                        IAllocator* allocator,
+                        bool is_free_buffer_after_forward,
+                        AttentionType attention_type = AttentionType::FUSED_MHA);
+
+    LLaMAContextDecoder(LLaMAContextDecoder<T> const& decoder);
+
+    ~LLaMAContextDecoder();
+
+    void forward(std::vector<Tensor>* output_tensors,
+                 const std::vector<Tensor>* input_tensors,
+                 const std::vector<LLaMADecoderLayerWeight<T>*>* decoder_layer_weights);
+
+    void forward(std::unordered_map<std::string, Tensor>* output_tensors,
+                 const std::unordered_map<std::string, Tensor>* input_tensors,
+                 const std::vector<LLaMADecoderLayerWeight<T>*>* llama_decoder_layer_weight);
+};
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc
new file mode 100644
index 000000000..6f3a7721f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size): + hidden_units_(hidden_units), inter_size_(inter_size) +{ + mallocWeights(); + setWeightPtr(); +} + +template +LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 14; i++) { + if (i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): + hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); +} + +template +LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[11], 
other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); + + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.bias.bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_), (size_t)hidden_units_}, + dir_path + ".attention.wo.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w1.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_), (size_t)hidden_units_}, + dir_path + ".feed_forward.w2.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[10], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w3.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); +} + +template +void LLaMADecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[10]; + ffn_weights.intermediate_weight2.bias = weights_ptr[11]; + + post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.gamma = weights_ptr[13]; + is_maintain_buffer = true; +} + +template +void LLaMADecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); + deviceMalloc(&weights_ptr[5], hidden_units_); + 
+    deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_);
+    deviceMalloc(&weights_ptr[7], inter_size_);
+    deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_);
+    deviceMalloc(&weights_ptr[9], hidden_units_);
+    deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_);
+    deviceMalloc(&weights_ptr[11], inter_size_);
+    deviceMalloc(&weights_ptr[12], hidden_units_);
+    deviceMalloc(&weights_ptr[13], hidden_units_);
+}
+
+template struct LLaMADecoderLayerWeight<float>;
+template struct LLaMADecoderLayerWeight<half>;
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h
new file mode 100644
index 000000000..35d16300f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/layers/FfnWeight.h"
+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
+#include "src/fastertransformer/utils/cuda_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+struct LLaMADecoderLayerWeight {
+public:
+    LLaMADecoderLayerWeight() = default;
+    LLaMADecoderLayerWeight(const int hidden_units, const int inter_size);
+    ~LLaMADecoderLayerWeight();
+    LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other);
+    LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other);
+
+    void loadModel(std::string dir_path, FtCudaDataType model_file_type);
+
+    LayerNormWeight<T> pre_layernorm_weights;
+    AttentionWeight<T> self_attention_weights;
+    LayerNormWeight<T> post_attention_layernorm_weights;
+    FfnWeight<T>       ffn_weights;
+
+private:
+    int hidden_units_;
+    int inter_size_;
+    const int attention_dense_bias_weight_id = 5;
+    bool is_maintain_buffer = false;
+    T* weights_ptr[14];
+
+    void setWeightPtr();
+    void mallocWeights();
+};
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc
new file mode 100644
index 000000000..f1c51e340
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMAWeight.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/fastertransformer/models/llama/LLaMAWeight.h" + +namespace fastertransformer { + +template +LLaMAWeight::LLaMAWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size, + const int layer_para_rank): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(hidden_units_, inter_size_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +LLaMAWeight::~LLaMAWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + prompt_token_weight_size_(other.prompt_token_weight_size_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void LLaMAWeight::setWeightPtr() +{ + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; +} + +template +void LLaMAWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights); + + 
deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + is_maintain_buffer = true; +} + +template +void LLaMAWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin(weights_ptr[0], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.tok_embeddings.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.output.weight.bin", + model_file_type); + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void LLaMAWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight()); + } +} + +template +bool LLaMAWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct LLaMAWeight; +template struct LLaMAWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h new file mode 100644 index 000000000..e1fed4309 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct LLaMAWeight { + + LLaMAWeight() = default; + LLaMAWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size = 1, + const int layer_para_rank = 0); + + ~LLaMAWeight(); + LLaMAWeight(const LLaMAWeight& other); + LLaMAWeight& operator=(const LLaMAWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + const T* position_encoding_table = nullptr; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + + int layer_para_size_; + int layer_para_rank_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt index b9f2b9151..4e8d82d30 100644 --- a/src/fastertransformer/th_op/CMakeLists.txt +++ b/src/fastertransformer/th_op/CMakeLists.txt @@ -32,6 +32,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(bert) add_subdirectory(deberta) +add_subdirectory(llama) add_library(th_transformer SHARED $ @@ -49,6 +50,7 @@ add_library(th_transformer SHARED $ $ $ + $ ) target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_bart @@ -66,6 +68,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_t5 th_utils th_vit + th_llama ) if(ENABLE_FP8) diff --git a/src/fastertransformer/th_op/llama/CMakeLists.txt b/src/fastertransformer/th_op/llama/CMakeLists.txt new file mode 100755 index 000000000..75d13790e --- /dev/null +++ b/src/fastertransformer/th_op/llama/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(th_llama STATIC LLaMA.cc) +set_property(TARGET th_llama PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(th_llama PRIVATE "${TORCH_LIBRARIES}" LLaMA th_utils nccl_utils) diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc new file mode 100755 index 000000000..760ead92e --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/th_op/llama/LLaMA.h" + +namespace th = torch; +namespace torch_ext { + +LLaMA::LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights): + vocab_size_(vocab_size), st_(weights[0].scalar_type()) +{ + for (auto t : weights) { + CHECK_INPUT(t, st_); + } + + switch (st_) { + case at::ScalarType::Float: + ftllama = new FTLLaMA((size_t)num_heads, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)num_layers, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + (size_t)random_seed, + (size_t)max_seq_len, + (size_t)rank, + (size_t)world_size, + weights); + break; + case at::ScalarType::Half: + ftllama = new FTLLaMA((size_t)num_heads, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)num_layers, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + (size_t)random_seed, + (size_t)max_seq_len, + (size_t)rank, + (size_t)world_size, + weights); + break; + default: + throw std::runtime_error("Wrong Tensor type."); + } +} + +LLaMA::~LLaMA() +{ + delete ftllama; +} + +std::vector LLaMA::forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int64_t seq_len, + const int64_t attn_len, + const int64_t is_context) +{ + CHECK_TH_CUDA(input_ids); + CHECK_CONTIGUOUS(input_ids); + TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32"); + CHECK_TH_CUDA(input_lengths); + CHECK_CONTIGUOUS(input_lengths); + TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); + + ftllama->forward(hidden_vector, + cum_probs, + input_ids, + input_lengths, + target_ids, + context_lengths, + seq_len, + attn_len, + is_context); + return std::vector{hidden_vector, cum_probs}; +} + +} // namespace torch_ext + +static auto fasterTransformerGptTHS = +#ifdef LEGACY_THS + torch::jit::class_("FasterTransformerLLaMA") +#else + torch::jit::class_("FasterTransformer", "LLaMA") +#endif + .def(torch::jit::init>()) + .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h new file mode 100755 index 000000000..425f260df --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/th_op/th_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace ft = fastertransformer; +namespace th = torch; +namespace torch_ext { + +using std::vector; + +class IFLLaMA { +public: + virtual ~IFLLaMA() {} + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int seq_len, + const int attn_len, + const int is_context) = 0; +}; + +template +class FTLLaMA: public IFLLaMA { +public: + FTLLaMA(const size_t num_heads, + const size_t size_per_head, + const size_t inter_size, + const size_t num_layers, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const size_t random_seed, + const size_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights): + num_heads_(num_heads), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layers_(num_layers), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), + rank_(rank), + world_size_(world_size), + weights_(weights) + { + ft::Logger::getLogger().setLevel(ft::Logger::WARNING); + + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); + cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); + cublas_wrapper_mutex_ = new std::mutex(); + + llama_weights_.resizeLayer(num_layers_); + for (int i = 0; i < (int)num_layers_; i++) { + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + get_ptr(weights_[i + 0 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + get_ptr(weights_[i + 1 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + get_ptr(weights_[i + 2 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + get_ptr(weights_[i + 3 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + get_ptr(weights_[i + 4 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + get_ptr(weights_[i + 5 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + get_ptr(weights_[i + 6 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + get_ptr(weights_[i + 7 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + get_ptr(weights_[i + 8 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + get_ptr(weights_[i + 9 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.kernel = + get_ptr(weights_[i + 10 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.bias = + get_ptr(weights_[i + 11 * 
num_layers_]); + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + get_ptr(weights_[i + 12 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + get_ptr(weights_[i + 13 * num_layers_]); + } + + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[14 * num_layers_ + 0]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[14 * num_layers_ + 1]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[14 * num_layers_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); + + ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + ft::check_cuda_error(cudaStreamCreate(&stream_)); + + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventCreate(&event_[i])); + } + + cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); + cublasSetStream(cublasHandle, stream_); + + allocator_ = new ft::Allocator(); + cublas_wrapper_ = new ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); + + if (std::is_same::value) { + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper_->setFP32GemmConfig(); + } + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + // + llama_ = new ft::LLaMA(num_heads_, + size_per_head_, + inter_size_, + num_layers_, + vocab_size_, + rotary_embedding_dim_, + random_seed_, + max_seq_len_, + rank_, + world_size_, + stream_, + cublas_wrapper_, + allocator_, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type + ); + } + + ~FTLLaMA() override + { + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventDestroy(event_[i])); + } + ft::check_cuda_error(cudaStreamDestroy(stream_)); + + delete llama_; + delete cublas_wrapper_; + delete allocator_; + + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int seq_len, + const int attn_len, + const int is_context) override + { + const size_t batch_size = (size_t)input_lengths.size(0); + const size_t num_tokens = (size_t)input_ids.size(0); + const size_t beam_width = (size_t)target_ids.size(0); + + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{num_tokens}, get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, + {"target_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{beam_width, num_tokens}, + get_ptr(target_ids)}}, + {"context_lengths", + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, + {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, + {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; + + std::unordered_map output_tensors = 
std::unordered_map{ + {"hidden_vector", + ft::Tensor{ft::MEMORY_GPU, + (std::is_same::value) ? ft::TYPE_FP16 : ft::TYPE_FP32, + std::vector{num_tokens, num_heads_ * size_per_head_}, + get_ptr(hidden_vector)}}, + {"cum_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{beam_width, batch_size}, + get_ptr(cum_probs)}}}; + + try { + ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); + llama_->forward(&output_tensors, &input_tensors, &llama_weights_); + ft::check_cuda_error(cudaEventRecord(event_[ev_no_], stream_)); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_[ev_no_])); + ev_no_ = (ev_no_ + 1) % num_events_; + } + catch (std::runtime_error& error) { + std::cout << error.what(); + exit(-1); + } + catch (...) { + std::cout << "Runtime error"; + exit(-1); + } + } + +private: + const size_t num_heads_; + const size_t size_per_head_; + const size_t inter_size_; + const size_t num_layers_; + const size_t vocab_size_; + const size_t rotary_embedding_dim_; + const size_t random_seed_; + const size_t max_seq_len_; + const size_t rank_; + const size_t world_size_; + + static constexpr int num_events_ = 5; + int ev_no_ = 0; + cudaEvent_t event_[num_events_]; + cudaStream_t stream_; + + std::vector weights_; + cublasLtHandle_t cublasltHandle_; + std::mutex* cublas_wrapper_mutex_; + ft::cublasAlgoMap* cublas_algo_map_; + struct cudaDeviceProp prop_; + ft::LLaMAWeight llama_weights_; + + ft::cublasMMWrapper* cublas_wrapper_; + ft::IAllocator* allocator_; + ft::LLaMA* llama_ = nullptr; +}; + +class LLaMA: public th::jit::CustomClassHolder { +public: + LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights); + + ~LLaMA(); + + std::vector forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int64_t seq_len, + const int64_t attn_len, + const int64_t is_context); + +private: + const at::ScalarType st_; + size_t vocab_size_; + IFLLaMA* ftllama; + std::vector weights; +}; + +} // namespace torch_ext diff --git a/src/fastertransformer/utils/CMakeLists.txt b/src/fastertransformer/utils/CMakeLists.txt index 9796ad076..22f735c27 100644 --- a/src/fastertransformer/utils/CMakeLists.txt +++ b/src/fastertransformer/utils/CMakeLists.txt @@ -57,7 +57,7 @@ add_library(mpi_utils STATIC mpi_utils.cc) set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_MULTI_GPU) - target_link_libraries(mpi_utils PUBLIC -lmpi logger) + target_link_libraries(mpi_utils PUBLIC -lmpi -lmpi_cxx logger) endif() add_library(nccl_utils STATIC nccl_utils.cc) diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h new file mode 100644 index 000000000..deed71f2c --- /dev/null +++ b/src/fastertransformer/utils/llama_utils.h @@ -0,0 +1,182 @@ +#include +#include +#include + +namespace fastertransformer { + +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << " "; + } + if (end0 != start1) { + std::cout << "... "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << " "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int stride, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, int stride2, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != end1 - 1) + std::cout << "\n\n"; + } + std::cout << "]"; +} + +template +static void +_print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != end1 - 1) + std::cout << "\n\n\n"; + } + std::cout << "]"; +} + +template +static void print_tensor1(T* in, int dim1) +{ + T* out = (T*)malloc(sizeof(T) * dim1); + cudaMemcpy(out, in, sizeof(T) * dim1, cudaMemcpyDeviceToHost); + _print_tensor1(out, dim1, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor2(T* in, int dim1, int dim2, int stride1, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor2(&out[start], dim1, dim2, stride1, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + print_tensor2(in, dim1, dim2, dim2, dim1 * dim2, 0); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); +} + +template +static void +print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4, 0); +} + +} // namespace fastertransformer
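The `print_tensor*` helpers in `llama_utils.h` copy a device buffer back to the host and pretty-print it, truncating each dimension to its first and last three entries. Below is a minimal, illustrative usage sketch; the buffer name `d_hidden` and its dimensions are hypothetical and only demonstrate the calling convention of `print_tensor2`, not part of the FasterTransformer API.

```
// Hypothetical debugging snippet: allocate a small [num_tokens, hidden_units]
// buffer on the GPU and dump it with the print_tensor2 helper.
#include <cuda_runtime.h>
#include <vector>

#include "src/fastertransformer/utils/llama_utils.h"

int main()
{
    const int num_tokens   = 4;
    const int hidden_units = 8;

    // Fill a small host buffer with recognizable values and copy it to the device.
    std::vector<float> h_hidden(num_tokens * hidden_units);
    for (size_t i = 0; i < h_hidden.size(); ++i) {
        h_hidden[i] = 0.01f * static_cast<float>(i);
    }
    float* d_hidden = nullptr;
    cudaMalloc(&d_hidden, sizeof(float) * h_hidden.size());
    cudaMemcpy(d_hidden, h_hidden.data(), sizeof(float) * h_hidden.size(), cudaMemcpyHostToDevice);

    // Prints a [num_tokens, hidden_units] view; the helper itself copies the
    // data back to the host and truncates long dimensions for readability.
    fastertransformer::print_tensor2(d_hidden, num_tokens, hidden_units);

    cudaFree(d_hidden);
    return 0;
}
```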