diff --git a/.gitignore b/.gitignore index 77849f435..5b49d9183 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,7 @@ __pycache__/ **/.ipynb_checkpoints/ /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ +20B_checkpoints/ +compile_commands.json +model/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 870e67f0a..0d879611a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -418,7 +418,7 @@ add_library(transformer-shared SHARED if (BUILD_MULTI_GPU) target_link_libraries(transformer-shared PUBLIC - -lmpi + -lmpi -lmpi_cxx ${NCCL_LIBRARIES} ) endif() diff --git a/FasterTransformerReadME.md b/FasterTransformerReadME.md new file mode 100644 index 000000000..a00e0d631 --- /dev/null +++ b/FasterTransformerReadME.md @@ -0,0 +1,417 @@ +# FasterTransformer + +This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +## Table Of Contents + +- [FasterTransformer](#fastertransformer) + - [Table Of Contents](#table-of-contents) + - [Model overview](#model-overview) + - [Support matrix](#support-matrix) + - [Advanced](#advanced) + - [Global Environment](#global-environment) + - [Performance](#performance) + - [BERT base performance](#bert-base-performance) + - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) + - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) + - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) + - [Decoding and Decoder performance](#decoding-and-decoder-performance) + - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) + - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) + - [GPT performance](#gpt-performance) + - [Release notes](#release-notes) + - [Changelog](#changelog) + - [Known issues](#known-issues) + +## Model overview + +In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. + +FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. 
+ +### Support matrix + +| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | +| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | +| BERT | TensorFlow | Yes | Yes | - | - | - | - | +| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | +| BERT | Triton backend | Yes | - | - | Yes | Yes | - | +| BERT | C++ | Yes | Yes | - | - | - | Yes | +| XLNet | C++ | Yes | - | - | - | - | - | +| Encoder | TensorFlow | Yes | Yes | - | - | - | - | +| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | +| Decoder | TensorFlow | Yes | - | - | - | - | - | +| Decoder | PyTorch | Yes | - | - | - | - | - | +| Decoding | TensorFlow | Yes | - | - | - | - | - | +| Decoding | PyTorch | Yes | - | - | - | - | - | +| GPT | TensorFlow | Yes | - | - | - | - | - | +| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | +| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | +| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | +| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | +| Longformer | PyTorch | Yes | - | - | - | - | - | +| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | +| T5 | TensorFlow 2 | Yes | - | - | - | - | - | +| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | +| T5 | TensorRT | Yes | - | - | Yes | Yes | - | +| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | +| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | +| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | +| ViT | PyTorch | Yes | Yes | - | - | - | - | +| ViT | TensorRT | Yes | Yes | - | - | - | - | +| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | +| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | +| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | +| WeNet | C++ | Yes | - | - | - | - | - | +| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | +| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | + +* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. + +More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. + +## Advanced + +The following code lists the directory structure of FasterTransformer: + +``` +/src/fastertransformer: source code of FasterTransformer + |--/cutlass_extensions: Implementation of cutlass gemm/kernels. + |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. + |--/layers: Implementation of layer modules, like attention layer, ffn layer. + |--/models: Implementation of different models, like BERT, GPT. + |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
+ |--/tf_op: custom TensorFlow OP implementation + |--/th_op: custom PyTorch OP implementation + |--/triton_backend: custom Triton backend implementation + |--/utils: common CUDA utilities, like cublasMMWrapper, memory_utils +/examples: C++, TensorFlow and PyTorch interface examples + |--/cpp: C++ interface examples + |--/pytorch: PyTorch OP examples + |--/tensorflow: TensorFlow OP examples + |--/tensorrt: TensorRT examples +/docs: documents that explain the implementation details of the different models and show the benchmarks +/benchmark: scripts to run the benchmarks of the different models +/tests: unit tests +/templates: documents that explain how to add a new model/example into the FasterTransformer repo +``` + +Note that many folders contain sub-folders for the different models. The quantization tools have been moved to `examples`, e.g. `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. + + +### Global Environment + +FasterTransformer provides some convenient environment variables for debugging and testing. + +1. `FT_LOG_LEVEL`: controls the log level of debug messages. More details are in `src/fastertransformer/utils/logger.h`. Note that the program prints many messages and becomes very slow when the level is lower than `DEBUG`. +2. `FT_NVTX`: if it is set to `ON`, as in `FT_NVTX=ON ./bin/gpt_example`, the program inserts NVTX tags to help with profiling. +3. `FT_DEBUG_LEVEL`: if it is set to `DEBUG`, the program runs `cudaDeviceSynchronize()` after every kernel launch; otherwise, kernels are executed asynchronously by default. This helps to locate the failing kernel during debugging, but it affects performance significantly, so it should be used only for debugging. + +## Performance + +Hardware settings: + +* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor +* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz + +In order to run the following benchmarks, we need to install the Unix calculator tool "bc" by + +```bash +apt-get install bc +``` + +### BERT base performance + +The FP16 results of TensorFlow were obtained by running `benchmarks/bert/tf_benchmark.sh`. + +The INT8 results of TensorFlow were obtained by running `benchmarks/bert/tf_int8_benchmark.sh`. + +The FP16 results of PyTorch were obtained by running `benchmarks/bert/pyt_benchmark.sh`. + +The INT8 results of PyTorch were obtained by running `benchmarks/bert/pyt_int8_benchmark.sh`. + +More benchmarks are listed in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). + +#### BERT base performances of FasterTransformer new features + +The following figure compares the performance of the different FasterTransformer features against the FasterTransformer FP16 baseline on T4. + +For large batch sizes and sequence lengths, both EFF-FT and FT-INT8-v2 bring about a 2x speedup. Using Effective FasterTransformer and INT8-v2 at the same time brings about a 3.5x speedup compared to FasterTransformer FP16 for large cases.
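+
+As a concrete illustration, the per-framework numbers in the following subsections come from the benchmark scripts listed above, and the debug variables from the Global Environment section can be combined with any example binary. The snippet below is only a sketch: the script paths and `FT_*` variables are the ones documented in this file, while the working directory (repository root, after building) is an assumption.
+
+```bash
+# BERT base benchmarks (FP16 and INT8 on PyTorch), as referenced above.
+bash benchmarks/bert/pyt_benchmark.sh
+bash benchmarks/bert/pyt_int8_benchmark.sh
+
+# Debug/profiling variants of an example binary using the Global Environment variables.
+FT_NVTX=ON ./bin/gpt_example                                # insert NVTX ranges for profiling
+FT_LOG_LEVEL=DEBUG FT_DEBUG_LEVEL=DEBUG ./bin/gpt_example   # verbose logs + synchronous kernels (slow)
+```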
+ +#### BERT base performance on TensorFlow + +The following figure compares the performance of the different FasterTransformer features against TensorFlow XLA under FP16 on T4. + +For small batch sizes and sequence lengths, FasterTransformer brings about a 3x speedup. + +For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+ +#### BERT base performance on PyTorch + +The following figure compares the performance of the different FasterTransformer features against PyTorch TorchScript under FP16 on T4. + +For small batch sizes and sequence lengths, the FasterTransformer CustomExt brings about a 4x ~ 6x speedup. + +For large batch sizes and sequence lengths, Effective FasterTransformer with INT8-v2 quantization brings about a 5x speedup.
+ +### Decoding and Decoder performance + +The results of TensorFlow were obtained by running `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh`. + +The results of PyTorch were obtained by running `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. + +The decoding experiments use the following parameters: + +* head_num = 8 +* size_per_head = 64 +* num_layers = 6 for both encoder and decoder +* vocabulary_size = 32001 for the TensorFlow sample codes, 31538 for the PyTorch sample codes +* memory_hidden_dim = 512 +* max sequence length = 128 + +More benchmarks are listed in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). + +#### Decoder and Decoding end-to-end translation performance on TensorFlow + +The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to TensorFlow under FP16 on T4. Here, we report the throughput of translating a whole test set, since the total number of tokens generated by each method may differ. Compared to TensorFlow, FT-Decoder provides a 1.5x ~ 3x speedup, while FT-Decoding provides a 4x ~ 18x speedup.
+ +#### Decoder and Decoding end-to-end translation performance on PyTorch + +The following figure shows the speedup of the FT-Decoder op and the FT-Decoding op compared to PyTorch under FP16 on T4. Here, we report the throughput of translating a whole test set, since the total number of tokens generated by each method may differ. Compared to PyTorch, FT-Decoder provides a 1.2x ~ 3x speedup, while FT-Decoding provides a 3.8x ~ 13x speedup.
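+
+To make the speedup definition above concrete: since every method translates the same test set, the speedup is the ratio of end-to-end throughputs (tokens per second), which reduces to the ratio of wall-clock times. A minimal sketch with `bc`; the token count and times below are made-up numbers, not measured values:
+
+```bash
+total_tokens=300000      # assumed size of the translated test set, in tokens
+baseline_sec=250         # assumed PyTorch decoding time for the whole set
+ft_decoding_sec=20       # assumed FT-Decoding time for the whole set
+# Throughput ratio; the token count cancels out, leaving the time ratio.
+echo "scale=2; ($total_tokens / $ft_decoding_sec) / ($total_tokens / $baseline_sec)" | bc
+# prints 12.50, i.e. a 12.5x speedup
+```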
+ +### GPT performance + +The following figure compares the performance of Megatron and FasterTransformer under FP16 on A100. + +The decoding experiments use the following parameters: + +* head_num = 96 +* size_per_head = 128 +* num_layers = 48 for the GPT-89B model, 96 for the GPT-175B model +* data_type = FP16 +* vocab_size = 51200 +* top_p = 0.9 +* tensor parallel size = 8 +* input sequence length = 512 +* output sequence length = 32
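+
+As a sanity check on the model labels, the sizes follow from the hyper-parameters above using the usual rough estimate of `12 * num_layers * hidden_size^2` weights in the transformer blocks plus the vocabulary embedding. This is only a back-of-the-envelope calculation, not an exact parameter count:
+
+```bash
+hidden=$((96 * 128))   # hidden_size = head_num * size_per_head = 12288
+for layers in 48 96; do
+    echo "num_layers=$layers: $(echo "12 * $layers * $hidden * $hidden + 51200 * $hidden" | bc) parameters"
+done
+# num_layers=48: 87602233344  (~88B,  the GPT-89B configuration)
+# num_layers=96: 174575321088 (~175B, the GPT-175B configuration)
+```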
+ +## Release notes + +### Changelog + +May 2023 +- Fix bugs of generation early stopping + +January 2023 +- Support GPT MoE +- Support FP8 for Bert and GPT (**Experimental**) +- Support DeBERTa on TensorFlow 2 and PyTorch + +Dec 2022 +- **Release the FasterTransformer 5.2** +- Support min length penalty + +Nov 2022 +- Support T5 Tensorflow 2 custom op. +- Support T5 MoE +- Support WeNet +- Support BART & mBART +- Support SwinV2 +- Initial support for w8a8 int8 mode with GPT (preview) +- Support fused mha in GPT + +Oct 2022 +- Support BLOOM + +Sep 2022 +- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt +- Support for IA3 adapting scheme in T5 + +Aug 2022 +- Support returning context tokens embeddings in GPT +- **Release the FasterTransformer 5.1** +- Support for interactive generation +- Support for attention time-limited memory +- Support mt5 and t5-v1.1 + +July 2022 +- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) + - Fix bug of T5 under bfloat16. +- Add ViT INT8 TensorRT Plugin +- Support batch sampling +- Support shared context optimization in GPT model + +June 2022 +- Support streaming generation for triton backend. +- Support OPT. +- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. + +May 2022 +- Support bfloat16 on most models. +- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. +- Support GPT-NeoX. + - epsilon value used in layernorm is now a parameter + - rotary embedding GPT-NeoX style (only GPT-J was implemented) + - load per-GPU layernorm and bias parameters + - weight conversion from EleutherAI checkpoint + +April 2022 +- **Release the FasterTransformer 5.0** + - Change the default accumulation type of all gemm to FP32. + - Support bfloat16 inference in GPT model. + - Support Nemo Megatron T5 and Megatron-LM T5 model. + - Support ViT. + +March 2022 +- Support `stop_ids` and `ban_bad_ids` in GPT-J. +- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. + +February 2022 +- Support Swin Transformer. +- Optimize the k/v cache update of beam search by in-direction buffer. +- Support runtime input for GPT-J, T5 and GPT. +- Support soft prompt in GPT and GPT-J. +- Support custom all reduce kernel. + - Limitation: + 1. Only support tensor parallel size = 8 on DGX-A100. + 2. Only support CUDA with cudaMallocAsync. + +December 2021 +- Add TensorRT plugin of T5 model. +- Change some hyper-parameters of GPT model to runtime query. +- Optimize the memory allocator under C++ code. +- Fix bug of CUB including when using CUDA 11.5 or newer version. + +November 2021 +- **Update the FasterTransformer 5.0 beta** +- Add GPT-3 INT8 weight only qauntization for batch size <= 2. +- Support multi-node multi-gpu support on T5. +- Enhance the multi-node multi-gpu supporting in GPT-3. + +August 2021 +- **Release the FasterTransformer 5.0 beta** + - Refactor the repo and codes + - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. + - Bugs fix + - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. + - Fix memory leak that occurs every forward because of reused allocator. + - Fix race condition that occurs in repetition penalty kernel. + - Enhancement + - Add random seed setting. + - Fix GEMM buffer overflow on FP16 of GPT. + - Change to invalidate finished buffer for every completion. + - Introduce stop_before for early stop. + - Support Longformer. + - Rename `layer_para` to `pipeline_para`. 
+ - Optimize the sorting of top p sampling. + - Support sparsity for Ampere GPUs on BERT. + - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. + - Support multi-node inference for GPT Triton backend. + +June 2021 +- Support XLNet + +April 2021 +- **Release the FasterTransformer 4.0** + - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. + - Support single node, multi-gpus inference for GPT model on triton. + - Add the int8 fused multi-head attention kernel for bert. + - Add the FP16 fused multi-head attention kernel of V100 for bert. + - Optimize the kernel of decoder. + - Move to independent repo. + - Eager mode PyTorch extension is deprecated. + +Dec 2020 +- **Release the FasterTransformer 3.1** + - Optimize the decoding by adding the finisehd mask to prevent useless computing. + - Support opennmt encoder. + - Remove the TensorRT plugin supporting. + - TorchScript custom op is deprecated. + +Nov 2020 +- Optimize the INT8 inference. +- Support PyTorch INT8 inference. +- Provide PyTorch INT8 quantiztion tools. +- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. +- Add unit test of SQuAD. +- Update the missed NGC checkpoints. + +Sep 2020 +- Support GPT2 +- **Release the FasterTransformer 3.0** + - Support INT8 quantization of encoder of cpp and TensorFlow op. + - Add bert-tf-quantization tool. + - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. + +Aug 2020 +- Fix the bug of trt plugin. + +June 2020 +- **Release the FasterTransformer 2.1** + - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. + - Optimize the beam search kernels. + - Add PyTorch op supporting + +May 2020 +- Fix the bug that seq_len of encoder must be larger than 3. +- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. +- Modifying the method of loading model in `translate_sample.py`. + +April 2020 +- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` +- Add DiverseSiblingsSearch for decoding. +- Add sampling into Decoding + - The implementation is in the `decoding_sampling.h` + - Add top_k sampling, top_p sampling for decoding. +- Refactor the tensorflow custom op codes. + - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` + - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` + - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` +- Fix the bugs of finalize function decoding.py. +- Fix the bug of tf DiverseSiblingSearch. +- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. +- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. +- Add dynamic batch size and dynamic sequence length features into all ops. + +March 2020 +- Add feature in FasterTransformer 2.0 + - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. +- Fix bugs of Fastertransformer 2.0 + - Fix the bug of maximum sequence length of decoder cannot be larger than 128. + - Fix the bug that decoding does not check finish or not after each step. + - Fix the bug of decoder about max_seq_len. + - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+ - Add a layer normalization layer after decoder. + - Add a normalization for inputs of decoder + +February 2020 +- **Release the FasterTransformer 2.0** + - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. + - Refine the sample codes of encoder. + - Add dynamic batch size feature into encoder op. + +July 2019 +- **Release the FasterTransformer 1.0** + - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. + +### Known issues + +- Cannot compile on tensorflow 2.10 due to undefined symbol issue. +- Undefined symbol errors when import the extension + - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. +- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. +- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/README.md b/README.md index a00e0d631..72735e507 100644 --- a/README.md +++ b/README.md @@ -1,417 +1,15 @@ -# FasterTransformer +# FasterTransformer for SaumsungCEChallenge -This repository provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. +Check out FasterTransformer [README.md](FasterTransformerReadME.md) -## Table Of Contents +## Installation -- [FasterTransformer](#fastertransformer) - - [Table Of Contents](#table-of-contents) - - [Model overview](#model-overview) - - [Support matrix](#support-matrix) - - [Advanced](#advanced) - - [Global Environment](#global-environment) - - [Performance](#performance) - - [BERT base performance](#bert-base-performance) - - [BERT base performances of FasterTransformer new features](#bert-base-performances-of-fastertransformer-new-features) - - [BERT base performance on TensorFlow](#bert-base-performance-on-tensorflow) - - [BERT base performance on PyTorch](#bert-base-performance-on-pytorch) - - [Decoding and Decoder performance](#decoding-and-decoder-performance) - - [Decoder and Decoding end-to-end translation performance on TensorFlow](#decoder-and-decoding-end-to-end-translation-performance-on-tensorflow) - - [Decoder and Decoding end-to-end translation performance on PyTorch](#decoder-and-decoding-end-to-end-translation-performance-on-pytorch) - - [GPT performance](#gpt-performance) - - [Release notes](#release-notes) - - [Changelog](#changelog) - - [Known issues](#known-issues) - -## Model overview - -In NLP, encoder and decoder are two important components, with the transformer layer becoming a popular architecture for both components. FasterTransformer implements a highly optimized transformer layer for both the encoder and decoder for inference. On Volta, Turing and Ampere GPUs, the computing power of Tensor Cores are used automatically when the precision of the data and weights are FP16. - -FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide at least one API of the following frameworks: TensorFlow, PyTorch and Triton backend. Users can integrate FasterTransformer into these frameworks directly. 
For supporting frameworks, we also provide example codes to demonstrate how to use, and show the performance on these frameworks. - -### Support matrix - -| Models | Framework | FP16 | INT8 (after Turing) | Sparsity (after Ampere) | Tensor parallel | Pipeline parallel | FP8 (after Hopper) | -| ---------------- | -------------- | ---- | ------------------- | ----------------------- | --------------- | ----------------- | ------------------ | -| BERT | TensorFlow | Yes | Yes | - | - | - | - | -| BERT | PyTorch | Yes | Yes | Yes | Yes | Yes | - | -| BERT | Triton backend | Yes | - | - | Yes | Yes | - | -| BERT | C++ | Yes | Yes | - | - | - | Yes | -| XLNet | C++ | Yes | - | - | - | - | - | -| Encoder | TensorFlow | Yes | Yes | - | - | - | - | -| Encoder | PyTorch | Yes | Yes | Yes | - | - | - | -| Decoder | TensorFlow | Yes | - | - | - | - | - | -| Decoder | PyTorch | Yes | - | - | - | - | - | -| Decoding | TensorFlow | Yes | - | - | - | - | - | -| Decoding | PyTorch | Yes | - | - | - | - | - | -| GPT | TensorFlow | Yes | - | - | - | - | - | -| GPT/OPT | PyTorch | Yes | - | - | Yes | Yes | Yes | -| GPT/OPT | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | PyTorch | Yes | - | - | Yes | Yes | - | -| BLOOM | Triton backend | Yes | - | - | Yes | Yes | - | -| GPT-J | Triton backend | Yes | - | - | Yes | Yes | - | -| Longformer | PyTorch | Yes | - | - | - | - | - | -| T5/UL2 | PyTorch | Yes | - | - | Yes | Yes | - | -| T5 | TensorFlow 2 | Yes | - | - | - | - | - | -| T5/UL2 | Triton backend | Yes | - | - | Yes | Yes | - | -| T5 | TensorRT | Yes | - | - | Yes | Yes | - | -| T5-MoE | PyTorch | Yes | - | - | Yes | Yes | - | -| Swin Transformer | PyTorch | Yes | Yes | - | - | - | - | -| Swin Transformer | TensorRT | Yes | Yes | - | - | - | - | -| ViT | PyTorch | Yes | Yes | - | - | - | - | -| ViT | TensorRT | Yes | Yes | - | - | - | - | -| GPT-NeoX | PyTorch | Yes | - | - | Yes | Yes | - | -| GPT-NeoX | Triton backend | Yes | - | - | Yes | Yes | - | -| BART/mBART | PyTorch | Yes | - | - | Yes | Yes | - | -| WeNet | C++ | Yes | - | - | - | - | - | -| DeBERTa | TensorFlow 2 | Yes | - | - | On-going | On-going | - | -| DeBERTa | PyTorch | Yes | - | - | On-going | On-going | - | - -* Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. - -More details of specific models are put in `xxx_guide.md` of [`docs/`](docs), where `xxx` means the model name. Some common questions and the respective answers are put in [`docs/QAList.md`](docs/QAList.md). Note that the model of Encoder and BERT are similar and we put the explanation into `bert_guide.md` together. - -## Advanced - -The following code lists the directory structure of FasterTransformer: ``` -/src/fastertransformer: source code of FasterTransformer - |--/cutlass_extensions: Implementation of cutlass gemm/kernels. - |--/kernels: CUDA kernels for different models/layers and operations, like addBiasResiual. - |--/layers: Implementation of layer modules, like attention layer, ffn layer. - |--/models: Implementation of different models, like BERT, GPT. - |--/tensorrt_plugin: encapluate FasterTransformer into TensorRT plugin. 
- |--/tf_op: custom Tensorflow OP implementation - |--/th_op: custom PyTorch OP implementation - |--/triton_backend: custom triton backend implementation - |--/utils: Contains common cuda utils, like cublasMMWrapper, memory_utils -/examples: C++, tensorflow and pytorch interface examples - |--/cpp: C++ interface examples - |--/pytorch: PyTorch OP examples - |--/tensorflow: TensorFlow OP examples - |--/tensorrt: TensorRT examples -/docs: Documents to explain the details of implementation of different models, and show the benchmark -/benchmark: Contains the scripts to run the benchmarks of different models -/tests: Unit tests -/templates: Documents to explain how to add a new model/example into FasterTransformer repo -``` - -Note that many folders contains many sub-folders to split different models. Quantization tools are move to `examples`, like `examples/tensorflow/bert/bert-quantization/` and `examples/pytorch/bert/bert-quantization-sparsity/`. - - -### Global Environment - -FasterTransformer provides some convenient environment variables for debuging and testing. - -1. `FT_LOG_LEVEL`: This environment controls the log level of debug messae. More details are in `src/fastertransformer/utils/logger.h`. Note that the program will print lots of message when the level is lower than `DEBUG` and the program would become very slow. -2. `FT_NVTX`: If it is set to be `ON` like `FT_NVTX=ON ./bin/gpt_example`, the program will insert tha tag of nvtx to help profiling the program. -3. `FT_DEBUG_LEVEL`: If it is set to be `DEBUG`, then the program will run `cudaDeviceSynchronize()` after every kernels. Otherwise, the kernel is executued asynchronously by default. It is helpful to locate the error point during debuging. But this flag affects the performance of program significantly. So, it should be used only for debuging. - -## Performance - -Hardware settings: - -* 8xA100-80GBs (with mclk 1593MHz, pclk 1410MHz) with AMD EPYC 7742 64-Core Processor -* T4 (with mclk 5000MHz, pclk 1590MHz) with Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz - -In order to run the following benchmark, we need to install the unix computing tool "bc" by - -```bash -apt-get install bc +mkdir -p FasterTransformer/build +cd FasterTransformer/build +git submodule init && git submodule update +cmake -DSM=70 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. +make -j32 ``` -### BERT base performance - -The FP16 results of TensorFlow were obtained by running the `benchmarks/bert/tf_benchmark.sh`. - -The INT8 results of TensorFlow were obtained by running the `benchmarks/bert/tf_int8_benchmark.sh`. - -The FP16 results of PyTorch were obtained by running the `benchmarks/bert/pyt_benchmark.sh`. - -The INT8 results of PyTorch were obtained by running the `benchmarks/bert/pyt_int8_benchmark.sh`. - -More benchmarks are put in [`docs/bert_guide.md`](docs/bert_guide.md#bert-performance). - -#### BERT base performances of FasterTransformer new features - -The following figure compares the performances of different features of FasterTransformer and FasterTransformer under FP16 on T4. - -For large batch size and sequence length, both EFF-FT and FT-INT8-v2 bring about 2x speedup. Using Effective FasterTransformer and int8v2 at the same time can bring about 3.5x speedup compared to FasterTransformer FP16 for large case. - -
- -#### BERT base performance on TensorFlow - -The following figure compares the performances of different features of FasterTransformer and TensorFlow XLA under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer can bring about 3x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -#### BERT base performance on PyTorch - -The following figure compares the performances of different features of FasterTransformer and PyTorch TorchScript under FP16 on T4. - -For small batch size and sequence length, using FasterTransformer CustomExt can bring about 4x ~ 6x speedup. - -For large batch size and sequence length, using Effective FasterTransformer with INT8-v2 quantization can bring about 5x speedup. - -
- -### Decoding and Decoder performance - -The results of TensorFlow were obtained by running the `benchmarks/decoding/tf_decoding_beamsearch_benchmark.sh` and `benchmarks/decoding/tf_decoding_sampling_benchmark.sh` - -The results of PyTorch were obtained by running the `benchmarks/decoding/pyt_decoding_beamsearch_benchmark.sh`. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 8 -* size_per_head = 64 -* num_layers = 6 for both encoder and decoder -* vocabulary_size = 32001 for TensorFlow sample codes, 31538 for PyTorch sample codes -* memory_hidden_dim = 512 -* max sequenc elength = 128 - -More benchmarks are put in [`docs/decoder_guide.md`](docs/decoder_guide.md#decoding-performance). - -#### Decoder and Decoding end-to-end translation performance on TensorFlow - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to TensorFlow under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to TensorFlow, FT-Decoder provides 1.5x ~ 3x speedup; while FT-Decoding provides 4x ~ 18x speedup. - -
- -#### Decoder and Decoding end-to-end translation performance on PyTorch - -The following figure shows the speedup of of FT-Decoder op and FT-Decoding op compared to PyTorch under FP16 with T4. Here, we use the throughput of translating a test set to prevent the total tokens of each methods may be different. Compared to PyTorch, FT-Decoder provides 1.2x ~ 3x speedup; while FT-Decoding provides 3.8x ~ 13x speedup. - -
- -### GPT performance - -The following figure compares the performances of Megatron and FasterTransformer under FP16 on A100. - -In the experiments of decoding, we updated the following parameters: - -* head_num = 96 -* size_per_head = 128 -* num_layers = 48 for GPT-89B model, 96 for GPT-175B model -* data_type = FP16 -* vocab_size = 51200 -* top_p = 0.9 -* tensor parallel size = 8 -* input sequence length = 512 -* output sequence length = 32 - -
- -## Release notes - -### Changelog - -May 2023 -- Fix bugs of generation early stopping - -January 2023 -- Support GPT MoE -- Support FP8 for Bert and GPT (**Experimental**) -- Support DeBERTa on TensorFlow 2 and PyTorch - -Dec 2022 -- **Release the FasterTransformer 5.2** -- Support min length penalty - -Nov 2022 -- Support T5 Tensorflow 2 custom op. -- Support T5 MoE -- Support WeNet -- Support BART & mBART -- Support SwinV2 -- Initial support for w8a8 int8 mode with GPT (preview) -- Support fused mha in GPT - -Oct 2022 -- Support BLOOM - -Sep 2022 -- Support factual sampling ([link](https://arxiv.org/pdf/2206.04624.pdf)) in gpt -- Support for IA3 adapting scheme in T5 - -Aug 2022 -- Support returning context tokens embeddings in GPT -- **Release the FasterTransformer 5.1** -- Support for interactive generation -- Support for attention time-limited memory -- Support mt5 and t5-v1.1 - -July 2022 -- Support UL2 huggingface ckpt. ([link](https://huggingface.co/google/ul2)) - - Fix bug of T5 under bfloat16. -- Add ViT INT8 TensorRT Plugin -- Support batch sampling -- Support shared context optimization in GPT model - -June 2022 -- Support streaming generation for triton backend. -- Support OPT. -- Support multi-node multi-GPU BERT under FP32, FP16 and BF16. - -May 2022 -- Support bfloat16 on most models. -- Support [prefix-prompt](https://arxiv.org/pdf/2101.00190.pdf) for GPT-J. -- Support GPT-NeoX. - - epsilon value used in layernorm is now a parameter - - rotary embedding GPT-NeoX style (only GPT-J was implemented) - - load per-GPU layernorm and bias parameters - - weight conversion from EleutherAI checkpoint - -April 2022 -- **Release the FasterTransformer 5.0** - - Change the default accumulation type of all gemm to FP32. - - Support bfloat16 inference in GPT model. - - Support Nemo Megatron T5 and Megatron-LM T5 model. - - Support ViT. - -March 2022 -- Support `stop_ids` and `ban_bad_ids` in GPT-J. -- Support dynamice `start_id` and `end_id` in GPT-J, GPT, T5 and Decoding. - -February 2022 -- Support Swin Transformer. -- Optimize the k/v cache update of beam search by in-direction buffer. -- Support runtime input for GPT-J, T5 and GPT. -- Support soft prompt in GPT and GPT-J. -- Support custom all reduce kernel. - - Limitation: - 1. Only support tensor parallel size = 8 on DGX-A100. - 2. Only support CUDA with cudaMallocAsync. - -December 2021 -- Add TensorRT plugin of T5 model. -- Change some hyper-parameters of GPT model to runtime query. -- Optimize the memory allocator under C++ code. -- Fix bug of CUB including when using CUDA 11.5 or newer version. - -November 2021 -- **Update the FasterTransformer 5.0 beta** -- Add GPT-3 INT8 weight only qauntization for batch size <= 2. -- Support multi-node multi-gpu support on T5. -- Enhance the multi-node multi-gpu supporting in GPT-3. - -August 2021 -- **Release the FasterTransformer 5.0 beta** - - Refactor the repo and codes - - And special thanks to NAVER Corp. for contributing a lot to this version, as listed below. - - Bugs fix - - Fix error that occurs when batch_size is less than max_batch_size for gpt pytorch wrapper. - - Fix memory leak that occurs every forward because of reused allocator. - - Fix race condition that occurs in repetition penalty kernel. - - Enhancement - - Add random seed setting. - - Fix GEMM buffer overflow on FP16 of GPT. - - Change to invalidate finished buffer for every completion. - - Introduce stop_before for early stop. - - Support Longformer. - - Rename `layer_para` to `pipeline_para`. 
- - Optimize the sorting of top p sampling. - - Support sparsity for Ampere GPUs on BERT. - - Support `size_per_head` 96, 160, 192, 224, 256 for GPT model. - - Support multi-node inference for GPT Triton backend. - -June 2021 -- Support XLNet - -April 2021 -- **Release the FasterTransformer 4.0** - - Support multi-gpus and multi-nodes inference for GPT model on C++ and PyTorch. - - Support single node, multi-gpus inference for GPT model on triton. - - Add the int8 fused multi-head attention kernel for bert. - - Add the FP16 fused multi-head attention kernel of V100 for bert. - - Optimize the kernel of decoder. - - Move to independent repo. - - Eager mode PyTorch extension is deprecated. - -Dec 2020 -- **Release the FasterTransformer 3.1** - - Optimize the decoding by adding the finisehd mask to prevent useless computing. - - Support opennmt encoder. - - Remove the TensorRT plugin supporting. - - TorchScript custom op is deprecated. - -Nov 2020 -- Optimize the INT8 inference. -- Support PyTorch INT8 inference. -- Provide PyTorch INT8 quantiztion tools. -- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer. -- Add unit test of SQuAD. -- Update the missed NGC checkpoints. - -Sep 2020 -- Support GPT2 -- **Release the FasterTransformer 3.0** - - Support INT8 quantization of encoder of cpp and TensorFlow op. - - Add bert-tf-quantization tool. - - Fix the issue that Cmake 15 or Cmake 16 fail to build this project. - -Aug 2020 -- Fix the bug of trt plugin. - -June 2020 -- **Release the FasterTransformer 2.1** - - Add Effective FasterTransformer based on the idea of [Effective Transformer](https://github.com/bytedance/effective_transformer) idea. - - Optimize the beam search kernels. - - Add PyTorch op supporting - -May 2020 -- Fix the bug that seq_len of encoder must be larger than 3. -- Add the position_encoding of decoding as the input of FasterTransformer decoding. This is convenient to use different types of position encoding. FasterTransformer does not compute the position encoding value, but only lookup the table. -- Modifying the method of loading model in `translate_sample.py`. - -April 2020 -- Rename `decoding_opennmt.h` to `decoding_beamsearch.h` -- Add DiverseSiblingsSearch for decoding. -- Add sampling into Decoding - - The implementation is in the `decoding_sampling.h` - - Add top_k sampling, top_p sampling for decoding. -- Refactor the tensorflow custom op codes. - - Merge `bert_transformer_op.h`, `bert_transformer_op.cu.cc` into `bert_transformer_op.cc` - - Merge `decoder.h`, `decoder.cu.cc` into `decoder.cc` - - Merge `decoding_beamsearch.h`, `decoding_beamsearch.cu.cc` into `decoding_beamsearch.cc` -- Fix the bugs of finalize function decoding.py. -- Fix the bug of tf DiverseSiblingSearch. -- Add BLEU scorer `bleu_score.py` into `utils`. Note that the BLEU score requires python3. -- Fuse QKV Gemm of encoder and masked_multi_head_attention of decoder. -- Add dynamic batch size and dynamic sequence length features into all ops. - -March 2020 -- Add feature in FasterTransformer 2.0 - - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf. -- Fix bugs of Fastertransformer 2.0 - - Fix the bug of maximum sequence length of decoder cannot be larger than 128. - - Fix the bug that decoding does not check finish or not after each step. - - Fix the bug of decoder about max_seq_len. - - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
- - Add a layer normalization layer after decoder. - - Add a normalization for inputs of decoder - -February 2020 -- **Release the FasterTransformer 2.0** - - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. - - Refine the sample codes of encoder. - - Add dynamic batch size feature into encoder op. - -July 2019 -- **Release the FasterTransformer 1.0** - - Provide a highly optimized bert equivalent transformer layer, including C++ API, TensorFlow op and TensorRT plugin. - -### Known issues - -- Cannot compile on tensorflow 2.10 due to undefined symbol issue. -- Undefined symbol errors when import the extension - - Please `import torch` first. If this has been done, it is due to the incompatible C++ ABI. You may need to check the PyTorch used during compilation and execution are the same, or you need to check how your PyTorch is compiled, or the version of your GCC, etc. -- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. -- If encounter some problem in the custom environment, try to use the gcc/g++ 4.8 to build the project of TensorFlow op, especially for TensorFlow 1.14. diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index da24d72c6..800dfdd7f 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(wenet) add_subdirectory(gptj) add_subdirectory(gptneox) add_subdirectory(multi_gpu_gpt) +#add_subdirectory(llama) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 000000000..19fb6e7fc --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +add_library(llama_example_utils STATIC llama_example_utils.cc) +target_link_libraries(llama_example_utils PUBLIC -lcublas -lcublasLt -lcudart + nvtx_utils mpi_utils nccl_utils) + +add_executable(llama_example llama_example.cc) +target_link_libraries(llama_example PUBLIC -lcublas -lcublasLt -lcudart + LLaMA mpi_utils nccl_utils nvtx_utils + llama_example_utils word_list) diff --git a/examples/cpp/llama/bad_words.csv b/examples/cpp/llama/bad_words.csv new file mode 100644 index 000000000..6a1126ebd --- /dev/null +++ b/examples/cpp/llama/bad_words.csv @@ -0,0 +1,2 @@ +7768,3908 +1,2 diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file mode 100644 index 000000000..3df66269f --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,21 @@ +[ft_instance_hyperparameter] +model_name=llama_33B +model_dir=../models/llama +data_type=fp16 +pipeline_para_size=4 + + +[request] +request_batch_size=32 +start_pos=2 + +[llama_33B] +head_num=52 +size_per_head=128 +vocab_size=32000 +decoder_layers=60 +rotary_embedding=128 +multiple_of=256 +max_seq_len=1024 +padding_id=0 +random_seed=0 diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc new file mode 100644 index 000000000..3065d4873 --- /dev/null +++ b/examples/cpp/llama/llama_example.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "3rdparty/INIReader.h" +#include "examples/cpp/llama/llama_example_utils.h" +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +#include +#include +#include +#include +#include +#include + +using namespace fastertransformer; + +template +void llama_example(const INIReader reader); + +int main(int argc, char* argv[]) +{ + mpi::initialize(&argc, &argv); + srand(0); + + std::string ini_name; + if (argc == 2) { + ini_name = std::string(argv[1]); + } + else { + ini_name = "../examples/cpp/llama/llama_config.ini"; + } + + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + return -1; + } + const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); + + if (data_type == "fp32") { + llama_example(reader); + } + else if (data_type == "fp16") { + llama_example(reader); + } + else { + FT_LOG_ERROR("is_fp16 should be 0 (use float) or 1 (use half)."); + return -1; + } + mpi::finalize(); + return 0; +} + +template +void llama_example(const INIReader reader) +{ + const std::string model_name = reader.Get("ft_instance_hyperparameter", "model_name"); + std::string model_dir = std::string(reader.Get("ft_instance_hyperparameter", "model_dir")); + int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + + const size_t head_num = reader.GetInteger(model_name, "head_num"); + const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); + const size_t vocab_size = reader.GetInteger(model_name, "vocab_size"); + const size_t decoder_layers = reader.GetInteger(model_name, "decoder_layers"); + const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding"); + const int multiple_of = reader.GetInteger(model_name, "multiple_of"); + const size_t max_seq_len = reader.GetInteger(model_name, "max_seq_len"); + + const size_t hidden_units = head_num * size_per_head; + const size_t inter_size = multiple_of * (((8 * hidden_units / 3) + multiple_of - 1) / multiple_of); + + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const int padding_id = reader.GetInteger(model_name, "padding_id"); + int start_pos = reader.GetInteger("request", "start_pos", 0); + unsigned long long random_seed = reader.GetInteger("request", "random_seed", 0); + + FT_CHECK(decoder_layers % pipeline_para_size == 0); + + // Prepare the parallelism parameters + int rank = mpi::getCommWorldRank(); + int world_size = mpi::getCommWorldSize(); + if (rank == 0) { + printf("Total ranks: %d.\n", world_size); + } + int device, device_count; + check_cuda_error(cudaGetDeviceCount(&device_count)); + check_cuda_error(cudaSetDevice(rank % device_count)); + check_cuda_error(cudaGetDevice(&device)); + + struct cudaDeviceProp prop; + check_cuda_error(cudaGetDeviceProperties(&prop, device)); + printf("Device %s\n", prop.name); + + printf("P%d is running with GPU #%d.\n", rank, device); + if (pipeline_para_size != world_size) { + printf("[ERROR] pipeline_para_size should equal to world_size \n"); + exit(-1); + } + + const int layers_per_group = decoder_layers / pipeline_para_size; + if (layers_per_group * pipeline_para_size != (int)decoder_layers) { + printf("[ERROR] layers_per_group (%d) * pipeline_para_size (%d) should equal 
to decoder_layers (%ld) \n", + layers_per_group, + pipeline_para_size, + decoder_layers); + exit(-1); + } + + NcclParam tensor_para; + NcclParam pipeline_para; + ftNcclInitialize(tensor_para, pipeline_para, 1, pipeline_para_size); + + // Read ids of request from file. + size_t max_input_len = -1; + std::vector v_start_lengths; + std::vector v_start_ids; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + padding_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + + int* d_input_ids; + int* d_input_lengths; + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + d_input_lengths = nullptr; + } + else { + // conditional case. + deviceMalloc(&d_input_ids, request_batch_size * max_input_len, false); + deviceMalloc(&d_input_lengths, request_batch_size, false); + cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len); + cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size); + } + + const int total_output_len = max_input_len; + + cudaStream_t stream; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublaslt_handle; + cudaStreamCreate(&stream); + cublasCreate(&cublas_handle); + cublasLtCreate(&cublaslt_handle); + cublasSetStream(cublas_handle, stream); + cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in"); + + Allocator allocator(getDevice()); + + std::mutex* cublas_wrapper_mutex = new std::mutex(); + cublasMMWrapper cublas_wrapper = + cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); + if (std::is_same::value) { + cublas_wrapper.setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper.setFP32GemmConfig(); + } + + fastertransformer::LLaMAWeight llama_weights( + hidden_units, inter_size, vocab_size, decoder_layers, pipeline_para.world_size_, pipeline_para.rank_); + + model_dir = model_dir + "/" + std::to_string(tensor_para.world_size_) + "-gpu"; + llama_weights.loadModel(model_dir); + + if (world_size > 1) { + mpi::bcast(&random_seed, 1, mpi::MPI_TYPE_UNSIGNED_LONG_LONG, 0, mpi::COMM_WORLD); + } + + AttentionType attention_type = + getAttentionType(size_per_head, + getSMVersion(), + !((std::getenv("SHONG_PADDING") != nullptr) + && (std::string(std::getenv("SHONG_PADDING")) == "ON")), // true, // remove_padding + 0, // llama supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + + switch (attention_type) { + case AttentionType::UNFUSED_MHA: + std::cout << "UNFUSED_MHA\n"; + break; + case AttentionType::UNFUSED_PADDED_MHA: + std::cout << "UNFUSED_PADDED_MHA\n"; + break; + case AttentionType::FUSED_MHA: + std::cout << "FUSED_MHA\n"; + break; + case AttentionType::FUSED_PADDED_MHA: + std::cout << "FUSED_PADDED_MHA\n"; + break; + } + + LLaMA llama = LLaMA(head_num, + size_per_head, + inter_size, + decoder_layers, + vocab_size, + rotary_embedding_dim, + random_seed, + max_seq_len, + tensor_para, + pipeline_para, + stream, + &cublas_wrapper, + &allocator, + false, // is_free_buffer_after_forward + &prop, + attention_type); + + float* d_output_logits; + deviceMalloc(&d_output_logits, request_batch_size * total_output_len * vocab_size, false); + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + Tensor{MEMORY_GPU, TYPE_INT32, std::vector{request_batch_size, (size_t)max_input_len}, d_input_ids}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, 
std::vector{request_batch_size}, d_input_lengths}}, + {"start_pos", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{1}, &start_pos}}}; + + std::unordered_map output_tensors = std::unordered_map{ + {"output_logits", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, (size_t)total_output_len, vocab_size}, + d_output_logits}}}; + + print_mem_usage(); + + int ite = 1; + cudaDeviceSynchronize(); + mpi::barrier(); + + // warm up + ite = 1; + ft_nvtx::setScope("warmup_time"); + PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + + /* + if (rank == world_size - 1) { + float* out = (float*)malloc(sizeof(float) * request_batch_size * total_output_len * vocab_size); + cudaMemcpy(out, + d_output_logits, + sizeof(float) * request_batch_size * total_output_len * vocab_size, + cudaMemcpyDeviceToHost + ); + for (int b = 0; b < request_batch_size; ++b) { + std::cout << "["; + for (int s = 0; s < total_output_len; ++s) { + std::cout << "["; + for (int v = vocab_size - 8; v < vocab_size; ++v) { + std::cout << out[b * total_output_len * vocab_size + s * vocab_size + v] << " "; + } + std::cout << "]\n"; + } + std::cout << "]\n"; + } + std::cout << "\n"; + free(out); + } + */ + + // test time + cudaProfilerStart(); + struct timeval start, end; + cudaDeviceSynchronize(); + mpi::barrier(); + + gettimeofday(&start, NULL); + + ft_nvtx::setScope("total_time"); + PUSH_RANGE("total time") + // warm up + ite = 10; + for (int i = 0; i < ite; ++i) { + llama.forward(&output_tensors, &input_tensors, &llama_weights); + } + + cudaDeviceSynchronize(); + mpi::barrier(); + + POP_RANGE; + ft_nvtx::resetScope(); + gettimeofday(&end, NULL); + cudaProfilerStop(); + + printf("[INFO] request_batch_size %ld head_num %ld size_per_head %ld total_output_len %d" + " decoder_layers %ld vocab_size %ld FT-CPP-decoding-beamsearch-time %.2f ms\n", + request_batch_size, + head_num, + size_per_head, + total_output_len, + decoder_layers, + vocab_size, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + + ftNcclParamDestroy(tensor_para); + ftNcclParamDestroy(pipeline_para); + + delete cublas_algo_map; + delete cublas_wrapper_mutex; + + if (d_input_ids != nullptr) { + cudaFree(d_input_ids); + } + if (d_input_lengths != nullptr) { + cudaFree(d_input_lengths); + } + if (d_output_logits != nullptr) { + deviceFree(d_output_logits); + } + + return; +} diff --git a/examples/cpp/llama/llama_example_utils.cc b/examples/cpp/llama/llama_example_utils.cc new file mode 100644 index 000000000..d6db80856 --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "examples/cpp/llama/llama_example_utils.h" + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int padding_id, + const int beam_width, + std::string file_name) +{ + std::vector> tmp_start_ids; + std::vector tmp_start_lengths; + + std::ifstream start_id_file(file_name, std::ios::in); + int line_num = 0; + if (start_id_file.is_open()) { + std::string line; + while (std::getline(start_id_file, line)) { + std::stringstream lineStream(line); + std::string vals; + int i1 = 0; + std::vector tmp_vec; + while (std::getline(lineStream, vals, ',')) { + tmp_vec.push_back(std::stoi(vals)); + i1++; + } + tmp_start_ids.push_back(tmp_vec); + tmp_start_lengths.push_back(i1); + line_num++; + } + if (batch_size == 0) { + batch_size = line_num; + } + } + else { + printf("[WARNING] Cannot open the file '%s'. \n", file_name.c_str()); + max_input_len = 0; + return 0; + } + + max_input_len = tmp_start_lengths.data()[0]; + for (uint i = 1; i < (uint)tmp_start_lengths.size(); i++) { + max_input_len = max_input_len > tmp_start_lengths.data()[i] ? max_input_len : tmp_start_lengths.data()[i]; + } + + while ((int)tmp_start_lengths.size() < batch_size) { + std::vector padding_ids; + for (int i = 0; i < max_input_len; i++) { + padding_ids.push_back(padding_id); + } + tmp_start_ids.push_back(padding_ids); + tmp_start_lengths.push_back(max_input_len); + } + + // Add padding + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { + tmp_start_ids[i].push_back(padding_id); + } + } + + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int b = 0; b < beam_width; b++) { + for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) { + v_start_ids->push_back(tmp_start_ids[i][j]); + } + v_start_lengths->push_back(tmp_start_lengths[i]); + } + } + return batch_size; +} + +} // namespace fastertransformer diff --git a/examples/cpp/llama/llama_example_utils.h b/examples/cpp/llama/llama_example_utils.h new file mode 100644 index 000000000..1e5d0b9ab --- /dev/null +++ b/examples/cpp/llama/llama_example_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace fastertransformer { + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name); + +} // namespace fastertransformer diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 000000000..58bc4b4f6 --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,32 @@ +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 432, 809, 295, 719, 27372, 29889, 7605, 263, 286, 789, 5941, 292, 10823, 363, 596, 3632, 331, 1943, 432, 809, 295, 719, 27372, 9522, 412, 29889, 360, 728, 29559, 411, 1395, 559, 29899, 7582, 1259, 4426, 508, 367, 1304, 29892, 541, 14383, 270, 728, 29559, 411, 4023, 845, 9418, 29899, 29890, 5761, 616, 4426, 408, 445, 508, 17820, 278, 8341, 1283, 432, 809, 295, 719, 28001 , 29889 +1, 3201, 955, 29901, 1128, 304, 679, 304, 413, 4442, 340, 29889, 315, 905, 263, 1513, 16286, 304, 413, 4442, 2165, 4799, 637, 313, 29926, 6547, 3300, 352, 13607, 6121, 4799, 637, 29897, 515, 29129, 1450, 470, 4655, 14721, 273, 14368, 29892, 1316, 408, 286, 348, 436, 29892, 301, 898, 265, 29892, 1226, 23559, 29892, 10395, 2429, 29892, 282, 1431, 434, 29892, 610, 275, 470, 7655, 1915, 29889, 1704, 26536, 3160, 3287, 1248, 728, 4799, 9012 +1, 8778, 322, 19906, 29901, 1128, 304, 12566, 330, 2390, 267, 515, 17564, 29879, 29889, 26428, 596, 2646, 412, 325, 1475, 411, 2691, 27716, 7787, 1259, 304, 12566, 278, 330, 2390, 267, 29889, 450, 27716, 881, 367, 1546, 29871, 29900, 29889, 29945, 304, 29871, 29900, 29889, 29947, 3533, 17528, 690, 313, 29900, 29889, 29900, 29906, 29900, 304, 29871, 29900, 29889, 29900, 29941, 29896, 297, 29897, 304, 12566, 278, 330, 2390, 267, 515, 285, 3687, 29892, 25550 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6548, 5520, 321, 29891, 295, 1161, 267, 322, 2989, 261, 321, 29891, 774, 798, 18180, 29889, 15154, 1207, 786, 4646, 368, 29889, 341, 6151, 2518, 322, 321, 29891, 774, 798, 9127, 29892, 2175, 373, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 975, 11147, 674, 18658, 1438, 15409, 2578, 25414, 322, 674, 5557, 321, 29891, 295, 1161, 267, 322, 321, 29891, 774, 5727, 515, 15678, 636 +1, 26040, 29901, 1128, 304, 1207, 3632, 331, 1943, 6635, 1634, 514, 296, 29889, 3462, 278, 18853, 17182, 304, 263, 805, 764, 18046, 280, 29889, 1152, 278, 1634, 514, 296, 29892, 366, 29915, 645, 817, 263, 29871, 29906, 29899, 21543, 313, 29945, 29929, 286, 29880, 29897, 12917, 805, 764, 18046, 280, 29889, 317, 802, 29872, 911, 29871, 29906, 4441, 567, 310, 454, 3712, 18853, 17182, 1919, 29871, 29906, 4441, 567, 310, 8775, 24841, 18853, 17182, 29892 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 7375, 322, 269, 473, 22300, 29889, 323, 2209, 278, 282, 548, 658, 262, 411, 278, 6501, 577, 29891, 12507, 346, 29889, 3462, 29871, 29945, 29871, 1309, 778, 313, 29896, 29946, 29906, 330, 29897, 310, 10814, 6393, 282, 548, 658, 262, 393, 29915, 29879, 1063, 5700, 297, 29871, 30515, 29899, 22466, 313, 29953, 29899, 4317, 29897, 12003, 10076, 567, 322, 29871, 29906, 734, 294, 1129, 787, 313, 29896 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 19417, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 373, 18230, 388, 29889, 29301, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 10754, 2909, 322, 1432, 325, 524, 482, 3438, 2017, 432, 809, 295, 719, 1856, 3268, 366, 508, 1284, 29889, 6280, 4447, 675, 7535, 411, 3785, 11949, 29892, 278, 664, 310, 
1532, 2998, 2874, 414, 29892, 12713, 22848, 29892, 1539, 1338, 29892, 25702 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 260, 4227, 29880, 1600, 384, 7901, 10412, 29889, 14542, 852, 263, 19875, 29891, 470, 1045, 3594, 260, 4227, 29880, 1600, 384, 363, 263, 901, 3209, 950, 1106, 29889, 319, 12003, 29892, 1302, 1537, 260, 4227, 29880, 1600, 384, 7901, 1008, 322, 263, 5101, 310, 1320, 1253, 287, 1444, 550, 338, 278, 4922, 982, 304, 7952, 14294, 373, 263, 11220, 4723, 355, 29889, 7357, 523, 2814, 470, 260 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 17545, 901, 330, 2390, 267, 29889, 3462, 330, 2390, 267, 304, 596, 4497, 328, 29889, 319, 5972, 322, 4780, 982, 304, 7910, 278, 5253, 310, 330, 2390, 267, 297, 596, 14218, 652, 300, 338, 304, 28189, 263, 2846, 8870, 1490, 330, 2390, 267, 373, 2246, 310, 263, 301, 3322, 29899, 272, 270, 2559, 814, 603, 4497, 328, 29889, 450, 14225, 21054, 272, 322, 7990, 18459, 310, 278, 330 +1, 15202, 29901, 1128, 304, 11039, 403, 18655, 1849, 964, 263, 9045, 29891, 26044, 29889, 8561, 263, 18655, 519, 885, 2572, 569, 363, 26044, 29889, 319, 18655, 519, 885, 2572, 569, 338, 263, 2560, 270, 728, 1754, 491, 872, 329, 29948, 292, 18655, 1849, 297, 263, 4091, 340, 7243, 322, 769, 4417, 367, 2579, 29808, 975, 963, 304, 4808, 278, 270, 728, 4208, 29889, 14893, 491, 4417, 738, 18655, 1849, 366, 763, 29892, 3704, 373, 1080 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 3632, 331, 1943, 9045, 29891, 3700, 471, 29882, 29889, 422, 26062, 599, 310, 278, 2348, 1127, 10070, 29889, 512, 263, 18350, 29899, 29879, 1891, 12580, 29880, 29892, 6837, 4208, 29871, 30226, 18002, 313, 29946, 29945, 330, 29897, 310, 29081, 288, 1446, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897, 10849, 454, 3712, 3623, 625, 29892, 29871, 30515, 18002, 313, 29945, 29929, 286, 29880, 29897 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 18655, 13956, 286, 1878, 8345, 8310, 29891, 29889, 5701, 1082, 278, 373, 291, 322, 286, 1878, 18901, 29889, 940, 271, 29871, 30226, 18002, 313, 29896, 29906, 29900, 286, 29880, 29897, 310, 4805, 29899, 2405, 5359, 288, 9258, 17182, 297, 263, 2919, 12507, 346, 8357, 975, 18350, 29899, 9812, 12871, 29889, 9038, 278, 17182, 528, 6727, 414, 29892, 788, 29871, 30226, 310, 263, 2319, 373, 291, 393 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6958, 10992, 2963, 29889, 15484, 263, 1246, 363, 278, 19075, 982, 304, 6159, 10992, 2963, 29889, 960, 366, 723, 763, 304, 505, 263, 2022, 29899, 517, 29899, 10532, 14983, 29892, 270, 616, 278, 2498, 297, 6578, 2722, 1196, 472, 29871, 29896, 29899, 29947, 29900, 29900, 29899, 29953, 29953, 29947, 29899, 29953, 29955, 29953, 29945, 29889, 2688, 29915, 276, 1722, 7398, 388, 304, 1424, 22394, 29871, 29955, 263 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 8267, 596, 321, 29891, 774, 5727, 411, 263, 8006, 272, 29889, 349, 27574, 263, 8006, 272, 1754, 10816, 363, 321, 29891, 774, 5727, 29889, 319, 3918, 8006, 272, 674, 451, 2367, 366, 13173, 3347, 29879, 29892, 322, 508, 367, 18215, 304, 671, 2978, 596, 5076, 29889, 8669, 20590, 385, 321, 29891, 774, 798, 8006, 272, 29892, 5069, 2319, 12995, 311, 674, 2367, 366, 278, 3347, 29879, 366, 13521 +1, 15202, 29901, 1128, 304, 5040, 23023, 1848, 321, 1218, 29889, 360, 8349, 7268, 403, 1546, 9128, 18757, 261, 322, 23023, 1848, 18757, 261, 29889, 1763, 18720, 278, 9946, 310, 596, 23023, 1848, 321, 1218, 29892, 372, 1122, 367, 5407, 304, 937, 2274, 746, 366, 526, 11223, 4824, 1711, 9074, 14793, 322, 746, 366, 526, 11223, 953, 8194, 635, 9074, 14793, 
29889, 26991, 29892, 23023, 1848, 18757, 261, 5304, 373, 11584, 322, 23880, 5065, 5362, 29889 +1, 4231, 749, 322, 15197, 29901, 1128, 304, 289, 5790, 1044, 4856, 29889, 7519, 29883, 403, 7535, 1048, 278, 289, 5790, 1044, 29889, 450, 289, 5790, 1044, 29892, 884, 2998, 408, 278, 23729, 1458, 19119, 9045, 1044, 313, 1579, 272, 1458, 1002, 1082, 16385, 29871, 29941, 29929, 29946, 29892, 760, 474, 29897, 338, 263, 4307, 393, 471, 4502, 304, 9801, 322, 1072, 5987, 11176, 14703, 19119, 9045, 14502, 5786, 363, 1906, 23164, 515, 263, 19119, 4486 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 263, 5613, 15774, 23895, 2017, 29889, 4007, 6967, 278, 2348, 1127, 10070, 29889, 1152, 445, 9522, 412, 29892, 366, 674, 817, 278, 1494, 4452, 584, 29871, 29896, 2894, 293, 274, 2559, 314, 265, 12070, 1919, 29871, 29906, 29945, 2894, 293, 5881, 314, 290, 2532, 29879, 1919, 29871, 29896, 29945, 2894, 293, 17184, 1960, 1919, 29871, 29896, 2894, 293, 1109, 2911, 17796, 1919, 29871, 29896, 10849, 2894, 293 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 471, 29882, 4105, 4841, 29889, 14542, 852, 278, 1492, 528, 314, 1129, 29877, 322, 4195, 261, 29889, 5806, 738, 528, 314, 1129, 29877, 470, 4195, 261, 674, 664, 29892, 372, 338, 2253, 304, 671, 2730, 391, 332, 5281, 528, 314, 1129, 359, 322, 4195, 414, 29892, 7148, 565, 596, 8716, 29886, 338, 15589, 322, 372, 23766, 29889, 3834, 9316, 1316, 408, 1183, 29874, 2730, 391, 545, 263, 1341 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 1207, 263, 19408, 413, 3780, 12343, 7539, 282, 5863, 29889, 10306, 278, 11994, 363, 3907, 413, 295, 1188, 29887, 29915, 29879, 19408, 413, 3780, 29886, 583, 2578, 1446, 2441, 29889, 1670, 674, 3117, 367, 1048, 4203, 263, 9853, 310, 19408, 413, 3780, 29886, 583, 7539, 2175, 29889, 313, 697, 310, 278, 2625, 23633, 310, 1641, 278, 7984, 29892, 338, 366, 679, 304, 17545, 738, 29915, 454, 29888 +1, 349, 1691, 322, 24980, 1338, 29901, 1128, 304, 260, 4003, 8343, 263, 2653, 23717, 29889, 402, 1624, 596, 28075, 29889, 887, 674, 817, 263, 29871, 29896, 29906, 21759, 269, 4316, 19144, 29892, 263, 4964, 14051, 495, 8343, 292, 260, 4003, 29892, 322, 263, 29871, 29896, 29953, 29899, 22466, 318, 276, 386, 1705, 274, 493, 1308, 411, 263, 24235, 310, 29871, 29945, 285, 4615, 313, 1454, 2319, 26361, 29897, 322, 29871, 29947, 285, 4615, 313, 1454 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 2867, 297, 2373, 296, 454, 1624, 17394, 267, 29889, 5373, 29891, 17394, 267, 393, 6216, 1532, 29889, 3080, 326, 675, 278, 817, 363, 16116, 292, 470, 16679, 297, 491, 2805, 2373, 296, 454, 1624, 17394, 267, 393, 526, 2307, 263, 1781, 6216, 363, 366, 29889, 4001, 2373, 296, 454, 1624, 338, 380, 2593, 322, 29395, 990, 4357, 29892, 366, 29915, 276, 451, 2675, 304, 367, 2221, 304, 1735 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 9563, 470, 2329, 263, 528, 10511, 1283, 321, 29891, 774, 798, 29889, 14542, 852, 385, 321, 29891, 774, 798, 282, 3977, 309, 322, 4764, 672, 393, 338, 2788, 304, 596, 5613, 2927, 292, 29889, 960, 366, 1603, 505, 697, 310, 596, 321, 29891, 774, 5727, 29892, 445, 1795, 367, 263, 2217, 6775, 489, 5143, 1993, 278, 282, 3977, 309, 304, 278, 528, 1943, 310, 596, 321, 29891, 774 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 540, 5031, 23448, 263, 29889, 19530, 29876, 278, 25828, 4835, 29889, 940, 5031, 23448, 263, 756, 263, 1353, 310, 25828, 4835, 393, 12234, 2615, 1546, 1023, 322, 4832, 11405, 515, 278, 2635, 310, 14060, 545, 29889, 3834, 310, 1438, 25828, 4835, 526, 10035, 29892, 763, 263, 1238, 369, 29892, 1550, 4045, 
29892, 763, 432, 585, 299, 625, 29892, 526, 2649, 29873, 744, 18906, 310, 540 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 4529, 596, 923, 1416, 29890, 2873, 29889, 23868, 278, 1492, 11955, 29889, 887, 674, 817, 472, 3203, 1023, 1422, 528, 3076, 310, 1207, 786, 29901, 697, 393, 338, 16951, 6501, 261, 1135, 596, 5613, 19309, 16225, 363, 278, 528, 23626, 322, 697, 393, 338, 925, 263, 2217, 301, 14643, 1135, 596, 19309, 363, 278, 12141, 29879, 29889, 28277, 373, 596, 19309, 16225, 322, 24583, 29892, 1438, 508, 367 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 671, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 29889, 317, 802, 29872, 911, 263, 12616, 29899, 29879, 1891, 8828, 4757, 310, 528, 314, 1129, 29877, 304, 596, 5112, 29885, 29889, 319, 2217, 2586, 310, 429, 4542, 29875, 1218, 528, 314, 1129, 29877, 5771, 263, 1472, 982, 29889, 2860, 7990, 1259, 596, 11315, 297, 278, 1510, 261, 408, 366, 12891, 723, 29892, 269, 802, 29872, 911, 1048, 263 +1, 25453, 322, 17465, 292, 29901, 1128, 304, 1207, 521, 332, 307, 26163, 5036, 12580, 3137, 29889, 4721, 354, 271, 278, 288, 854, 304, 29871, 29946, 29945, 29900, 6719, 285, 21446, 6884, 470, 29871, 29906, 29906, 29945, 6719, 6432, 1039, 375, 636, 4122, 559, 263, 286, 3096, 262, 260, 764, 491, 285, 492, 3262, 372, 373, 967, 2625, 29889, 8669, 310, 805, 764, 292, 1661, 29899, 303, 860, 1395, 5832, 805, 764, 297, 278, 4251, 310 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 6483, 285, 719, 541, 725, 329, 10674, 1161, 29889, 14542, 852, 385, 17182, 29889, 1932, 23906, 385, 17182, 304, 6483, 285, 719, 596, 541, 725, 329, 10674, 1161, 297, 29892, 372, 29915, 29879, 4100, 304, 5839, 697, 393, 756, 263, 6133, 25158, 1298, 1135, 278, 7984, 292, 10430, 29889, 1152, 1342, 29892, 565, 366, 505, 263, 9687, 393, 4225, 304, 367, 7984, 287, 472, 29871, 29941, 29945 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 385, 288, 9258, 17182, 2730, 391, 332, 3950, 29889, 349, 27574, 953, 7273, 9215, 281, 1165, 515, 263, 3240, 737, 261, 393, 4266, 7093, 297, 28075, 363, 3907, 3632, 331, 1943, 6776, 2527, 1199, 29889, 1670, 526, 1784, 5376, 414, 393, 508, 367, 1476, 7395, 1058, 19417, 953, 7273, 9215, 281, 1165, 636, 5373, 29891, 777, 18853, 288, 2719, 393, 366, 723, 763, 304, 671, 297, 596 +1, 11796, 414, 322, 28251, 1199, 29901, 1128, 304, 5376, 411, 14919, 21549, 29889, 1260, 8332, 403, 14919, 21549, 29899, 513, 1682, 292, 9687, 322, 29914, 272, 13748, 515, 596, 652, 300, 29889, 739, 10083, 2560, 29892, 541, 6480, 825, 366, 2348, 342, 14218, 508, 505, 263, 12176, 10879, 373, 596, 14919, 21549, 11174, 29889, 960, 366, 8369, 7535, 11223, 24937, 29892, 7243, 18219, 29892, 470, 851, 11517, 1432, 2462, 29892, 3814, 304, 2334, 472, 3203 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 19531, 3708, 552, 17441, 303, 860, 29889, 14542, 852, 12528, 301, 309, 562, 470, 22181, 1581, 17441, 303, 860, 363, 6534, 19309, 260, 2873, 29889, 7419, 363, 301, 14643, 29899, 2780, 287, 3708, 552, 17441, 303, 7358, 411, 7254, 22332, 2873, 29892, 1316, 408, 540, 1624, 470, 3805, 275, 528, 3076, 29892, 304, 1035, 296, 27240, 278, 7254, 22332, 2873, 297, 596, 15509, 19309, 29889, 4525, 674, 19595 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 1207, 596, 269, 484, 21079, 1106, 716, 1449, 29889, 399, 1161, 10508, 269, 484, 21079, 297, 278, 471, 2790, 4933, 29889, 960, 366, 505, 777, 26616, 10508, 269, 484, 21079, 29892, 366, 508, 5948, 679, 963, 5941, 491, 17452, 963, 297, 278, 471, 2790, 4933, 29892, 925, 408, 366, 723, 738, 916, 26616, 7171, 358, 29889, 887, 1122, 884, 367, 2221, 
304, 471, 29882, 777, 1661, 29899, 15257 +1, 16224, 10057, 322, 22135, 29901, 1128, 304, 6755, 263, 18870, 262, 398, 9228, 29889, 14542, 852, 263, 9228, 411, 1880, 3708, 537, 29889, 1094, 411, 599, 758, 8802, 1539, 1338, 29892, 18870, 262, 398, 1818, 367, 394, 2376, 287, 411, 916, 1539, 1338, 297, 1797, 304, 6176, 278, 2898, 2264, 3734, 363, 432, 809, 295, 719, 29889, 739, 338, 4049, 394, 2376, 287, 411, 1661, 29899, 1457, 8802, 1539, 1338, 763, 1302, 2496, 470, 274 diff --git a/examples/cpp/llama/stop_words.csv b/examples/cpp/llama/stop_words.csv new file mode 100644 index 000000000..9b9b09eba --- /dev/null +++ b/examples/cpp/llama/stop_words.csv @@ -0,0 +1,2 @@ +287, 4346, 12 +3, -1, -1 diff --git a/src/fastertransformer/kernels/CMakeLists.txt b/src/fastertransformer/kernels/CMakeLists.txt index fd2a1b494..c5cc14c8e 100644 --- a/src/fastertransformer/kernels/CMakeLists.txt +++ b/src/fastertransformer/kernels/CMakeLists.txt @@ -233,3 +233,7 @@ add_library(moe_kernels STATIC moe_kernels.cu) set_property(TARGET moe_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET moe_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(moe_kernels PRIVATE moe_gemm_kernels) + +add_library(llama_kernels STATIC llama_kernels.cu) +set_property(TARGET llama_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET llama_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu index a57161c85..8179c3368 100644 --- a/src/fastertransformer/kernels/bert_preprocess_kernels.cu +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding +__global__ void start_id_embedding_lookups_kernel(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int word_index_row = word_index / length; // batch_id + const int word_index_col = word_index % length; + const int real_word_index = word_index_row * length + word_index_col; + const int col_index = index % hidden_units; + const int input_id = input_ids == nullptr ? 
real_word_index : input_ids[real_word_index]; + + from_tensor[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(batch_size * length, 65536)); + dim3 block(min(hidden_units, 512)); + start_id_embedding_lookups_kernel + <<>>(from_tensor, embedding_table, input_ids, length, batch_size, hidden_units); +} + +template void invokeInputIdsEmbeddingLookup(float* from_tensor, + const float* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +template void invokeInputIdsEmbeddingLookup(half* from_tensor, + const half* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + +#ifdef ENABLE_BF16 +template void invokeInputIdsEmbeddingLookup(__nv_bfloat16* from_tensor, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); +#endif template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, @@ -203,27 +266,89 @@ template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* template __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLookupPosEncodingSoftPromptParam param) { - // 1. Copy the input ids to output ids and transpose output ids to [seq_len, batch_size, beam_width]. - // 2. Embedding lookup by input ids and concat with soft prompt. The axis of concatenation is on axis of seq_len. - - // Assume batch size is 2 and prompts are [[t1, t2], [t3], [t4, t5]], input_ids are [[s1, s2], [s3], [s4]] - // then the order of output_ids is - // [ [?, ?, s1, s2] - // [?, s3, padding, padding] - // [?, ?, s4, padding] ] - // and the order of embedding is - // [ [t1, t2, s1, s2] - // [t3, s3, padding, padding] - // [t4, t5, s4, padding] ] - // where "?" means undefined values and we should attach it. + // 1. Copy the + // input ids to + // output ids + // and + // transpose + // output ids + // to [seq_len, + // batch_size, + // beam_width]. + // 2. Embedding + // lookup by + // input ids + // and concat + // with soft + // prompt. The + // axis of + // concatenation + // is on axis + // of seq_len. + + // Assume batch + // size is 2 + // and prompts + // are [[t1, + // t2], [t3], + // [t4, t5]], + // input_ids + // are [[s1, + // s2], [s3], + // [s4]] then + // the order of + // output_ids + // is [ [?, ?, + // s1, s2] + // [?, s3, + // padding, + // padding] + // [?, ?, s4, + // padding] ] + // and the + // order of + // embedding is + // [ [t1, t2, + // s1, s2] + // [t3, s3, + // padding, + // padding] + // [t4, t5, + // s4, + // padding] ] + // where "?" + // means + // undefined + // values and + // we should + // attach it. for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < param.batch_size * param.beam_width * (param.max_prefix_soft_prompt_length + param.max_input_length) * param.hidden_units; index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, beam, max_input_length]) to - // output_ids [length, batch, beam]. - // ouptut_ids need to add padding in the beginning for soft prompting. 
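// Illustrative sketch, not part of the patch: a CPU reference for the flat-index
// decomposition used by start_id_embedding_lookups_kernel / invokeInputIdsEmbeddingLookup
// above. Shapes assumed here: input_ids is [batch * length], embedding_table is
// [vocab, hidden], from_tensor is [batch * length, hidden]; when input_ids is
// nullptr the position itself is used as the word id, as in the kernel.
#include <cstddef>
#include <vector>

void input_ids_embedding_lookup_cpu(std::vector<float>&       from_tensor,
                                    const std::vector<float>& embedding_table,
                                    const int*                input_ids,
                                    int                       batch_size,
                                    int                       length,
                                    int                       hidden_units)
{
    from_tensor.resize((size_t)batch_size * length * hidden_units);
    for (size_t index = 0; index < from_tensor.size(); index++) {
        const int word_index = (int)(index / hidden_units);  // token position in [0, batch * length)
        const int col_index  = (int)(index % hidden_units);  // hidden dimension
        const int input_id   = (input_ids == nullptr) ? word_index : input_ids[word_index];
        from_tensor[index]   = embedding_table[(size_t)input_id * hidden_units + col_index];
    }
}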
+ // transpose + // the + // input_ids + // [batch, + // length] + // (part of + // [batch, + // beam, + // max_input_length]) + // to + // output_ids + // [length, + // batch, + // beam]. + // ouptut_ids + // need to + // add + // padding + // in the + // beginning + // for soft + // prompting. if (index < param.batch_size * param.beam_width * param.max_input_length) { int tmp_index = index; @@ -239,21 +364,43 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo } } - // embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab, - // hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length + - // max_prefix_soft_prompt_length, hidden] - int tmp_index = index; - const int hidden_id = tmp_index % param.hidden_units; - tmp_index = (tmp_index - hidden_id) / param.hidden_units; - const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); - tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); - const int beam_id = tmp_index % param.beam_width; - tmp_index = (tmp_index - beam_id) / param.beam_width; - const int batch_id = tmp_index % param.batch_size; + // embedding + // lookup + // from + // word ids + // [batch, + // beam, + // length] + // (part of + // [batch, + // beam, + // max_input_length]), + // [vocab, + // hidden] + // and + // [batch, + // max_prefix_soft_prompt_length, + // hidden] + // to + // generate + // embedding + // [batch, + // beam, + // length + + // max_prefix_soft_prompt_length, + // hidden] + int tmp_index = index; + const int hidden_id = tmp_index % param.hidden_units; + tmp_index = (tmp_index - hidden_id) / param.hidden_units; + const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length); + tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length); + const int beam_id = tmp_index % param.beam_width; + tmp_index = (tmp_index - beam_id) / param.beam_width; + const int batch_id = tmp_index % param.batch_size; const int64_t hidden_units = param.hidden_units; - T embedding = + T embedding = (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ? 
- (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units + seq_id * hidden_units + hidden_id] : param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length + beam_id * param.max_input_length @@ -292,7 +439,8 @@ template void invokeInputIdsEmbeddingLookupPosEncodingSoftPrompt( inputIdsEmbeddingLookupPosEncodingSoftPromptParam<__nv_bfloat16> param); #endif -// TODO Add half2 implementation +// TODO Add half2 +// implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) { @@ -329,9 +477,11 @@ invokeTransposeAxis01(int* out, int* in, const int dim0, const int dim1, const i template __global__ void transposeAxis01(T* out, T* in, const int* in_skipping_dim1, const int dim0, const int dim1) { - // out: [dim1, dim0] - // in: [dim0, dim1] - // in_skipping_dim1: [dim1] + // out: [dim1, + // dim0] in: + // [dim0, dim1] + // in_skipping_dim1: + // [dim1] int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < dim0 * dim1) { @@ -363,8 +513,15 @@ __global__ void buildDecoderAttentionMaskKernel(T* attention_mask, const int max_seq_len, const int max_prompt_length) { - // sequence_lengths: [batch_size] - // attention_mask: [batch_size, 1, max_seq_len, max_seq_len + max_prompt_length] + // sequence_lengths: + // [batch_size] + // attention_mask: + // [batch_size, + // 1, + // max_seq_len, + // max_seq_len + // + + // max_prompt_length] const int max_prompt_seq_length = max_seq_len + max_prompt_length; const int mask_size_per_seq = max_seq_len * max_prompt_seq_length; attention_mask += blockIdx.x * mask_size_per_seq; @@ -581,29 +738,100 @@ template __global__ void find_context_dups(int* shared_contexts, const int* input_ids, const size_t batch_size, const size_t input_seq_len) { - /* We compare all context pairs (i, j), with i (tgt) < j (src) , to detect duplicate - * inputs. If there's a match between i and j, we store i at the - * j-th position of shared_context. So that we know that j can be - * represented by i. shared_contexts is initialized like shared_contexts[i] = i - * and when there's a match, we actually use shared_contexts[j] = min(shared_contexts[j], i) - * so that in the end, shared_contexts effectively contains an index - * to the match with the lowest index context. - * Note that shared_contexts[i] <= i, a property that will be used when uncompacting + /* We compare + * all context + * pairs (i, + * j), with i + * (tgt) < j + * (src) , to + * detect + * duplicate + * inputs. If + * there's a + * match + * between i + * and j, we + * store i at + * the j-th + * position of + * shared_context. + * So that we + * know that j + * can be + * represented + * by i. + * shared_contexts + * is + * initialized + * like + * shared_contexts[i] + * = i and when + * there's a + * match, we + * actually use + * shared_contexts[j] + * = + * min(shared_contexts[j], + * i) so that + * in the end, + * shared_contexts + * effectively + * contains an + * index to the + * match with + * the lowest + * index + * context. + * Note that + * shared_contexts[i] + * <= i, a + * property + * that will be + * used when + * uncompacting * inputs. */ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ bool match; - /* Each block is responsible for a (i, j) pair. 
To map the block space to - * the i < j space, we need to convert a linear addressing to a triangle, of - * size (batch_size * (batch_size - 1)) / 2 - * For more information, check https://en.wikipedia.org/wiki/Triangular_number + /* Each block + * is + * responsible + * for a (i, j) + * pair. To map + * the block + * space to the + * i < j space, + * we need to + * convert a + * linear + * addressing + * to a + * triangle, of + * size + * (batch_size + * * (batch_size - 1)) / 2 + * For more + * information, + * check + * https://en.wikipedia.org/wiki/Triangular_number */ - // blockIdx = [0, 1, 2, ... n(n-1)/2] -> base_index = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, ..., n - 2] + // blockIdx = + // [0, 1, 2, + // ... + // n(n-1)/2] -> + // base_index = + // [0, 1, 1, 2, + // 2, 2, 3, 3, + // 3, 3, ..., n + // - 2] const int base_index = floorf(0.5f * (sqrtf(1 + 8 * blockIdx.x) - 1)); - const int src_idx = base_index + 1; // base_index \in [1, batch_size) + const int src_idx = base_index + 1; // base_index + // \in + // [1, + // batch_size) const int rev_base_index = base_index * (base_index + 1) / 2; const int tgt_idx = blockIdx.x - rev_base_index; // tgt_idx \in [0, src_idx) @@ -659,9 +887,19 @@ __global__ void generate_dups_indices(int* batch_to_compact, if (!masked && is_first_occur) { int compact_idx = scan + (first_iter ? 0 : scan_offset); - // Context rep. writes initial index + // Context + // rep. + // writes + // initial + // index batch_to_compact[seq_idx * beam_width] = compact_idx; - // input ids are tiled in context part + // input + // ids + // are + // tiled + // in + // context + // part compact_to_batch[compact_idx] = seq_idx * beam_width; } @@ -674,13 +912,27 @@ __global__ void generate_dups_indices(int* batch_to_compact, __syncthreads(); if (!masked && !is_first_occur) { - // Fill the rest of batch_to_compact based on what rep. wrote + // Fill + // the + // rest + // of + // batch_to_compact + // based + // on + // what + // rep. 
+ // wrote const int src_idx = batch_to_compact[shared_contexts[seq_idx] * beam_width]; batch_to_compact[seq_idx * beam_width] = src_idx; } if (!masked) { - // set same compact idx for beams + // set + // same + // compact + // idx + // for + // beams for (int beam_id = 1; beam_id < beam_width; ++beam_id) { batch_to_compact[seq_idx * beam_width + beam_id] = batch_to_compact[seq_idx * beam_width]; } @@ -713,11 +965,20 @@ void invokeFindContextDups(int* shared_contexts, { dim3 block{512}; dim3 grid{((int)batch_size + block.x - 1) / block.x}; - // set shared_context[i] = i + // set + // shared_context[i] = + // i init_shared_contexts<<>>(shared_contexts, batch_size); grid = dim3{(unsigned int)(batch_size * (batch_size - 1)) / 2}; - // set shared_contexts[i] = j, where j = min{k, such that input_ids[k] == input_ids[i]} + // set + // shared_contexts[i] + // = j, where j + // = min{k, + // such that + // input_ids[k] + // == + // input_ids[i]} if (input_seq_len <= 128) { block = 128; find_context_dups<128><<>>(shared_contexts, input_ids, batch_size, input_seq_len); @@ -727,8 +988,21 @@ void invokeFindContextDups(int* shared_contexts, find_context_dups<256><<>>(shared_contexts, input_ids, batch_size, input_seq_len); } - // set batch_to_compact[i] = j, where j is the position of input_ids[i] in the compact_batch - // set compact_to_batch[i] = j, where j is such that compact_to_batch[i] = input_ids[j] + // set + // batch_to_compact[i] + // = j, where j + // is the + // position of + // input_ids[i] + // in the + // compact_batch + // set + // compact_to_batch[i] + // = j, where j + // is such that + // compact_to_batch[i] + // = + // input_ids[j] generate_dups_indices<<<1, DUPS_INDICES_BLOCK_SIZE, 0, stream>>>( batch_to_compact, compact_to_batch, compact_size, shared_contexts, batch_size, beam_width, input_seq_len); } @@ -782,10 +1056,29 @@ void invokeCompactInputs(T* compact_input, size_t hidden_dimension, cudaStream_t stream) { - /* Compact relevant decoder_layer inputs based on the identical contexts. - * For example, decoder_input is [batch_size, seq_len, H]. It's compacted - * into compact_input [compact_size, seq_len, H] such that - * compact_input[i, ...] = decoder_input[compact_idx[i], ...] */ + /* Compact + * relevant + * decoder_layer + * inputs based + * on the + * identical + * contexts. + * For example, + * decoder_input + * is + * [batch_size, + * seq_len, H]. + * It's + * compacted + * into + * compact_input + * [compact_size, + * seq_len, H] + * such that + * compact_input[i, + * ...] = + * decoder_input[compact_idx[i], + * ...] 
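// Illustrative sketch, not part of the patch: a serial version of the mappings
// built by find_context_dups / generate_dups_indices and consumed by
// invokeCompactInputs above, for a hypothetical batch of 4 contexts with
// beam_width = 1. shared_contexts[j] ends up pointing at the lowest-index
// duplicate of j, batch_to_compact maps every batch entry to its compact slot,
// and compact_to_batch maps each compact slot back to its representative entry.
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<std::vector<int>> input_ids = {{5, 6}, {7, 8}, {5, 6}, {7, 8}};
    const int batch_size = (int)input_ids.size();

    std::vector<int> shared_contexts(batch_size);
    for (int i = 0; i < batch_size; i++) {
        shared_contexts[i] = i;
    }
    for (int tgt = 0; tgt < batch_size; tgt++) {
        for (int src = tgt + 1; src < batch_size; src++) {
            if (input_ids[src] == input_ids[tgt] && tgt < shared_contexts[src]) {
                shared_contexts[src] = tgt;  // keep the lowest matching index
            }
        }
    }
    // shared_contexts = [0, 1, 0, 1]

    std::vector<int> batch_to_compact(batch_size);
    std::vector<int> compact_to_batch;
    for (int i = 0; i < batch_size; i++) {
        if (shared_contexts[i] == i) {  // first occurrence: open a new compact slot
            batch_to_compact[i] = (int)compact_to_batch.size();
            compact_to_batch.push_back(i);
        }
        else {  // duplicate: reuse the representative's slot
            batch_to_compact[i] = batch_to_compact[shared_contexts[i]];
        }
    }
    // batch_to_compact = [0, 1, 0, 1], compact_to_batch = [0, 1], compact_size = 2
    printf("compact_size = %d\n", (int)compact_to_batch.size());
    return 0;
}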
*/ const size_t elems_n = compact_size * seq_len * max(hidden_dimension, seq_len); const dim3 blockDim(512); const dim3 gridDim((elems_n + 512 - 1) / 512); @@ -828,8 +1121,19 @@ __global__ void uncompact_outputs(T* uncompact_buffer, size_t batch_size, size_t buffer_stride) { - /* Uncompact a buffer IN of size [Compact, Stride] into OUT of size [Batch, Stride] - * so that \forall i, OUT[i, :] = IN[batch_to_compact_idx[i], :] + /* Uncompact a + * buffer IN of + * size + * [Compact, + * Stride] into + * OUT of size + * [Batch, + * Stride] so + * that \forall + * i, OUT[i, :] + * = + * IN[batch_to_compact_idx[i], + * :] */ const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -1124,4 +1428,5 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16); #endif #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION -} // namespace fastertransformer +} // namespace + // fastertransformer diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h index d78224e0a..bf4963231 100644 --- a/src/fastertransformer/kernels/gpt_kernels.h +++ b/src/fastertransformer/kernels/gpt_kernels.h @@ -59,6 +59,15 @@ struct pPromptTuningParam { const T* request_prompt_embedding = nullptr; }; +template +void invokeInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int length, + const int batch_size, + const int hidden_units, + cudaStream_t stream); + template void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, int* output_ids, diff --git a/src/fastertransformer/kernels/layernorm_kernels.cu b/src/fastertransformer/kernels/layernorm_kernels.cu index 369030b37..80a656cf7 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.cu +++ b/src/fastertransformer/kernels/layernorm_kernels.cu @@ -19,6 +19,237 @@ #include "src/fastertransformer/utils/cuda_type_utils.cuh" namespace fastertransformer { +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, +// __global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; // Align on largest type + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float variance = 0.0f; + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + T local_sum = cuda_cast(0.0f); + + const Float_Packed_T scale_from_int = cuda_cast(0.0f); + const Float_Packed_T scale_to_int = cuda_cast(0.0f); + +#pragma unroll + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = cuda_cast(0.0f); + + if (IS_BIAS) { + val = hadd2(val, ldg(&bias[i])); + } + if (RESIDUAL_NUM == 1) { + val = hadd2(val, ldg(&residual1[index])); + } + else if (RESIDUAL_NUM == 2) { + val = hadd2(hadd2(val, ldg(&residual1[index])), ldg(&residual2[index])); + } + + if (IS_OUTPUT) { + T in_val; + in_val = input[index]; + val = hadd2(val, in_val); + } + shmem[i] = val; + output[index] = val; + local_sum = hadd2(local_sum, val); + } + + float local_var_sum = 0.0f; +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < 
n; i += blockDim.x) { + T val = input[blockIdx.x * n + i]; + float diff_1 = (float)(val.x); + float diff_2 = (float)(val.y); + local_var_sum += (diff_1 * diff_1 + diff_2 * diff_2); + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +// * Note that typename T is half2 or bfloat2 type +template +__global__ void generalLLaMAAddBiasResidualLayerNormOpt2(T* normed_output, + T* output, + const T* __restrict input, + const T* __restrict bias, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const float layernorm_eps, + int m, + int n) +{ + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + __shared__ float s_variance; + float x2_sum = 0.0f; + const int b_offset = blockIdx.x * n; + + using T1 = typename TypeConverter::Type; + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + const Float_Packed_T scale_vec_in = cuda_cast(0.0f); + const Float_Packed_T scale_vec = cuda_cast(0.0f); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + float val_1 = 0.0f; + float val_2 = 0.0f; + T tmp; + + if (IS_BIAS) { + tmp = ldg(&bias[i]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + if (RESIDUAL_NUM == 1) { + tmp = ldg(&residual1[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + else if (RESIDUAL_NUM == 2) { + tmp = ldg(&residual1[index]); + T tmp2 = ldg(&residual2[index]); + val_1 += (static_cast(tmp.x) + static_cast(tmp2.x)); + val_2 += (static_cast(tmp.y) + static_cast(tmp2.y)); + } + + if (IS_OUTPUT) { + tmp = ldg(&input[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + } + tmp.x = cuda_cast(val_1); + tmp.y = cuda_cast(val_2); + shmem[i] = tmp; + output[index] = tmp; + x2_sum += val_1 * val_1 + val_2 * val_2; + } + float sum_sq = blockReduceSum(x2_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(sum_sq / n / 2 + layernorm_eps); + } + __syncthreads(); + + T var_2 = cuda_cast(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = hmul2(shmem[i], var_2, ldg(&gamma[i])); + if (IS_BETA) { + val = hadd2(val, ldg(&beta[i])); + } + + normed_output[index] = val; + } +} + +template +__global__ void generalLLaMAAddBiasResidualLayerNorm(const T* __restrict input, + const T* __restrict residual1, + const T* __restrict residual2, + const T* __restrict gamma, + const T* __restrict beta, + const T* __restrict bias, + T* output, + T* norm_output, + const float layernorm_eps, + int m, + int n) +{ + int tid = threadIdx.x; + + // NOTE: float shmem may exceed the shared memory limit + extern __shared__ __align__(sizeof(float)) char _shmem[]; + T* shmem = reinterpret_cast(_shmem); + + using Float_Packed_T = typename packed_as::value>::type; + using Scalar_T = typename packed_as::type; + + __shared__ float s_variance; + float variance = 0.0f; + float local_sum = 0.0f; + for (int i = tid; i < n; i += 
blockDim.x) { + float local_out = 0.0f; + if (RESIDUAL_NUM == 1) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])); + } + else if (RESIDUAL_NUM == 2) { + local_out = (float)(ldg(&residual1[blockIdx.x * n + i])) + float(ldg(&residual2[blockIdx.x * n + i])); + } + local_out += (float)(input[blockIdx.x * n + i]); + + if (bias != nullptr) { + local_out += (float)(ldg(&bias[i])); + } + shmem[i] = (T)local_out; + output[blockIdx.x * n + i] = (T)local_out; + local_sum += local_out; + } + + float local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float diff = (float)(output[blockIdx.x * n + i]); + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / n + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + float beta_val = (beta == nullptr) ? 0.0f : (float)(ldg(&beta[i])); + const float val = (((float)shmem[i] * s_variance) * (float)(ldg(&gamma[i])) + beta_val); + + norm_output[blockIdx.x * n + i] = (T)val; + } +} // * Note that typename T is half2 or bfloat2 type template @@ -841,6 +1072,51 @@ __global__ void generalAddBiasResidualLayerNorm(const T* __restrict input, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version) +{ + size_t maxbytes = half_n * sizeof(T); + if (opt_version == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else if (opt_version == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNormOpt2, + cudaFuncAttributeMaxDynamicSharedMemorySize, + maxbytes)); + } + generalLLaMAAddBiasResidualLayerNormOpt2 + <<>>( + norm_output, output, input, bias, residual1, residual2, gamma, beta, layernorm_eps, m, half_n); + } + else { + FT_CHECK_WITH_INFO(false, "opt_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_output, T* output, @@ -919,6 +1195,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_opt_version(T* norm_o } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (is_output) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_opt_version( + norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_output, T* output, @@ -990,6 
+1322,62 @@ void dispatch_generalAddBiasResidualLayerNormOpt_is_output(T* norm_out } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output) +{ + if (bias != nullptr) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_is_output(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, T* output, @@ -1061,6 +1449,66 @@ void dispatch_generalAddBiasResidualLayerNormOpt_bias(T* norm_output, } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num) +{ + if (residual_num == 1) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else if (residual_num == 2) { + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_bias(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output); + } + else { + FT_CHECK_WITH_INFO(false, "residual_num must be 1 or 2"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_output, T* output, @@ -1136,6 +1584,108 @@ void dispatch_generalAddBiasResidualLayerNormOpt_residual_num(T* norm_ } } +template +void dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, + T* output, + const T* input, + const T* bias, + const T* residual1, + const T* residual2, + const T* gamma, + const T* beta, + float layernorm_eps, + int m, + int half_n, + dim3 grid, + dim3 block, + cudaStream_t stream, + int opt_version, + bool is_output, + int residual_num, + int unroll_factor) +{ + switch (unroll_factor) { + case 1: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 2: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 4: + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + case 8: + 
dispatch_generalLLaMAAddBiasResidualLayerNormOpt_residual_num(norm_output, + output, + input, + bias, + residual1, + residual2, + gamma, + beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + is_output, + residual_num); + break; + default: + FT_CHECK_WITH_INFO(false, "unroll_factor must be 1, 2, 4 or 8"); + } +} + template void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm_output, T* output, @@ -1263,6 +1813,105 @@ void dispatch_generalAddBiasResidualLayerNormOpt_unroll_factor(T* norm } } +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version) +{ + const int residual_num = 1; + if (opt_version > 0 && sizeof(T) == 2 && n % 2 == 0) { + dim3 grid(m); + int half_n = n / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + + using T2 = typename TypeConverter::Type; + + /* we launch (and instantiate) the kernel by specializing for unroll_factor -> residual_num -> is_bias -> + * opt_version */ + dispatch_generalLLaMAAddBiasResidualLayerNormOpt_unroll_factor((T2*)norm_output, + (T2*)output, + (const T2*)input, + (const T2*)bias, + (const T2*)residual1, + (const T2*)nullptr, + (const T2*)gamma, + (const T2*)beta, + layernorm_eps, + m, + half_n, + grid, + block, + stream, + opt_version, + true, // is_output + residual_num, + unroll_factor); + } + else { + + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
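// Illustrative sketch, not part of the patch: the dispatch chain above follows a
// common runtime-to-compile-time pattern in which each wrapper switches on one
// runtime value and forwards the rest until the kernel is fully specialized.
// The stub names below are generic placeholders, not FasterTransformer APIs.
#include <cstdio>

template<int UNROLL_FACTOR, bool IS_BIAS>
void kernel_stub()
{
    printf("unroll=%d bias=%d\n", UNROLL_FACTOR, (int)IS_BIAS);
}

template<int UNROLL_FACTOR>
void dispatch_bias(bool has_bias)
{
    if (has_bias) {
        kernel_stub<UNROLL_FACTOR, true>();
    }
    else {
        kernel_stub<UNROLL_FACTOR, false>();
    }
}

void dispatch_unroll_factor(int unroll_factor, bool has_bias)
{
    switch (unroll_factor) {
        case 1: dispatch_bias<1>(has_bias); break;
        case 2: dispatch_bias<2>(has_bias); break;
        case 4: dispatch_bias<4>(has_bias); break;
        case 8: dispatch_bias<8>(has_bias); break;
        default: printf("unroll_factor must be 1, 2, 4 or 8\n");
    }
}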
+ */ + block.x = (block.x + 31) / 32 * 32; + + size_t maxbytes = n * sizeof(T); + if (residual_num == 1) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + else if (residual_num == 2) { + if (maxbytes >= (48 << 10)) { + check_cuda_error(cudaFuncSetAttribute( + generalLLaMAAddBiasResidualLayerNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes)); + } + generalLLaMAAddBiasResidualLayerNorm<<>>( + input, residual1, nullptr, gamma, beta, bias, output, norm_output, layernorm_eps, m, n); + } + } +} + +#define INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(T) \ + template void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, \ + T* norm_output, \ + const T* input, \ + const T* residual1, \ + const T* gamma, \ + const T* beta, \ + const T* bias, \ + const float layernorm_eps, \ + int m, \ + int n, \ + cudaStream_t stream, \ + int opt_version) +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(float); +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(half); +#ifdef ENABLE_BF16 +INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM(__nv_bfloat16); +#endif +#undef INSTANTIATE_INVOKE_GENERAL_LLAMA_ADD_BIAS_RESIDUAL_PRE_LAYER_NORM + /* output <- output + bias + residual_1 + residual_2 * output_norm <- LN(output) */ template @@ -1859,6 +2508,77 @@ template void invokeGeneralT5LayerNorm(__nv_bfloat16* out, cudaStream_t stream); #endif +/******************* invokeGeneralLLaMALayerNorm ***********************/ + +template +__global__ void generalLLaMALayerNorm( + const T* __restrict input, const T* __restrict gamma, T* normed_output, const float layernorm_eps, int m, int n) +{ + const int tid = threadIdx.x; + + float local_var_sum = 0.0f; + for (int i = tid; i < n; i += blockDim.x) { + float val = (float)(ldg(&input[blockIdx.x * n + i])); + local_var_sum += val * val; + } + + float variance = 0.0f; + variance = blockReduceSum(local_var_sum); + + __shared__ float s_variance; + if (threadIdx.x == 0) { + s_variance = rsqrtf((variance / (float)n) + layernorm_eps); + } + __syncthreads(); + + for (int i = tid; i < n; i += blockDim.x) { + const int index = blockIdx.x * n + i; + T val = (T) (((float)ldg(&input[index])) * s_variance); + normed_output[index] = val * ldg(&gamma[i]); + } +} + +template +void invokeGeneralLLaMALayerNorm( + T* out, const T* input, const T* gamma, const float layernorm_eps, const int m, const int n, cudaStream_t stream) +{ + dim3 grid(m); + dim3 block(min(n, 1024)); + + /* For general cases, n is equal to hidden_units, e.g., 512/1024. + Since we have warp shuffle inside the code, block.x % 32 should be 0. 
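// Illustrative sketch, not part of the patch: a CPU reference for what
// generalLLaMALayerNorm above computes per row. Note there is no mean
// subtraction and no beta term: each row is scaled by rsqrt(mean(x^2) + eps)
// and then by gamma, i.e. the RMS-style normalization used by LLaMA.
#include <cmath>
#include <cstddef>
#include <vector>

void llama_layer_norm_cpu(std::vector<float>&       out,
                          const std::vector<float>& input,  // [m, n], row-major
                          const std::vector<float>& gamma,  // [n]
                          float                     layernorm_eps,
                          int                       m,
                          int                       n)
{
    out.resize((size_t)m * n);
    for (int row = 0; row < m; row++) {
        double sum_sq = 0.0;
        for (int i = 0; i < n; i++) {
            const float v = input[(size_t)row * n + i];
            sum_sq += (double)v * v;
        }
        const float inv_rms = 1.0f / std::sqrt((float)(sum_sq / n) + layernorm_eps);
        for (int i = 0; i < n; i++) {
            out[(size_t)row * n + i] = input[(size_t)row * n + i] * inv_rms * gamma[i];
        }
    }
}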
+ */ + if (n % 32 != 0) { + block.x = 1024; + } + + generalLLaMALayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); +} + +template void invokeGeneralLLaMALayerNorm(float* out, + const float* input, + const float* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +template void invokeGeneralLLaMALayerNorm(half* out, + const half* input, + const half* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeGeneralLLaMALayerNorm(__nv_bfloat16* out, + const __nv_bfloat16* input, + const __nv_bfloat16* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); +#endif + /******************* invokeLayernormShiftPartition ***********************/ // applied to half2 and bfloat162 diff --git a/src/fastertransformer/kernels/layernorm_kernels.h b/src/fastertransformer/kernels/layernorm_kernels.h index d8ac09234..c7b31e874 100644 --- a/src/fastertransformer/kernels/layernorm_kernels.h +++ b/src/fastertransformer/kernels/layernorm_kernels.h @@ -24,7 +24,8 @@ namespace fastertransformer { -enum class LayerNormType { +enum class LayerNormType +{ pre_layernorm, post_layernorm, InvalidType @@ -61,6 +62,20 @@ void invokeAddBiasResidualLayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMAAddBiasResidualPreLayerNorm(T* output, + T* norm_output, + const T* input, + const T* residual1, + const T* gamma, + const T* beta, + const T* bias, + const float layernorm_eps, + int m, + int n, + cudaStream_t stream, + int opt_version = 2); + template void invokeGeneralAddBiasResidualPreLayerNorm(T* output, T* norm_output, @@ -161,6 +176,15 @@ void invokeGeneralT5LayerNorm(T* out, const int n, cudaStream_t stream); +template +void invokeGeneralLLaMALayerNorm(T* out, + const T* input, + const T* gamma, + const float layernorm_eps, + const int m, + const int n, + cudaStream_t stream); + template void invokeGeneralAddResidualT5PreLayerNorm(T* output, T* norm_output, diff --git a/src/fastertransformer/kernels/llama_kernels.cu b/src/fastertransformer/kernels/llama_kernels.cu new file mode 100644 index 000000000..4b02602d9 --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.cu @@ -0,0 +1,389 @@ +#include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" + +#include + +#include +#include +#include + +using namespace std; +namespace fastertransformer { + +template +__global__ void LLaMA_get_last_tokens(T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size) +{ + // in [num_tokens, hidden_size] + // out [batch_size, hidden_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + out[batch_idx * hidden_size + idx] = in[pos * hidden_size + idx]; + } +} + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_get_last_tokens<<>>(out, in, cu_seqlens, batch_size, hidden_size); +} + +template void invokeLLaMAGetLastTokens( + float* out, float* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +template void invokeLLaMAGetLastTokens( + half* out, half* in, const int* cu_seqlens, int batch_size, int 
hidden_size, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAGetLastTokens( + __nv_bfloat16* out, __nv_bfloat16* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); +#endif + +__global__ void LLaMA_extract_targets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens) +{ + // in [batch_size, vocab_size] + // target_ids [ beam_width, num_tokens ] + // out [beam_width, batch_size] + int batch_idx = blockIdx.x * blockDim.x + threadIdx.x; + int beam_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (batch_idx >= batch_size || beam_idx >= beam_width) + return; + + int pos = cu_seqlens[batch_idx + 1] - 1; + int target_idx = target_ids[beam_idx * num_tokens + pos]; + out[beam_idx * batch_size + batch_idx] = in[batch_idx * vocab_size + target_idx]; +} + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream) +{ + dim3 block(32, 4); + dim3 grid((batch_size + block.x - 1) / block.x, (beam_width + block.y - 1) / block.y); + LLaMA_extract_targets<<>>( + out, in, target_ids, cu_seqlens, beam_width, batch_size, vocab_size, num_tokens); +} + +__global__ void LLaMA_log_softmax(float* out, const float* logits, const int num_tokens, const int vocab_size) +{ + // logits [T, V] + // out [T, V] + const int64_t ti = blockIdx.x; + __shared__ float s_sum, s_max; + + if (ti >= num_tokens) + return; + + float local_max = -1e20f; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_max = fmax(logit_val, local_max); + } + + float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); + if (threadIdx.x == 0) { + s_max = max_val; + } + __syncthreads(); + + float local_sum = 0; + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + local_sum += __expf(logit_val - s_max); + } + float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum(local_sum); + if (threadIdx.x == 0) { + // s_sum = sum_val + 1e-6f; + s_sum = sum_val; + } + __syncthreads(); + + for (int i = threadIdx.x; i < vocab_size; i += blockDim.x) { + float logit_val = logits[ti * vocab_size + i]; + out[ti * vocab_size + i] = (logit_val - s_max) - __logf(s_sum); + } +} + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream) +{ + dim3 grid(num_tokens); + dim3 block(min(1024, vocab_size)); + LLaMA_log_softmax<<>>(out, logits, num_tokens, vocab_size); +} + +__global__ void LLaMA_gather_tokens_kernel(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens) +{ + // probs: [T, V] + // target_ids: [T] + // out: [batch_size] + int batch_idx = blockIdx.x; + + if (batch_idx >= batch_size) + return; + + float val = 0.f; + for (int i = threadIdx.x; i < input_lengths[batch_idx]; i += blockDim.x) { + int pos = cu_seqlens[batch_idx] + i; + int target_pos = target_ids[pos]; + val += (target_pos > 0) ? 
probs[pos * vocab_size + target_pos] : 0.f; + } + float sum = blockReduceSum(val); + + if (threadIdx.x == 0) + out[batch_idx] = sum; +} + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens, + cudaStream_t stream) +{ + dim3 grid(batch_size); + dim3 block(256); + LLaMA_gather_tokens_kernel<<>>( + out, probs, input_lengths, target_ids, cu_seqlens, batch_size, vocab_size, num_tokens); +} + +template +__global__ void LLaMAstart_id_embedding_lookups_kernel( + T* out, const T* embedding_table, const int* input_ids, const int num_tokens, const int64_t hidden_units) +{ + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_tokens * hidden_units; + index += blockDim.x * gridDim.x) { + + // embedding lookup from word ids [batch, length] (part of [batch, length]) and [vocab, hidden] to generate + // embedding [batch, length, hidden] + const int word_index = index / hidden_units; + const int col_index = index % hidden_units; + const int input_id = input_ids[word_index]; + + out[index] = embedding_table[input_id * hidden_units + col_index]; + } +} + +template +void invokeLLaMAInputIdsEmbeddingLookup(T* out, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream) +{ + dim3 grid(min(num_tokens, 65536)); + dim3 block(min(hidden_units, 512)); + LLaMAstart_id_embedding_lookups_kernel + <<>>(out, embedding_table, input_ids, num_tokens, hidden_units); +} + +template void invokeLLaMAInputIdsEmbeddingLookup(float* out, + const float* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +template void invokeLLaMAInputIdsEmbeddingLookup(half* out, + const half* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAInputIdsEmbeddingLookup(__nv_bfloat16* out, + const __nv_bfloat16* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); +#endif + +__global__ void LLaMAgetPaddingOffsetAndCuSeqLensKernel( + int* padding_offset, int* cu_seqlens, const int* sequence_length, const int batch_size, const int seq_len) +{ + // do cumulated sum + int total_seq_len = 0; + int cum_offset = 0; + int index = 0; + for (int i = 0; i < batch_size; i++) { + const int num_tokens = sequence_length[i]; + cu_seqlens[i] = total_seq_len; + for (int j = 0; j < num_tokens; j++) { + padding_offset[index] = cum_offset; + index++; + } + cum_offset += seq_len - num_tokens; + total_seq_len += num_tokens; + } + cu_seqlens[batch_size] = total_seq_len; +} + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream) +{ + LLaMAgetPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>( + padding_offset, cu_seqlens, input_lengths, batch_size, seq_len); +} + +template +__global__ void LLaMAbuildDecoderAttentionMaskKernel(T* attention_mask, + const int* sequence_lengths, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len) +{ + // attention_mask: + // [batch_size, 1, seq_len, attn_len] + const int batch_idx = blockIdx.x; + const int mask_size_per_seq = seq_len * attn_len; + attention_mask += batch_idx * mask_size_per_seq; + 
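// Illustrative sketch, not part of the patch: a host-side version of what
// LLaMAgetPaddingOffsetAndCuSeqLensKernel above computes. The input_lengths and
// seq_len values are made up; cu_seqlens holds the running sum of real tokens
// per batch entry, and padding_offset holds, for every real token, how many
// padding slots precede it in the [batch_size, seq_len] padded layout.
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> input_lengths = {3, 1, 2};
    const int              seq_len       = 4;

    std::vector<int> cu_seqlens(input_lengths.size() + 1, 0);
    std::vector<int> padding_offset;

    int total_seq_len = 0;
    int cum_offset    = 0;
    for (size_t i = 0; i < input_lengths.size(); i++) {
        cu_seqlens[i] = total_seq_len;
        for (int j = 0; j < input_lengths[i]; j++) {
            padding_offset.push_back(cum_offset);
        }
        cum_offset += seq_len - input_lengths[i];
        total_seq_len += input_lengths[i];
    }
    cu_seqlens[input_lengths.size()] = total_seq_len;

    // Expected: cu_seqlens = [0, 3, 4, 6], padding_offset = [0, 0, 0, 1, 4, 4]
    for (size_t i = 0; i < cu_seqlens.size(); i++) {
        printf("%d ", cu_seqlens[i]);
    }
    printf("\n");
    return 0;
}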
const int context_length = context_lengths[batch_idx]; + const int length = sequence_lengths[batch_idx]; + + for (int i = threadIdx.x; i < mask_size_per_seq; i += blockDim.x) { + int row_id = i / attn_len; + int col_id = i % attn_len; + if (row_id < length && col_id <= (row_id + context_length)) { + attention_mask[i] = (T)(1.0f); + } + else { + attention_mask[i] = (T)(0.0f); + } + } +} + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream) +{ + LLaMAbuildDecoderAttentionMaskKernel<<>>( + attention_mask, sequence_length, context_lengths, batch_size, seq_len, attn_len); +} + +template void invokeLLaMABuildDecoderAttentionMask(float* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); + +template void invokeLLaMABuildDecoderAttentionMask(half* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMABuildDecoderAttentionMask(__nv_bfloat16* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int attn_len, + cudaStream_t stream); +#endif + +template +__global__ void LLaMACopyKernel(T* dst, T* src, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + auto v_src = reinterpret_cast(src); + v_dst[idx] = v_src[idx]; +} + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMACopyKernel<<>>(dst, src, count); +} + +template void invokeLLaMACopyKernel(float* dst, float* src, const int count, cudaStream_t stream); +template void invokeLLaMACopyKernel(half* dst, half* src, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMACopyKernel(__nv_bfloat16* dst, __nv_bfloat16* src, const int count, cudaStream_t stream); +#endif + +template +__global__ void LLaMAMemset0Kernel(T* dst, const int count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + if (idx * X_ELEMS >= count) { + return; + } + + auto v_dst = reinterpret_cast(dst); + v_dst[idx] = {0}; +} + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 
4 : 8; + assert(count % x == 0); + int grid_sz = (count / x + block_sz - 1) / block_sz; + LLaMAMemset0Kernel<<>>(dst, count); +} + +template void invokeLLaMAMemset0(float* dst, const int count, cudaStream_t stream); +template void invokeLLaMAMemset0(half* dst, const int count, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAMemset0(__nv_bfloat16* dst, const int count, cudaStream_t stream); +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/llama_kernels.h b/src/fastertransformer/kernels/llama_kernels.h new file mode 100644 index 000000000..01e3bbf7a --- /dev/null +++ b/src/fastertransformer/kernels/llama_kernels.h @@ -0,0 +1,62 @@ +#pragma once + +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +namespace fastertransformer { + +void invokeLLaMAGetPaddingOffsetAndCuSeqLens(int* padding_offset, + int* cu_seqlens, + const int* input_lengths, + const int batch_size, + const int seq_len, + cudaStream_t stream); + +template +void invokeLLaMABuildDecoderAttentionMask(T* attention_mask, + const int* sequence_length, + const int* context_lengths, + const int batch_size, + const int seq_len, + const int max_length, + cudaStream_t stream); +template +void invokeLLaMAInputIdsEmbeddingLookup(T* from_tensor, + const T* embedding_table, + const int* input_ids, + const int num_tokens, + const int hidden_units, + cudaStream_t stream); + +template +void invokeLLaMACopyKernel(T* dst, T* src, const int count, cudaStream_t stream); + +template +void invokeLLaMAMemset0(T* dst, const int count, cudaStream_t stream); + +void invokeLLaMAGatherTokens(float* out, + const float* probs, + const int* input_lengths, + const int* target_ids, + const int* cu_seqlens, + const int batch_size, + const int vocab_size, + const int num_tokens, + cudaStream_t stream); + +void invokeLLaMALogSoftmax( + float* out, const float* logits, const int num_tokens, const int vocab_size, cudaStream_t stream); + +template +void invokeLLaMAGetLastTokens( + T* out, T* in, const int* cu_seqlens, int batch_size, int hidden_size, cudaStream_t stream); + +void invokeLLaMAExtractTargets(float* out, + float* in, + const int* target_ids, + const int* cu_seqlens, + int beam_width, + int batch_size, + int vocab_size, + int num_tokens, + cudaStream_t stream); +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu index d0fb0a197..a513c1b47 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.cu +++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu @@ -278,14 +278,14 @@ __global__ void softmax_kernel(T* attn_score, // Loop along with Q dimension. for (int64_t qi = blockIdx.x; qi < q_length; qi += gridDim.x) { - float data[ITEMS_PER_THREAD]; - int64_t qk_offset; - float local_max = -1e20f; + float data[ITEMS_PER_THREAD]; + int64_t qk_offset; + float local_max = -1e20f; // Loop along with K dimension. for (int64_t i = 0; blockDim.x * i + threadIdx.x < k_length; i++) { - int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. - qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; + int64_t ki = blockDim.x * i + threadIdx.x; // Index of K dimension. 
+ qk_offset = ((bi * head_num + hi) * q_length + qi) * k_length + ki; float qk_val = static_cast(qk[qk_offset]); float qk_bias = 0.0f; @@ -297,8 +297,8 @@ __global__ void softmax_kernel(T* attn_score, qk_bias += static_cast(linear_bias_slope * (ki - qi)); } - int64_t mask_offset = (bi * q_length + qi) * k_length + ki; - float mask_val = static_cast(ldg(&attn_mask[mask_offset])); + int64_t mask_offset = (bi * q_length + qi) * k_length + ki; + float mask_val = static_cast(ldg(&attn_mask[mask_offset])); qk_bias += (1.0f - mask_val) * -10000.0f; data[i] = qk_scale * qk_val + qk_bias; @@ -1363,9 +1363,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* const int head_idx = blockIdx.y; const int tidx = threadIdx.x; - const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + const bool is_masked = tidx * vec_size >= size_per_head; - const bool is_masked = tidx * vec_size >= size_per_head; // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts if (PREFIX_PROMPT && token_idx < 0) { const int prompt_batch_idx = blockIdx.x / param.max_prefix_prompt_length; @@ -1581,6 +1581,135 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #endif #undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSE +template +__global__ void llama_add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths) +{ + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = (padding_offset == nullptr || token_idx < 0) ? 
0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; + + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + + const bool is_masked = tidx * vec_size >= size_per_head; + + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; + const int n = head_num * size_per_head; + + const int src_q_idx = token_idx * 3 * n + hidden_idx; + const int src_k_idx = token_idx * 3 * n + hidden_idx + n; + const int src_v_idx = token_idx * 3 * n + hidden_idx + 2 * n; + + Vec_t q, k, v; + if (!is_masked) { + q = *reinterpret_cast(&QKV[src_q_idx]); + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } + + mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, context_lengths[batch_idx] + seq_idx); + + const int dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + const int dest_kv_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + + if (!is_masked) { + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + *reinterpret_cast(&k_buf[dest_kv_idx]) = k; + *reinterpret_cast(&v_buf[dest_kv_idx]) = v; + } +} + +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream) +{ + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + llama_add_fusedQKV_bias_transpose_kernel<<>>(q_buf, + k_buf, + v_buf, + QKV, + padding_offset, + batch_size, + seq_len, + head_num, + size_per_head, + rotary_embedding_dim, + context_lengths); +} + +template void invokeLLaMAAddFusedQKVBiasTranspose(float* q_buf, + float* k_buf, + float* v_buf, + float* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); + +template void invokeLLaMAAddFusedQKVBiasTranspose(half* q_buf, + half* k_buf, + half* v_buf, + half* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeLLaMAAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + __nv_bfloat16* k_buf, + __nv_bfloat16* v_buf, + __nv_bfloat16* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* context_lengths, + cudaStream_t stream); +#endif + template __global__ void transpose_4d(T* dst, T* src, @@ -1760,6 +1889,7 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int size_per_head, \ const int local_head_num, \ cudaStream_t stream) + INSTANTIATETRANSPOSE4DBATCHMAJOR(float); INSTANTIATETRANSPOSE4DBATCHMAJOR(half); #ifdef ENABLE_BF16 @@ -1767,6 +1897,166 @@ INSTANTIATETRANSPOSE4DBATCHMAJOR(__nv_bfloat16); #endif #undef INSTANTIATETRANSPOSE4DBATCHMAJOR +template +__global__ void 
transpose_4d_save_to_cache(T* k_dst, + const T* k_src, + T* v_dst, + const T* v_src, + const int* context_lengths, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len) +{ + // [batch_size, head_num, seq_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto key_dst = + reinterpret_cast(k_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * seq_len + + head_id * size_per_head * seq_len); + auto val_dst = + reinterpret_cast(v_dst + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len + context_lengths[batch_id] * size_per_head); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * seq_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + dim3 grid((seq_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); + + transpose_4d_save_to_cache<<>>( + k_dst, k_src, v_dst, v_src, context_lengths, head_num, size_per_head, seq_len, max_seq_len); +} + +#define INSTANTIATESAVETOCACHE(T) \ + template void invokeLLaMASaveToCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int* context_lengths, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ + const int seq_len, \ + const int max_seq_len, \ + cudaStream_t stream) +INSTANTIATESAVETOCACHE(float); +INSTANTIATESAVETOCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATESAVETOCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATESAVETOCACHE + +template +__global__ void transpose_4d_load_from_cache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len) +{ + // [batch_size, head_num, attn_len, size_per_head] + const int batch_id = blockIdx.y; + const int head_id = blockIdx.z; + + // 16 byte loads will handle "x" dimension + auto key_src = reinterpret_cast(k_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto key_dst = reinterpret_cast(k_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); + auto val_src = reinterpret_cast(v_src + batch_id * head_num * size_per_head * max_seq_len + + head_id * size_per_head * max_seq_len); + auto val_dst = reinterpret_cast(v_dst + batch_id * head_num * size_per_head * attn_len + + head_id * size_per_head * attn_len); + + // idx is over output dimension L * size_per_head / x for values + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int X_ELEMS = (sizeof(T) == 4) ? 
4 : 8; + const int size_per_head_div_x = size_per_head / X_ELEMS; + + if (idx >= size_per_head_div_x * attn_len) { + return; + } + + key_dst[idx] = key_src[idx]; + val_dst[idx] = val_src[idx]; +} + +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len, + cudaStream_t stream) +{ + constexpr int block_sz = 128; + constexpr int x = (sizeof(T) == 4) ? 4 : 8; + dim3 grid((attn_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num); + + transpose_4d_load_from_cache<<>>( + k_dst, v_dst, k_src, v_src, head_num, size_per_head, seq_len, attn_len, max_seq_len); +} + +#define INSTANTIATELOADFROMCACHE(T) \ + template void invokeLLaMALoadFromCache(T* k_dst, \ + T* v_dst, \ + const T* k_src, \ + const T* v_src, \ + const int batch_size, \ + const int head_num, \ + const int size_per_head, \ + const int seq_len, \ + const int attn_len, \ + const int max_seq_len, \ + cudaStream_t stream) +INSTANTIATELOADFROMCACHE(float); +INSTANTIATELOADFROMCACHE(half); +#ifdef ENABLE_BF16 +INSTANTIATELOADFROMCACHE(__nv_bfloat16); +#endif +#undef INSTANTIATELOADFROMCACHE + template __global__ void addRelativeAttentionBias( T* qk_buf, const T* relative_attention_bias, const int batch_size, const int head_num, const int seq_len) @@ -1827,8 +2117,8 @@ INSTANTIATEADDRELATIVEATTENTIONBIAS(__nv_bfloat16); // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template __global__ void add_head3Size_QKV_bias(const T* mm_qkv, @@ -1878,8 +2168,8 @@ __global__ void add_head3Size_QKV_bias(const T* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, 3*batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, 3*batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, @@ -1931,8 +2221,8 @@ __global__ void add_head3Size_QKV_bias(const float2* mm_qkv, // m = batch*window_num*window_len // mm_qkv is [m, head*3*size_per_head] row-major // bias_qkv is [head*3*size_per_head] -// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, size_per_head] row-major -// grid(window_len, window_num, batch); +// q_buf_, k_buf_, v_buf_ is [batch*window_num, num_head, window_len, +// size_per_head] row-major grid(window_len, window_num, batch); // block(num_head * size_per_head) template<> __global__ void add_head3Size_QKV_bias(const half2* mm_qkv, @@ -2122,7 +2412,8 @@ INSTANTIATEADDHEAD3SIZEQKVBIAS(__nv_bfloat16); #endif #undef INSTANTIATEADDHEAD3SIZEQKVBIAS -/******************* invokeMaskedSoftMaxWithRelPosBias ***********************/ +/******************* invokeMaskedSoftMaxWithRelPosBias + * ***********************/ // grid = (window_len/word_per_thread, window_num*num_head, batch_size) // block.x = max(32, (window_len + 
31)/32*32) @@ -2471,7 +2762,8 @@ __global__ void transpose_attentions( // attentions_in shape [B, H, S, S] // attentions_out shape [B, L, H, S, S]. // Note that we write the L dimension as if it was index 0. - // In reality, the pointer has already been shifted to point to the correct layer. + // In reality, the pointer has already been shifted to point to the + // correct layer. const auto batch_idx = blockIdx.x; const auto head_idx = blockIdx.y; diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h index 7ac7604d4..4f55af19e 100644 --- a/src/fastertransformer/kernels/unfused_attention_kernels.h +++ b/src/fastertransformer/kernels/unfused_attention_kernels.h @@ -113,6 +113,21 @@ struct PrefixPromptBatchWeightsParam { const size_t prefix_prompt_layer_offset_per_seq = 0; }; +template +void invokeLLaMAAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, + const int* padding_offset, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int size_per_head, + const int rotary_embedding_dim, + const int* start_pos, + cudaStream_t stream); + template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, @@ -189,6 +204,31 @@ void invokeTranspose4dBatchMajor(T* k_dst, const int local_head_num, cudaStream_t stream); +template +void invokeLLaMASaveToCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int* context_lengths, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int max_seq_len, + cudaStream_t stream); +template +void invokeLLaMALoadFromCache(T* k_dst, + T* v_dst, + const T* k_src, + const T* v_src, + const int batch_size, + const int head_num, + const int size_per_head, + const int seq_len, + const int attn_len, + const int max_seq_len, + cudaStream_t stream); + template void invokeAddRelativeAttentionBias(T* qk_buf, const T* relative_attention_bias, diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt index 1f0e93b1b..13821892d 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -44,6 +44,11 @@ set_property(TARGET GptContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) +add_library(LLaMAContextAttentionLayer STATIC LLaMAContextAttentionLayer.cc) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels trt_fused_multi_head_attention fpA_intB_gemm int8_gemm nvtx_utils) + add_library(DisentangledAttentionLayer STATIC DisentangledAttentionLayer.cc) set_property(TARGET DisentangledAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DisentangledAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc new file 
mode 100644
index 000000000..28c0f6f55
--- /dev/null
+++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.cc
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h"
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
+#include "src/fastertransformer/kernels/llama_kernels.h"
+#include "src/fastertransformer/utils/llama_utils.h"
+#include "src/fastertransformer/utils/nvtx_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+void LLaMAContextAttentionLayer<T>::forward(TensorMap*                output_tensors,
+                                            TensorMap*                input_tensors,
+                                            const AttentionWeight<T>* attention_weights)
+{
+    // input_tensors:
+    //      input_query [num_tokens, hidden_dimension]
+    //      attention_mask [batch_size, 1, seq_len, attn_len]
+    //      context_lengths, int, [batch_size]
+    //      attention_type [1]
+    //      padding_offset [num_tokens] (optional)
+    //      cu_seqlens [batch_size+1] (optional)
+
+    // output_tensors:
+    //      hidden_features [num_tokens, hidden_dimension]
+    //      key_cache [batch, local_head_num, max_seq_len, size_per_head]
+    //      value_cache [batch, local_head_num, max_seq_len, size_per_head]
+
+    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    FT_CHECK(output_tensors->at("key_cache").shape.size() == 4);
+    FT_CHECK(output_tensors->at("value_cache").shape.size() == 4);
+    const int batch_size  = input_tensors->at("attention_mask").shape[0];
+    const int seq_len     = input_tensors->at("attention_mask").shape[2];
+    const int attn_len    = input_tensors->at("attention_mask").shape[3];
+    const int max_seq_len = output_tensors->at("key_cache").shape[2];
+
+    T*         attention_input = input_tensors->at("input_query").getPtr<T>();
+    T*         attention_mask  = input_tensors->at("attention_mask").getPtr<T>();
+    const int* context_lengths = input_tensors->at("context_lengths").getPtr<int>();
+    const int* padding_offset  = input_tensors->getPtr<int>("padding_offset", nullptr);
+    const int* cu_seqlens      = input_tensors->getPtr<int>("cu_seqlens", nullptr);
+    const AttentionType attention_type = input_tensors->getVal<AttentionType>("attention_type");
+    T*         attention_out   = output_tensors->at("hidden_features").getPtr<T>();
+    T*         key_cache       = output_tensors->getPtr<T>("key_cache");
+    T*         value_cache     = output_tensors->getPtr<T>("value_cache");
+
+    FT_CHECK_WITH_INFO(seq_len <= attn_len, "attn_len must be larger than or equal to seq_len");
+    FT_CHECK_WITH_INFO(attention_type != AttentionType::FUSED_PADDED_MHA,
+                       "LLaMA Context FUSED_PADDED_MHA is not supported !");
+
+    PUSH_RANGE("attention buffer alloc");
+    allocateBuffer(batch_size, seq_len, attn_len);
+    POP_RANGE;
+    sync_check_cuda_error();
+
+    const int num_tokens = input_tensors->at("input_query").shape[0];
+
+    PUSH_RANGE("qkv_gemm");
+
+    cublas_wrapper_->Gemm(CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          3 * hidden_units_,  // n
+                          num_tokens,
hidden_units_, // k + attention_weights->query_weight.kernel, + 3 * hidden_units_, // n + attention_input, + hidden_units_, // k + qkv_buf_, + 3 * hidden_units_ /* n */); + sync_check_cuda_error(); + + if (padding_offset != nullptr) { + // q_buf_2_, k_buf_2_ and v_buf_2_ are continuous + //cudaMemsetAsync(q_buf_2_, 0, batch_size * (seq_len + 2 * attn_len) * hidden_units_ * sizeof(T), stream_); + invokeLLaMAMemset0(q_buf_2_, batch_size * (seq_len + 2 * attn_len) * hidden_units_, stream_); + sync_check_cuda_error(); + } + + invokeLLaMAAddFusedQKVBiasTranspose(q_buf_2_, + k_buf_2_, + v_buf_2_, + qkv_buf_, + padding_offset, + batch_size, + seq_len, + num_tokens, + head_num_, + size_per_head_, + rotary_embedding_dim_, + context_lengths, + stream_); + sync_check_cuda_error(); + + invokeLLaMASaveToCache(key_cache, + value_cache, + k_buf_2_, + v_buf_2_, + context_lengths, + batch_size, + head_num_, + size_per_head_, + seq_len, + max_seq_len, + stream_); + sync_check_cuda_error(); + + invokeLLaMALoadFromCache(k_buf_2_, + v_buf_2_, + key_cache, + value_cache, + batch_size, + head_num_, + size_per_head_, + seq_len, + attn_len, + max_seq_len, + stream_); + sync_check_cuda_error(); + + POP_RANGE; + + const cudaDataType_t gemm_data_type = getCudaDataType(); + const int attention_seq_len_1 = seq_len; // q length + const int attention_seq_len_2 = attn_len; // kv length + const T qk_scale = static_cast(1.0f / sqrtf(size_per_head_ * 1.0f)); + FT_CHECK(gemm_data_type != CUDA_R_32F); + + // + // softmax(Q*K^T) + // + PUSH_RANGE("Q*K batch gemm"); + + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, + CUBLAS_OP_N, + attention_seq_len_2, // n + attention_seq_len_1, // m + size_per_head_, // k + 1.0f, + k_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_2 * size_per_head_, // n * k + q_buf_2_, + gemm_data_type, + size_per_head_, // k + attention_seq_len_1 * size_per_head_, // m * k + 0.0f, + qk_buf_float_, + CUDA_R_32F, + attention_seq_len_2, // n + attention_seq_len_2 * attention_seq_len_1, + batch_size * head_num_, // global batch size + CUDA_R_32F); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("softmax"); + MaskedSoftmaxParam param; + param.attention_score = qk_buf_; // (batch_size, head_num, q_length, k_length) + param.qk = qk_buf_float_; // (batch_size, head_num, q_length, k_length) + param.attention_mask = attention_mask; // (batch_size, q_length, k_length) + param.batch_size = batch_size; + param.q_length = attention_seq_len_1; + param.k_length = attention_seq_len_2; + param.num_heads = head_num_; + param.qk_scale = qk_scale; + param.linear_bias_slopes = nullptr; + invokeMaskedSoftmax(param, stream_); + sync_check_cuda_error(); + POP_RANGE; + + PUSH_RANGE("QK*V batch gemm"); + cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, + CUBLAS_OP_N, + size_per_head_, + attention_seq_len_1, + attention_seq_len_2, + + v_buf_2_, + size_per_head_, + attention_seq_len_2 * size_per_head_, + + qk_buf_, + attention_seq_len_2, + attention_seq_len_1 * attention_seq_len_2, + + qkv_buf_2_, + size_per_head_, + attention_seq_len_1 * size_per_head_, + + batch_size * head_num_); + sync_check_cuda_error(); + + // transpose (batch_size, num_heads, L, Dh) to (batch_size, L, num_heads * Dh) + if (padding_offset == nullptr) { + invokeTransposeQKV(qkv_buf_3_, + qkv_buf_2_, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + else { + 
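+        // Padded case: transpose (batch_size, head_num, seq_len, size_per_head) back to
+        // (num_tokens, head_num * size_per_head) and drop the padding rows via padding_offset.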
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_, + qkv_buf_3_, + num_tokens, + batch_size, + attention_seq_len_1, + head_num_, + size_per_head_, + padding_offset, + attention_weights->attention_output_weight.scale, + 0, // int8_mode + stream_); + sync_check_cuda_error(); + } + POP_RANGE; + sync_check_cuda_error(); + + PUSH_RANGE("proj gemm"); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + hidden_units_, + num_tokens, + hidden_units_, + attention_weights->attention_output_weight.kernel, + hidden_units_, + qkv_buf_3_, + hidden_units_, + attention_out, + hidden_units_); + sync_check_cuda_error(); + POP_RANGE; + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } + sync_check_cuda_error(); + FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward): + BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, false), + head_num_(head_num), + size_per_head_(size_per_head), + hidden_units_(head_num * size_per_head), + rotary_embedding_dim_(rotary_embedding_dim) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); +} + +template +LLaMAContextAttentionLayer::LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer): + BaseAttentionLayer(attention_layer.stream_, + attention_layer.cublas_wrapper_, + attention_layer.allocator_, + attention_layer.is_free_buffer_after_forward_), + head_num_(attention_layer.head_num_), + size_per_head_(attention_layer.size_per_head_), + hidden_units_(attention_layer.hidden_units_), + rotary_embedding_dim_(attention_layer.rotary_embedding_dim_) +{ +} + +template +LLaMAContextAttentionLayer::~LLaMAContextAttentionLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMAContextAttentionLayer::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * 3 * batch_size * seq_len * hidden_units_, false); + q_buf_2_ = + (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * (seq_len + 2 * attn_len) * hidden_units_, false); + k_buf_2_ = q_buf_2_ + batch_size * seq_len * hidden_units_; + v_buf_2_ = k_buf_2_ + batch_size * attn_len * hidden_units_; + + // save memory usage when using fmha + qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * head_num_ * seq_len * attn_len, false); + qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * batch_size * seq_len * hidden_units_, false); + + qk_buf_float_ = + (float*)allocator_->reMalloc(qk_buf_float_, sizeof(float) * batch_size * head_num_ * seq_len * attn_len, false); + + is_allocate_buffer_ = true; +} + +template +void LLaMAContextAttentionLayer::freeBuffer() +{ + if (is_allocate_buffer_) { + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + allocator_->free((void**)(&qkv_buf_)); + allocator_->free((void**)(&q_buf_2_)); + allocator_->free((void**)(&k_buf_2_)); + allocator_->free((void**)(&v_buf_2_)); + allocator_->free((void**)(&qk_buf_)); + allocator_->free((void**)(&qkv_buf_2_)); + allocator_->free((void**)(&qkv_buf_3_)); + 
allocator_->free((void**)(&qk_buf_float_)); + + is_allocate_buffer_ = false; + } +} + +template class LLaMAContextAttentionLayer; +template class LLaMAContextAttentionLayer; +#ifdef ENABLE_BF16 +template class LLaMAContextAttentionLayer<__nv_bfloat16>; +#endif + +} // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h new file mode 100644 index 000000000..504cc8aba --- /dev/null +++ b/src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "3rdparty/trt_fused_multihead_attention/qkvToContext.h" +#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" +#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" +#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" + +namespace fastertransformer { + +template +class LLaMAContextAttentionLayer: public BaseAttentionLayer { +private: + // metadata + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t rotary_embedding_dim_; + + // fmha runner + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len); + void freeBuffer() override; + + using BaseAttentionLayer::is_free_buffer_after_forward_; + using BaseAttentionLayer::is_allocate_buffer_; + using BaseAttentionLayer::cublas_wrapper_; + +protected: + using BaseAttentionLayer::allocator_; + using BaseAttentionLayer::stream_; + T* qkv_buf_ = nullptr; + T* q_buf_2_ = nullptr; + T* k_buf_2_ = nullptr; + T* v_buf_2_ = nullptr; + T* qk_buf_ = nullptr; + float* qk_buf_float_ = nullptr; + T* qkv_buf_2_ = nullptr; + T* qkv_buf_3_ = nullptr; + +public: + LLaMAContextAttentionLayer(size_t head_num, + size_t size_per_head, + size_t local_head_num, + size_t rotary_embedding_dim, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward); + + LLaMAContextAttentionLayer(LLaMAContextAttentionLayer const& attention_layer); + + virtual ~LLaMAContextAttentionLayer(); + + void + forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) override; +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index 248b4af3d..afc4f8b7b 100644 --- a/src/fastertransformer/models/CMakeLists.txt +++ b/src/fastertransformer/models/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(gptj) add_subdirectory(gptneox) +add_subdirectory(llama) add_subdirectory(multi_gpu_gpt) if(ENABLE_FP8) add_subdirectory(gpt_fp8) diff --git 
a/src/fastertransformer/models/llama/CMakeLists.txt b/src/fastertransformer/models/llama/CMakeLists.txt new file mode 100644 index 000000000..287a350da --- /dev/null +++ b/src/fastertransformer/models/llama/CMakeLists.txt @@ -0,0 +1,51 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.8) + +add_library(LLaMADecoderLayerWeight STATIC LLaMADecoderLayerWeight.cc) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMADecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMADecoderLayerWeight PUBLIC memory_utils cuda_utils logger) + +add_library(LLaMAContextDecoder STATIC LLaMAContextDecoder.cc) +set_property(TARGET LLaMAContextDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAContextDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAContextDecoder PUBLIC -lcudart cublasMMWrapper + LLaMAContextAttentionLayer + FfnLayer + layernorm_kernels + add_residual_kernels + llama_kernels + tensor + nccl_utils + cuda_utils + logger) + +add_library(LLaMAWeight STATIC LLaMAWeight.cc) +set_property(TARGET LLaMAWeight PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMAWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMAWeight PUBLIC LLaMADecoderLayerWeight cuda_utils logger) + +add_library(LLaMA STATIC LLaMA.cc) +set_property(TARGET LLaMA PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET LLaMA PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(LLaMA PUBLIC -lcudart + LLaMAContextDecoder + decoding_kernels + llama_kernels + tensor + LLaMAWeight + cuda_utils + logger) diff --git a/src/fastertransformer/models/llama/LLaMA.cc b/src/fastertransformer/models/llama/LLaMA.cc new file mode 100644 index 000000000..1cc8a95c4 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.cc @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/kernels/decoding_kernels.h" +#include "src/fastertransformer/kernels/llama_kernels.h" +#include "src/fastertransformer/utils/llama_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include +#include + +namespace fastertransformer { + +template +void LLaMA::initialize() +{ + llama_context_decoder_ = new LLaMAContextDecoder(head_num_, + size_per_head_, + inter_size_, + num_layer_, + rotary_embedding_dim_, + layernorm_eps_, + rank_, + world_size_, + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + attention_type_); +} + +template +void LLaMA::allocateBuffer() +{ + FT_CHECK(false); +} + +template +void LLaMA::allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + + padding_offset_ = + reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); + cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + input_attention_mask_ = + (T*)(allocator_->reMalloc(input_attention_mask_, sizeof(T) * batch_size * seq_len * attn_len, false)); + + if (is_context) { + const size_t self_cache_size = + (num_layer_ / world_size_) * batch_size * max_seq_len_ * hidden_units_; + key_cache_ = (T*)(allocator_->reMalloc(key_cache_, sizeof(T) * self_cache_size * 2, false)); + value_cache_ = key_cache_ + self_cache_size; + } + + context_decoder_input_buf_ = + (T*)(allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + context_decoder_output_buf_ = (T*)(allocator_->reMalloc( + context_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + + context_output_buf_ = + (T*)(allocator_->reMalloc(context_output_buf_, sizeof(T) * batch_size * hidden_units_, false)); + normed_decoder_output_buf_ = + (T*)(allocator_->reMalloc(normed_decoder_output_buf_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + logits_buf_ = + (float*)(allocator_->reMalloc(logits_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + log_likelihood_buf_ = + (float*)(allocator_->reMalloc(log_likelihood_buf_, sizeof(float) * batch_size * seq_len * vocab_size_, false)); + + is_allocate_buffer_ = true; +} + +template +void LLaMA::freeBuffer() +{ + if (is_allocate_buffer_) { + allocator_->free((void**)(&padding_offset_)); + allocator_->free((void**)(&cu_seqlens_)); + allocator_->free((void**)(&input_attention_mask_)); + allocator_->free((void**)(&key_cache_)); + allocator_->free((void**)(&context_decoder_input_buf_)); + allocator_->free((void**)(&context_decoder_output_buf_)); + allocator_->free((void**)(&context_output_buf_)); + allocator_->free((void**)(&normed_decoder_output_buf_)); + allocator_->free((void**)(&logits_buf_)); + allocator_->free((void**)(&log_likelihood_buf_)); + is_allocate_buffer_ = false; + } +} + +template +LLaMA::LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop, + AttentionType attention_type): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), + head_num_(head_num), + 
size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), + hidden_units_(head_num * size_per_head), + rank_(rank), + world_size_(world_size), + attention_type_(attention_type) +{ + initialize(); +} + +template +LLaMA::LLaMA(LLaMA const& llama): + BaseLayer(llama), + head_num_(llama.head_num_), + size_per_head_(llama.size_per_head_), + inter_size_(llama.inter_size_), + num_layer_(llama.num_layer_), + vocab_size_(llama.vocab_size_), + rotary_embedding_dim_(llama.rotary_embedding_dim_), + random_seed_(llama.random_seed_), + max_seq_len_(llama.max_seq_len_), + hidden_units_(llama.hidden_units_), + rank_(llama.rank_), + world_size_(llama.world_size_), + attention_type_(llama.attention_type_) +{ + initialize(); +} + +template +LLaMA::~LLaMA() +{ + delete llama_context_decoder_; + freeBuffer(); +} + +template +void LLaMA::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LLaMAWeight* llama_weights) +{ + FT_CHECK(false); +} + +template +void LLaMA::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights) +{ + // + // input_tensors: + // input_ids [num_tokens] + // input_lengths [batch_size] + // target_ids [beam_width, num_tokens] + // context_lengths [batch_size] + // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu + + // output_tensors: + // hidden_vector [num_tokens, hidden_size] + // cum_probs [beam_width, batch_size] + + FT_CHECK_WITH_INFO(input_tensors->size() == 7, "input_tensors->size() == 7"); + FT_CHECK(input_tensors->at("input_ids").shape.size() == 1); + FT_CHECK(input_tensors->at("input_lengths").shape.size() == 1); + FT_CHECK(input_tensors->at("target_ids").shape.size() == 2); + FT_CHECK(input_tensors->at("context_lengths").shape.size() == 1); + + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("input_ids").shape[0]; + const size_t beam_width = input_tensors->at("target_ids").shape[0]; + + const int* input_ids = input_tensors->at("input_ids").getPtr(); + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* target_ids = input_tensors->at("target_ids").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("seq_len").getVal(); + const int attn_len = input_tensors->at("attn_len").getVal(); + const int is_context = input_tensors->at("is_context").getVal(); + T* hidden_vector = output_tensors->at("hidden_vector").getPtr(); + float* cum_probs = output_tensors->at("cum_probs").getPtr(); + + FT_CHECK_WITH_INFO(seq_len <= attn_len, "seq_len must be larger than or equal to attn_len"); + + allocateBuffer(batch_size, seq_len, attn_len, is_context); + sync_check_cuda_error(); + + if (is_unpadded_mha) { + invokeLLaMAGetPaddingOffsetAndCuSeqLens( + padding_offset_, cu_seqlens_, input_lengths, batch_size, seq_len, stream_); + sync_check_cuda_error(); + } + + invokeLLaMABuildDecoderAttentionMask( + input_attention_mask_, input_lengths, context_lengths, batch_size, seq_len, attn_len, stream_); + sync_check_cuda_error(); + + if (rank_ == 0) { + invokeLLaMAInputIdsEmbeddingLookup(context_decoder_input_buf_, + 
llama_weights->pre_decoder_embedding_table, + input_ids, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + } + + std::unordered_map decoder_input_tensors{ + {"decoder_input", + Tensor{MEMORY_GPU, + data_type, + {num_tokens, hidden_units_}, + rank_ == 0 ? context_decoder_input_buf_ : hidden_vector + }}, + {"attention_mask", + Tensor{MEMORY_GPU, data_type, {batch_size, 1, (size_t)seq_len, (size_t)(attn_len)}, input_attention_mask_}}, + {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, input_lengths}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size}, context_lengths}}, + {"seq_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &seq_len}}, + {"attn_len", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &attn_len}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; + + if (is_unpadded_mha) { + decoder_input_tensors.insert({"padding_offset", Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset_}}); + decoder_input_tensors.insert({"cu_seqlens", Tensor{MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_seqlens_}}); + } + + std::unordered_map decoder_output_tensors{ + {"decoder_output", + Tensor{MEMORY_GPU, + data_type, + {num_tokens, hidden_units_}, + (rank_ == world_size_ - 1) ? context_decoder_output_buf_ : hidden_vector + }}, + {"key_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + key_cache_}}, + {"value_cache", + Tensor{MEMORY_GPU, + data_type, + {num_layer_ / world_size_, batch_size, head_num_, max_seq_len_, size_per_head_}, + value_cache_}}}; + + llama_context_decoder_->forward( + &decoder_output_tensors, &decoder_input_tensors, &llama_weights->decoder_layer_weights); + sync_check_cuda_error(); + + if (is_context) { + invokeLLaMAGetLastTokens( + context_output_buf_, context_decoder_output_buf_, cu_seqlens_, batch_size, hidden_units_, stream_); + sync_check_cuda_error(); + + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + batch_size, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + batch_size, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, batch_size, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAExtractTargets( + cum_probs, log_likelihood_buf_, target_ids, cu_seqlens_, beam_width, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } + else { + invokeGeneralLLaMALayerNorm(normed_decoder_output_buf_, + context_decoder_output_buf_, + llama_weights->post_decoder_layernorm.gamma, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_32F, CUDA_R_32F); + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + vocab_size_, + num_tokens, + hidden_units_, + llama_weights->post_decoder_embedding.kernel, + vocab_size_, + normed_decoder_output_buf_, + hidden_units_, // n + logits_buf_, + vocab_size_); + sync_check_cuda_error(); + 
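+        // The logits projection above ran with FP16 inputs/outputs and FP32 accumulation
+        // (see setGemmConfig above); the call below restores the default FP16 GEMM configuration.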
cublas_wrapper_->setFP16GemmConfig(); + + invokeLLaMALogSoftmax(log_likelihood_buf_, logits_buf_, num_tokens, vocab_size_, stream_); + sync_check_cuda_error(); + + invokeLLaMAGatherTokens( + cum_probs, log_likelihood_buf_, input_lengths, target_ids, cu_seqlens_, batch_size, vocab_size_, num_tokens, stream_); + sync_check_cuda_error(); + } +} + +template class LLaMA; +template class LLaMA; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMA.h b/src/fastertransformer/models/llama/LLaMA.h new file mode 100644 index 000000000..117f87341 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMA.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h" +#include "src/fastertransformer/models/llama/LLaMAWeight.h" +#include "src/fastertransformer/utils/custom_ar_comm.h" + +namespace fastertransformer { + +template +class LLaMA: public BaseLayer { +private: + // meta data + size_t head_num_; + size_t size_per_head_; + size_t inter_size_; + size_t num_layer_; + size_t vocab_size_; + size_t rotary_embedding_dim_; + size_t random_seed_; + size_t max_seq_len_; + size_t hidden_units_; + size_t rank_; + size_t world_size_; + static constexpr float layernorm_eps_ = 1e-6f; + AttentionType attention_type_; + + LLaMAContextDecoder* llama_context_decoder_; + + void allocateBuffer() override; + void allocateBuffer(size_t batch_size, size_t seq_len, size_t attn_len, int is_context); + void freeBuffer() override; + + void initialize(); + +protected: + int* padding_offset_ = nullptr; + int* cu_seqlens_ = nullptr; + T* input_attention_mask_ = nullptr; + T* key_cache_ = nullptr; + T* value_cache_ = nullptr; + T* context_output_buf_ = nullptr; + T* normed_decoder_output_buf_ = nullptr; + float* logits_buf_ = nullptr; + float* log_likelihood_buf_ = nullptr; + T* context_decoder_input_buf_ = nullptr; + T* context_decoder_output_buf_ = nullptr; + + void sendTensorsToFirstPipelineNode(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors); + +public: + LLaMA(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t vocab_size, + size_t rotary_embedding_dim, + size_t random_seed, + size_t max_seq_len, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + cudaDeviceProp* cuda_device_prop = nullptr, + AttentionType attention_type = AttentionType::UNFUSED_MHA); + + LLaMA(LLaMA const& LLaMA); + + ~LLaMA(); + + void forward(std::vector* output_tensors, + const std::vector* input_tensors, + const LLaMAWeight* llama_weights); + + void forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const LLaMAWeight* llama_weights); + +}; + +} // namespace fastertransformer diff --git 
a/src/fastertransformer/models/llama/LLaMAContextDecoder.cc b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc
new file mode 100644
index 000000000..7406c838f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.cc
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/fastertransformer/models/llama/LLaMAContextDecoder.h"
+#include "src/fastertransformer/kernels/llama_kernels.h"
+#include "src/fastertransformer/layers/FfnLayer.h"
+#include "src/fastertransformer/layers/attention_layers/LLaMAContextAttentionLayer.h"
+#include "src/fastertransformer/utils/llama_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+void LLaMAContextDecoder<T>::initialize()
+{
+    self_attention_layer_ = new LLaMAContextAttentionLayer<T>(head_num_,
+                                                              size_per_head_,
+                                                              head_num_,
+                                                              rotary_embedding_dim_,
+                                                              stream_,
+                                                              cublas_wrapper_,
+                                                              allocator_,
+                                                              is_free_buffer_after_forward_);
+
+    ffn_layer_ = new SiluFfnLayer<T>(0,  // max_batch_size
+                                     0,  // max_seq_len
+                                     head_num_,
+                                     size_per_head_,
+                                     0,  // expert_num
+                                     inter_size_,
+                                     stream_,
+                                     cublas_wrapper_,
+                                     allocator_,
+                                     is_free_buffer_after_forward_,
+                                     false,
+                                     true  // use_gated_activation
+    );
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::allocateBuffer()
+{
+    FT_CHECK(false);
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::allocateBuffer(size_t batch_size, size_t seq_len)
+{
+    decoder_normed_input_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    self_attn_output_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(self_attn_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    decoder_layer_output_ = reinterpret_cast<T*>(
+        allocator_->reMalloc(decoder_layer_output_, sizeof(T) * batch_size * seq_len * hidden_units_, false));
+    is_allocate_buffer_ = true;
+}
+
+template<typename T>
+void LLaMAContextDecoder<T>::freeBuffer()
+{
+    if (is_allocate_buffer_ == true) {
+        allocator_->free((void**)(&decoder_normed_input_));
+        allocator_->free((void**)(&self_attn_output_));
+        allocator_->free((void**)(&decoder_layer_output_));
+        is_allocate_buffer_ = false;
+    }
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isValidLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l >= local_num_layer * rank_) && (l < local_num_layer * (rank_ + 1));
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isFirstLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l == local_num_layer * rank_);
+}
+
+template<typename T>
+bool LLaMAContextDecoder<T>::isLastLayerParallelId(uint l)
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return l < num_layer_ && (l == local_num_layer * (rank_ + 1) - 1);
+}
+
+template<typename T>
+int LLaMAContextDecoder<T>::getFirstLayerParallelId()
+{
+    int local_num_layer = (int)(ceil(num_layer_ * 1.0f / world_size_));
+    return local_num_layer
* rank_; +} + +template +LLaMAContextDecoder::LLaMAContextDecoder(size_t head_num, + size_t size_per_head, + size_t inter_size, + size_t num_layer, + size_t rotary_embedding_dim, + float layernorm_eps, + size_t rank, + size_t world_size, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, + bool is_free_buffer_after_forward, + AttentionType attention_type): + BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), + head_num_(head_num), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layer_(num_layer), + rotary_embedding_dim_(rotary_embedding_dim), + layernorm_eps_(layernorm_eps), + hidden_units_(head_num * size_per_head), + rank_(rank), + world_size_(world_size), + attention_type_(attention_type) +{ + initialize(); +} + +template +LLaMAContextDecoder::LLaMAContextDecoder(LLaMAContextDecoder const& decoder): + BaseLayer(decoder.stream_, decoder.cublas_wrapper_, decoder.allocator_, decoder.is_free_buffer_after_forward_), + head_num_(decoder.head_num_), + size_per_head_(decoder.size_per_head_), + inter_size_(decoder.inter_size_), + num_layer_(decoder.num_layer_), + rotary_embedding_dim_(decoder.rotary_embedding_dim_), + layernorm_eps_(decoder.layernorm_eps_), + hidden_units_(decoder.hidden_units_), + rank_(decoder.rank_), + world_size_(decoder.world_size_), + attention_type_(decoder.attention_type_) +{ + initialize(); +} + +template +LLaMAContextDecoder::~LLaMAContextDecoder() +{ + delete self_attention_layer_; + delete ffn_layer_; + freeBuffer(); +} + +template +void LLaMAContextDecoder::forward(std::vector* output_tensors, + const std::vector* input_tensors, + const std::vector*>* llama_decoder_layer_weight) +{ + std::unordered_map input_tensors_map{{"decoder_input", input_tensors->at(0)}, + {"attention_mask", input_tensors->at(1)}, + {"input_lengths", input_tensors->at(2)}, + {"context_lengths", input_tensors->at(3)}, + {"seq_len", input_tensors->at(4)}}; + std::unordered_map output_tensors_map{{"decoder_output", output_tensors->at(0)}, + {"key_cache", output_tensors->at(1)}, + {"value_cache", output_tensors->at(2)}}; + + forward(&output_tensors_map, &input_tensors_map, llama_decoder_layer_weight); +} + +template +void LLaMAContextDecoder::forward(std::unordered_map* output_tensors, + const std::unordered_map* input_tensors, + const std::vector*>* llama_decoder_layer_weight) +{ + // input tensors: + // decoder_input [num_tokens, hidden_dimension], + // attention_mask [batch_size, 1, seq_len, attn_len] + // input_lengths [batch_size] + // context_lengths [batch_size] + // seq_len [1] int on cpu + // attn_len [1] int on cpu + // is_context [1] int on cpu + // padding_offset [batch_size] int on cpu + // cu_seqlens [batch_size+1] int on cpu + + // output tensors: + // decoder_output [num_tokens, hidden_dimension], + // key_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + // value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head] + + FT_CHECK(input_tensors->size() >= 5); + FT_CHECK(output_tensors->size() == 3); + const DataType data_type = getTensorType(); + const bool is_unpadded_mha = isUnPaddedMHA(attention_type_); + const size_t batch_size = input_tensors->at("input_lengths").shape[0]; + const size_t num_tokens = input_tensors->at("decoder_input").shape[0]; + + const int* input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* context_lengths = input_tensors->at("context_lengths").getPtr(); + const int seq_len = input_tensors->at("attention_mask").shape[2]; + 
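+    // seq_len above is the query length of the current chunk; attn_len below is the
+    // key/value length it attends to (typically context_length + seq_len).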
const int attn_len = input_tensors->at("attention_mask").shape[3]; + const int is_context = input_tensors->at("is_context").getVal(); + const int* padding_offset = nullptr; + const int* cu_seqlens = nullptr; + if (is_unpadded_mha) { + padding_offset = input_tensors->at("padding_offset").getPtr(); + cu_seqlens = input_tensors->at("cu_seqlens").getPtr(); + } + allocateBuffer(batch_size, seq_len); + sync_check_cuda_error(); + + T* decoder_input = input_tensors->at("decoder_input").getPtr(); + T* decoder_output = output_tensors->at("decoder_output").getPtr(); + const T* attention_mask = input_tensors->at("attention_mask").getPtr(); + + Tensor& k_cache = output_tensors->at("key_cache"); + Tensor& v_cache = output_tensors->at("value_cache"); + std::vector self_k_cache_size; + self_k_cache_size.push_back(batch_size); + for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) { + self_k_cache_size.push_back(*t); + } + std::vector self_v_cache_size; + self_v_cache_size.push_back(batch_size); + for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) { + self_v_cache_size.push_back(*t); + } + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l) == false) { + continue; + } + + const bool is_final = false; + T* layer_input = decoder_layer_output_; + T* layer_output = decoder_layer_output_; + if (isFirstLayerParallelId(l)) { + layer_input = decoder_input; + } + if (isLastLayerParallelId(l)) { + layer_output = decoder_output; + } + + invokeGeneralLLaMALayerNorm(decoder_normed_input_, + layer_input, + llama_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap self_attention_input_tensors{ + {"input_query", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}, + {"attention_mask", + Tensor{MEMORY_GPU, + data_type, + {(size_t)batch_size, (size_t)1, (size_t)seq_len, (size_t)(attn_len)}, + attention_mask}}, + {"context_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, context_lengths}}, + {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type_}}, + {"is_context", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &is_context}}, + }; + + if (is_unpadded_mha) { + self_attention_input_tensors.insert("padding_offset", + Tensor{MEMORY_GPU, TYPE_INT32, {num_tokens}, padding_offset}); + self_attention_input_tensors.insert("cu_seqlens", + Tensor{MEMORY_GPU, TYPE_INT32, {size_t(batch_size + 1)}, cu_seqlens}); + } + + size_t cache_offset = l - getFirstLayerParallelId(); + for (auto t = k_cache.shape.begin() + 1; t != k_cache.shape.end(); ++t) { + cache_offset *= *t; + }; + + TensorMap self_attention_output_tensors{ + {"hidden_features", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, self_attn_output_}}, + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + + self_attention_layer_->forward(&self_attention_output_tensors, + &self_attention_input_tensors, + &llama_decoder_layer_weight->at(l)->self_attention_weights); + + invokeGeneralLLaMAAddBiasResidualPreLayerNorm( + self_attn_output_, + decoder_normed_input_, + self_attn_output_, + layer_input, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.gamma, + llama_decoder_layer_weight->at(l)->post_attention_layernorm_weights.beta, + 
llama_decoder_layer_weight->at(l)->self_attention_weights.attention_output_weight.bias, + layernorm_eps_, + num_tokens, + hidden_units_, + stream_); + sync_check_cuda_error(); + + TensorMap ffn_input_tensors( + {{"ffn_input", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, decoder_normed_input_}}}); + TensorMap ffn_output_tensors( + {{"ffn_output", Tensor{MEMORY_GPU, data_type, {num_tokens, (size_t)hidden_units_}, layer_output}}}); + ffn_layer_->forward(&ffn_output_tensors, &ffn_input_tensors, &llama_decoder_layer_weight->at(l)->ffn_weights); + + invokeAddBiasResidual(layer_output, + self_attn_output_, + llama_decoder_layer_weight->at(l)->ffn_weights.output_weight.bias, + num_tokens, + hidden_units_, + stream_); + + sync_check_cuda_error(); + } + + if (is_free_buffer_after_forward_ == true) { + freeBuffer(); + } +} + +template class LLaMAContextDecoder; +template class LLaMAContextDecoder; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAContextDecoder.h b/src/fastertransformer/models/llama/LLaMAContextDecoder.h new file mode 100644 index 000000000..3e2aeb0c0 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAContextDecoder.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "src/fastertransformer/kernels/add_residual_kernels.h"
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/layers/BaseLayer.h"
+#include "src/fastertransformer/layers/FfnLayer.h"
+#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
+#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h"
+#include "src/fastertransformer/utils/Tensor.h"
+#include "src/fastertransformer/utils/allocator.h"
+#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/fastertransformer/utils/nccl_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+class LLaMAContextDecoder: public BaseLayer {
+private:
+    // meta data
+    size_t head_num_;
+    size_t size_per_head_;
+    size_t inter_size_;
+    size_t num_layer_;
+    size_t rotary_embedding_dim_;
+    float layernorm_eps_;
+    size_t hidden_units_;
+    size_t rank_;
+    size_t world_size_;
+    AttentionType attention_type_;
+
+    BaseAttentionLayer<T>* self_attention_layer_;
+    FfnLayer<T>* ffn_layer_;
+
+    void allocateBuffer() override;
+    void allocateBuffer(size_t batch_size, size_t seq_len);
+    void freeBuffer() override;
+
+    bool isValidLayerParallelId(uint l);
+    bool isFirstLayerParallelId(uint l);
+    bool isLastLayerParallelId(uint l);
+    int getFirstLayerParallelId();
+
+    void initialize();
+
+protected:
+    T* decoder_normed_input_ = nullptr;
+    T* self_attn_output_ = nullptr;
+    T* decoder_layer_output_ = nullptr;
+
+public:
+    LLaMAContextDecoder(size_t head_num,
+                        size_t size_per_head,
+                        size_t inter_size,
+                        size_t num_layer,
+                        size_t rotary_embedding_dim,
+                        float layernorm_eps,
+                        size_t rank,
+                        size_t world_size,
+                        cudaStream_t stream,
+                        cublasMMWrapper* cublas_wrapper,
+                        IAllocator* allocator,
+                        bool is_free_buffer_after_forward,
+                        AttentionType attention_type = AttentionType::FUSED_MHA);
+
+    LLaMAContextDecoder(LLaMAContextDecoder<T> const& decoder);
+
+    ~LLaMAContextDecoder();
+
+    void forward(std::vector<Tensor>* output_tensors,
+                 const std::vector<Tensor>* input_tensors,
+                 const std::vector<LLaMADecoderLayerWeight<T>*>* decoder_layer_weights);
+
+    void forward(std::unordered_map<std::string, Tensor>* output_tensors,
+                 const std::unordered_map<std::string, Tensor>* input_tensors,
+                 const std::vector<LLaMADecoderLayerWeight<T>*>* llama_decoder_layer_weight);
+};
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc
new file mode 100644
index 000000000..6f3a7721f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" + +namespace fastertransformer { + +template +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const int hidden_units, const int inter_size): + hidden_units_(hidden_units), inter_size_(inter_size) +{ + mallocWeights(); + setWeightPtr(); +} + +template +LLaMADecoderLayerWeight::~LLaMADecoderLayerWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < 14; i++) { + if (i != attention_dense_bias_weight_id) { + cudaFree(weights_ptr[i]); + } + } + + pre_layernorm_weights.beta = nullptr; + pre_layernorm_weights.gamma = nullptr; + self_attention_weights.query_weight.kernel = nullptr; + self_attention_weights.query_weight.bias = nullptr; + self_attention_weights.attention_output_weight.kernel = nullptr; + self_attention_weights.attention_output_weight.bias = nullptr; + post_attention_layernorm_weights.beta = nullptr; + post_attention_layernorm_weights.gamma = nullptr; + + ffn_weights.intermediate_weight.kernel = nullptr; + ffn_weights.intermediate_weight.bias = nullptr; + ffn_weights.output_weight.kernel = nullptr; + ffn_weights.output_weight.bias = nullptr; + is_maintain_buffer = false; + } +} + +template +LLaMADecoderLayerWeight::LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other): + hidden_units_(other.hidden_units_), inter_size_(other.inter_size_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); +} + +template +LLaMADecoderLayerWeight& LLaMADecoderLayerWeight::operator=(const LLaMADecoderLayerWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + + mallocWeights(); + + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ * hidden_units_); + cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], inter_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], hidden_units_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], hidden_units_ * inter_size_); + cudaD2Dcpy(weights_ptr[11], 
other.weights_ptr[11], inter_size_); + cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); + cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + setWeightPtr(); + return *this; +} + +template +void LLaMADecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) +{ + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin( + weights_ptr[0], {(size_t)hidden_units_}, dir_path + ".attention_norm.bias.bin", model_file_type); + loadWeightFromBin( + weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".attention_norm.weight.bin", model_file_type); + + loadWeightFromBin(weights_ptr[2], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(3 * hidden_units_)}, + dir_path + ".attention.query_key_value.bias.bin", + model_file_type); + + loadWeightFromBin(weights_ptr[4], + {(size_t)(hidden_units_), (size_t)hidden_units_}, + dir_path + ".attention.wo.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[5], {(size_t)hidden_units_}, dir_path + ".attention.wo.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[6], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w1.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[7], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w1.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_), (size_t)hidden_units_}, + dir_path + ".feed_forward.w2.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[9], {(size_t)hidden_units_}, dir_path + ".feed_forward.w2.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[10], + {(size_t)hidden_units_, (size_t)(inter_size_)}, + dir_path + ".feed_forward.w3.weight.bin", + model_file_type); + loadWeightFromBin( + weights_ptr[11], {(size_t)(inter_size_)}, dir_path + ".feed_forward.w3.bias.bin", model_file_type); + + loadWeightFromBin(weights_ptr[12], {(size_t)hidden_units_}, dir_path + ".ffn_norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".ffn_norm.weight.bin", model_file_type); +} + +template +void LLaMADecoderLayerWeight::setWeightPtr() +{ + pre_layernorm_weights.beta = weights_ptr[0]; + pre_layernorm_weights.gamma = weights_ptr[1]; + self_attention_weights.query_weight.kernel = weights_ptr[2]; + self_attention_weights.query_weight.bias = weights_ptr[3]; + self_attention_weights.attention_output_weight.kernel = weights_ptr[4]; + self_attention_weights.attention_output_weight.bias = weights_ptr[5]; + + ffn_weights.intermediate_weight.kernel = weights_ptr[6]; + ffn_weights.intermediate_weight.bias = weights_ptr[7]; + ffn_weights.output_weight.kernel = weights_ptr[8]; + ffn_weights.output_weight.bias = weights_ptr[9]; + ffn_weights.intermediate_weight2.kernel = weights_ptr[10]; + ffn_weights.intermediate_weight2.bias = weights_ptr[11]; + + post_attention_layernorm_weights.beta = weights_ptr[12]; + post_attention_layernorm_weights.gamma = weights_ptr[13]; + is_maintain_buffer = true; +} + +template +void LLaMADecoderLayerWeight::mallocWeights() +{ + deviceMalloc(&weights_ptr[0], hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_); + deviceMalloc(&weights_ptr[3], 3 * hidden_units_); + deviceMalloc(&weights_ptr[4], hidden_units_ * hidden_units_); + deviceMalloc(&weights_ptr[5], hidden_units_); + 
+    deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_);
+    deviceMalloc(&weights_ptr[7], inter_size_);
+    deviceMalloc(&weights_ptr[8], inter_size_ * hidden_units_);
+    deviceMalloc(&weights_ptr[9], hidden_units_);
+    deviceMalloc(&weights_ptr[10], hidden_units_ * inter_size_);
+    deviceMalloc(&weights_ptr[11], inter_size_);
+    deviceMalloc(&weights_ptr[12], hidden_units_);
+    deviceMalloc(&weights_ptr[13], hidden_units_);
+}
+
+template struct LLaMADecoderLayerWeight<float>;
+template struct LLaMADecoderLayerWeight<half>;
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h
new file mode 100644
index 000000000..35d16300f
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/fastertransformer/layers/FfnWeight.h"
+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
+#include "src/fastertransformer/utils/cuda_utils.h"
+
+namespace fastertransformer {
+
+template<typename T>
+struct LLaMADecoderLayerWeight {
+public:
+    LLaMADecoderLayerWeight() = default;
+    LLaMADecoderLayerWeight(const int hidden_units, const int inter_size);
+    ~LLaMADecoderLayerWeight();
+    LLaMADecoderLayerWeight(const LLaMADecoderLayerWeight& other);
+    LLaMADecoderLayerWeight& operator=(const LLaMADecoderLayerWeight& other);
+
+    void loadModel(std::string dir_path, FtCudaDataType model_file_type);
+
+    LayerNormWeight<T> pre_layernorm_weights;
+    AttentionWeight<T> self_attention_weights;
+    LayerNormWeight<T> post_attention_layernorm_weights;
+    FfnWeight<T>       ffn_weights;
+
+private:
+    int hidden_units_;
+    int inter_size_;
+    const int attention_dense_bias_weight_id = 5;
+    bool is_maintain_buffer = false;
+    T* weights_ptr[14];
+
+    void setWeightPtr();
+    void mallocWeights();
+};
+
+}  // namespace fastertransformer
diff --git a/src/fastertransformer/models/llama/LLaMAWeight.cc b/src/fastertransformer/models/llama/LLaMAWeight.cc
new file mode 100644
index 000000000..f1c51e340
--- /dev/null
+++ b/src/fastertransformer/models/llama/LLaMAWeight.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "src/fastertransformer/models/llama/LLaMAWeight.h" + +namespace fastertransformer { + +template +LLaMAWeight::LLaMAWeight(const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size, + const int layer_para_rank): + hidden_units_(hidden_units), + inter_size_(inter_size), + vocab_size_(vocab_size), + num_layer_(num_layer), + layer_para_size_(layer_para_size), + layer_para_rank_(layer_para_rank) +{ + FT_CHECK(num_layer_ % layer_para_size_ == 0); + + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(hidden_units_, inter_size_)); + } + else { + // Layer-parallelism: allocate empty layer because + // this rank does not compute it: + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight(0, 0)); + } + } + + mallocWeights(); + setWeightPtr(); +} + +template +LLaMAWeight::~LLaMAWeight() +{ + if (is_maintain_buffer == true) { + for (int i = 0; i < weights_ptr.size(); i++) { + deviceFree(weights_ptr[i]); + } + + pre_decoder_embedding_table = nullptr; + post_decoder_layernorm.beta = nullptr; + post_decoder_layernorm.gamma = nullptr; + post_decoder_embedding.kernel = nullptr; + is_maintain_buffer = false; + } +} + +template +LLaMAWeight::LLaMAWeight(const LLaMAWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + vocab_size_(other.vocab_size_), + num_layer_(other.num_layer_), + layer_para_size_(other.layer_para_size_), + layer_para_rank_(other.layer_para_rank_), + prompt_token_weight_size_(other.prompt_token_weight_size_) +{ + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + // prompt learning table: malloc weights and set weight ptr + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } +} + +template +LLaMAWeight& LLaMAWeight::operator=(const LLaMAWeight& other) +{ + hidden_units_ = other.hidden_units_; + inter_size_ = other.inter_size_; + vocab_size_ = other.vocab_size_; + num_layer_ = other.num_layer_; + layer_para_size_ = other.layer_para_size_; + layer_para_rank_ = other.layer_para_rank_; + prompt_token_weight_size_ = other.prompt_token_weight_size_; + + mallocWeights(); + cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], vocab_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_); + cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], hidden_units_ * vocab_size_); + + setWeightPtr(); + + decoder_layer_weights.clear(); + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(other.decoder_layer_weights[l]); + } + return *this; +} + +template +void LLaMAWeight::setWeightPtr() +{ + pre_decoder_embedding_table = weights_ptr[0]; + post_decoder_layernorm.beta = weights_ptr[1]; + post_decoder_layernorm.gamma = weights_ptr[2]; + post_decoder_embedding.kernel = weights_ptr[3]; +} + +template +void LLaMAWeight::mallocWeights() +{ + weights_ptr.resize(num_base_weights); + + 
deviceMalloc(&weights_ptr[0], vocab_size_ * hidden_units_); + deviceMalloc(&weights_ptr[1], hidden_units_); + deviceMalloc(&weights_ptr[2], hidden_units_); + deviceMalloc(&weights_ptr[3], hidden_units_ * vocab_size_); + + is_maintain_buffer = true; +} + +template +void LLaMAWeight::loadModel(std::string dir_path) +{ + FtCudaDataType model_file_type = getModelFileType(dir_path + "/config.ini", "llama"); + FT_CHECK(is_maintain_buffer == true); + + loadWeightFromBin(weights_ptr[0], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.tok_embeddings.weight.bin", + model_file_type); + loadWeightFromBin(weights_ptr[1], {(size_t)hidden_units_}, dir_path + "/model.norm.bias.bin", model_file_type); + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_}, dir_path + "/model.norm.weight.bin", model_file_type); + loadWeightFromBin(weights_ptr[3], + {(size_t)(vocab_size_ * hidden_units_)}, + dir_path + "/model.output.weight.bin", + model_file_type); + + for (int l = 0; l < num_layer_; l++) { + if (isValidLayerParallelId(l)) { + decoder_layer_weights[l]->loadModel(dir_path + "/model.layers." + std::to_string(l), model_file_type); + } + } +} + +template +void LLaMAWeight::resizeLayer(const int num_layer) +{ + num_layer_ = num_layer; + decoder_layer_weights.reserve(num_layer_); + for (int l = 0; l < num_layer_; l++) { + decoder_layer_weights.push_back(new LLaMADecoderLayerWeight()); + } +} + +template +bool LLaMAWeight::isValidLayerParallelId(int l) +{ + int local_num_layer = (int)(ceil(num_layer_ * 1.0f / layer_para_size_)); + return l < num_layer_ && (l >= local_num_layer * layer_para_rank_) + && (l < local_num_layer * (layer_para_rank_ + 1)); +} + +template struct LLaMAWeight; +template struct LLaMAWeight; + +} // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LLaMAWeight.h b/src/fastertransformer/models/llama/LLaMAWeight.h new file mode 100644 index 000000000..e1fed4309 --- /dev/null +++ b/src/fastertransformer/models/llama/LLaMAWeight.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/fastertransformer/models/llama/LLaMADecoderLayerWeight.h" +#include "src/fastertransformer/utils/memory_utils.h" +#include "src/fastertransformer/utils/prompt_learning.h" + +namespace fastertransformer { + +template +struct LLaMAWeight { + + LLaMAWeight() = default; + LLaMAWeight( + const int hidden_units, + const int inter_size, + const int vocab_size, + const int num_layer, + const int layer_para_size = 1, + const int layer_para_rank = 0); + + ~LLaMAWeight(); + LLaMAWeight(const LLaMAWeight& other); + LLaMAWeight& operator=(const LLaMAWeight& other); + + void loadModel(std::string dir_path); + + void resizeLayer(const int num_layer); + + std::vector*> decoder_layer_weights; + const T* pre_decoder_embedding_table = nullptr; + const T* position_encoding_table = nullptr; + + LayerNormWeight post_decoder_layernorm; + DenseWeight post_decoder_embedding; + +private: + void setWeightPtr(); + void mallocWeights(); + bool isValidLayerParallelId(int l); + + int hidden_units_; + int inter_size_; + int vocab_size_; + int num_layer_; + + int layer_para_size_; + int layer_para_rank_; + + // prompt learning pair (task_name, (task_name_id, prompt_len)) + // each prompt token's weight size + size_t prompt_token_weight_size_ = 0; + + bool is_maintain_buffer = false; + const size_t num_base_weights = 4; + std::vector weights_ptr = std::vector(num_base_weights); +}; + +} // namespace fastertransformer diff --git a/src/fastertransformer/th_op/CMakeLists.txt b/src/fastertransformer/th_op/CMakeLists.txt index b9f2b9151..4e8d82d30 100644 --- a/src/fastertransformer/th_op/CMakeLists.txt +++ b/src/fastertransformer/th_op/CMakeLists.txt @@ -32,6 +32,7 @@ add_subdirectory(t5) add_subdirectory(bart) add_subdirectory(bert) add_subdirectory(deberta) +add_subdirectory(llama) add_library(th_transformer SHARED $ @@ -49,6 +50,7 @@ add_library(th_transformer SHARED $ $ $ + $ ) target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_bart @@ -66,6 +68,7 @@ target_link_libraries(th_transformer PUBLIC "${TORCH_LIBRARIES}" th_t5 th_utils th_vit + th_llama ) if(ENABLE_FP8) diff --git a/src/fastertransformer/th_op/llama/CMakeLists.txt b/src/fastertransformer/th_op/llama/CMakeLists.txt new file mode 100755 index 000000000..75d13790e --- /dev/null +++ b/src/fastertransformer/th_op/llama/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(th_llama STATIC LLaMA.cc) +set_property(TARGET th_llama PROPERTY POSITION_INDEPENDENT_CODE ON) +target_link_libraries(th_llama PRIVATE "${TORCH_LIBRARIES}" LLaMA th_utils nccl_utils) diff --git a/src/fastertransformer/th_op/llama/LLaMA.cc b/src/fastertransformer/th_op/llama/LLaMA.cc new file mode 100755 index 000000000..760ead92e --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.cc @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/th_op/llama/LLaMA.h" + +namespace th = torch; +namespace torch_ext { + +LLaMA::LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights): + vocab_size_(vocab_size), st_(weights[0].scalar_type()) +{ + for (auto t : weights) { + CHECK_INPUT(t, st_); + } + + switch (st_) { + case at::ScalarType::Float: + ftllama = new FTLLaMA((size_t)num_heads, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)num_layers, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + (size_t)random_seed, + (size_t)max_seq_len, + (size_t)rank, + (size_t)world_size, + weights); + break; + case at::ScalarType::Half: + ftllama = new FTLLaMA((size_t)num_heads, + (size_t)size_per_head, + (size_t)inter_size, + (size_t)num_layers, + (size_t)vocab_size, + (size_t)rotary_embedding_dim, + (size_t)random_seed, + (size_t)max_seq_len, + (size_t)rank, + (size_t)world_size, + weights); + break; + default: + throw std::runtime_error("Wrong Tensor type."); + } +} + +LLaMA::~LLaMA() +{ + delete ftllama; +} + +std::vector LLaMA::forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int64_t seq_len, + const int64_t attn_len, + const int64_t is_context) +{ + CHECK_TH_CUDA(input_ids); + CHECK_CONTIGUOUS(input_ids); + TORCH_CHECK(input_ids.dtype() == torch::kInt32, "input_ids dtype should be int32"); + CHECK_TH_CUDA(input_lengths); + CHECK_CONTIGUOUS(input_lengths); + TORCH_CHECK(input_lengths.dtype() == torch::kInt32, "input_lengths dtype should be int32"); + + ftllama->forward(hidden_vector, + cum_probs, + input_ids, + input_lengths, + target_ids, + context_lengths, + seq_len, + attn_len, + is_context); + return std::vector{hidden_vector, cum_probs}; +} + +} // namespace torch_ext + +static auto fasterTransformerGptTHS = +#ifdef LEGACY_THS + torch::jit::class_("FasterTransformerLLaMA") +#else + torch::jit::class_("FasterTransformer", "LLaMA") +#endif + .def(torch::jit::init>()) + .def("forward", &torch_ext::LLaMA::forward); diff --git a/src/fastertransformer/th_op/llama/LLaMA.h b/src/fastertransformer/th_op/llama/LLaMA.h new file mode 100755 index 000000000..425f260df --- /dev/null +++ b/src/fastertransformer/th_op/llama/LLaMA.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/models/llama/LLaMA.h" +#include "src/fastertransformer/th_op/th_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/nccl_utils.h" + +namespace ft = fastertransformer; +namespace th = torch; +namespace torch_ext { + +using std::vector; + +class IFLLaMA { +public: + virtual ~IFLLaMA() {} + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int seq_len, + const int attn_len, + const int is_context) = 0; +}; + +template +class FTLLaMA: public IFLLaMA { +public: + FTLLaMA(const size_t num_heads, + const size_t size_per_head, + const size_t inter_size, + const size_t num_layers, + const size_t vocab_size, + const size_t rotary_embedding_dim, + const size_t random_seed, + const size_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights): + num_heads_(num_heads), + size_per_head_(size_per_head), + inter_size_(inter_size), + num_layers_(num_layers), + vocab_size_(vocab_size), + rotary_embedding_dim_(rotary_embedding_dim), + random_seed_(random_seed), + max_seq_len_(max_seq_len), + rank_(rank), + world_size_(world_size), + weights_(weights) + { + ft::Logger::getLogger().setLevel(ft::Logger::WARNING); + + ft::check_cuda_error(cublasLtCreate(&cublasltHandle_)); + cublas_algo_map_ = new ft::cublasAlgoMap(GEMM_CONFIG, ""); + cublas_wrapper_mutex_ = new std::mutex(); + + llama_weights_.resizeLayer(num_layers_); + for (int i = 0; i < (int)num_layers_; i++) { + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.beta = + get_ptr(weights_[i + 0 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->pre_layernorm_weights.gamma = + get_ptr(weights_[i + 1 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.kernel = + get_ptr(weights_[i + 2 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.query_weight.bias = + get_ptr(weights_[i + 3 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.kernel = + get_ptr(weights_[i + 4 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->self_attention_weights.attention_output_weight.bias = + get_ptr(weights_[i + 5 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.kernel = + get_ptr(weights_[i + 6 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight.bias = + get_ptr(weights_[i + 7 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.kernel = + get_ptr(weights_[i + 8 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.output_weight.bias = + get_ptr(weights_[i + 9 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.kernel = + get_ptr(weights_[i + 10 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->ffn_weights.intermediate_weight2.bias = + get_ptr(weights_[i + 11 * 
num_layers_]); + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.beta = + get_ptr(weights_[i + 12 * num_layers_]); + llama_weights_.decoder_layer_weights[i]->post_attention_layernorm_weights.gamma = + get_ptr(weights_[i + 13 * num_layers_]); + } + + llama_weights_.pre_decoder_embedding_table = get_ptr(weights_[14 * num_layers_ + 0]); + llama_weights_.post_decoder_layernorm.beta = get_ptr(weights_[14 * num_layers_ + 1]); + llama_weights_.post_decoder_layernorm.gamma = get_ptr(weights_[14 * num_layers_ + 2]); + llama_weights_.post_decoder_embedding.kernel = get_ptr(weights_[14 * num_layers_ + 3]); + + ft::check_cuda_error(cudaGetDeviceProperties(&prop_, 0)); + ft::check_cuda_error(cudaStreamCreate(&stream_)); + + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventCreate(&event_[i])); + } + + cublasHandle_t cublasHandle = at::cuda::getCurrentCUDABlasHandle(); + cublasSetStream(cublasHandle, stream_); + + allocator_ = new ft::Allocator(); + cublas_wrapper_ = new ft::cublasMMWrapper( + cublasHandle, cublasltHandle_, stream_, cublas_algo_map_, cublas_wrapper_mutex_, allocator_); + + if (std::is_same::value) { + cublas_wrapper_->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } + else if (std::is_same::value) { + cublas_wrapper_->setFP32GemmConfig(); + } + + ft::AttentionType attention_type = ft::getAttentionType(size_per_head_, + ft::getSMVersion(), + true, // remove_padding + 0, // gpt supports any-seq-length fmha + true, // is_fuse + false, // with_relative_position_bias + true); // causal_mask + // + llama_ = new ft::LLaMA(num_heads_, + size_per_head_, + inter_size_, + num_layers_, + vocab_size_, + rotary_embedding_dim_, + random_seed_, + max_seq_len_, + rank_, + world_size_, + stream_, + cublas_wrapper_, + allocator_, + false, // is_free_buffer_after_forward + &prop_, // cuda_device_prop + attention_type // attention_type + ); + } + + ~FTLLaMA() override + { + for (int i = 0; i < num_events_; ++i) { + ft::check_cuda_error(cudaEventDestroy(event_[i])); + } + ft::check_cuda_error(cudaStreamDestroy(stream_)); + + delete llama_; + delete cublas_wrapper_; + delete allocator_; + + cublasLtDestroy(cublasltHandle_); + delete cublas_algo_map_; + delete cublas_wrapper_mutex_; + } + + virtual void forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int seq_len, + const int attn_len, + const int is_context) override + { + const size_t batch_size = (size_t)input_lengths.size(0); + const size_t num_tokens = (size_t)input_ids.size(0); + const size_t beam_width = (size_t)target_ids.size(0); + + std::unordered_map input_tensors = std::unordered_map{ + {"input_ids", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{num_tokens}, get_ptr(input_ids)}}, + {"input_lengths", + ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(input_lengths)}}, + {"target_ids", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_INT32, + std::vector{beam_width, num_tokens}, + get_ptr(target_ids)}}, + {"context_lengths", + ft::Tensor{ + ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{batch_size}, get_ptr(context_lengths)}}, + {"seq_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &seq_len}}, + {"attn_len", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &attn_len}}, + {"is_context", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, std::vector{1}, &is_context}}}; + + std::unordered_map output_tensors = 
std::unordered_map{ + {"hidden_vector", + ft::Tensor{ft::MEMORY_GPU, + (std::is_same::value) ? ft::TYPE_FP16 : ft::TYPE_FP32, + std::vector{num_tokens, num_heads_ * size_per_head_}, + get_ptr(hidden_vector)}}, + {"cum_probs", + ft::Tensor{ft::MEMORY_GPU, + ft::TYPE_FP32, + std::vector{beam_width, batch_size}, + get_ptr(cum_probs)}}}; + + try { + ft::check_cuda_error(cudaEventSynchronize(event_[ev_no_])); + llama_->forward(&output_tensors, &input_tensors, &llama_weights_); + ft::check_cuda_error(cudaEventRecord(event_[ev_no_], stream_)); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + ft::check_cuda_error(cudaStreamWaitEvent(stream, event_[ev_no_])); + ev_no_ = (ev_no_ + 1) % num_events_; + } + catch (std::runtime_error& error) { + std::cout << error.what(); + exit(-1); + } + catch (...) { + std::cout << "Runtime error"; + exit(-1); + } + } + +private: + const size_t num_heads_; + const size_t size_per_head_; + const size_t inter_size_; + const size_t num_layers_; + const size_t vocab_size_; + const size_t rotary_embedding_dim_; + const size_t random_seed_; + const size_t max_seq_len_; + const size_t rank_; + const size_t world_size_; + + static constexpr int num_events_ = 5; + int ev_no_ = 0; + cudaEvent_t event_[num_events_]; + cudaStream_t stream_; + + std::vector weights_; + cublasLtHandle_t cublasltHandle_; + std::mutex* cublas_wrapper_mutex_; + ft::cublasAlgoMap* cublas_algo_map_; + struct cudaDeviceProp prop_; + ft::LLaMAWeight llama_weights_; + + ft::cublasMMWrapper* cublas_wrapper_; + ft::IAllocator* allocator_; + ft::LLaMA* llama_ = nullptr; +}; + +class LLaMA: public th::jit::CustomClassHolder { +public: + LLaMA(const int64_t num_heads, + const int64_t size_per_head, + const int64_t inter_size, + const int64_t num_layers, + const int64_t vocab_size, + const int64_t rotary_embedding_dim, + const int64_t random_seed, + const int64_t max_seq_len, + const int64_t rank, + const int64_t world_size, + const vector weights); + + ~LLaMA(); + + std::vector forward(th::Tensor& hidden_vector, + th::Tensor& cum_probs, + th::Tensor& input_ids, + th::Tensor& input_lengths, + th::Tensor& target_ids, + th::Tensor& context_lengths, + const int64_t seq_len, + const int64_t attn_len, + const int64_t is_context); + +private: + const at::ScalarType st_; + size_t vocab_size_; + IFLLaMA* ftllama; + std::vector weights; +}; + +} // namespace torch_ext diff --git a/src/fastertransformer/utils/CMakeLists.txt b/src/fastertransformer/utils/CMakeLists.txt index 9796ad076..22f735c27 100644 --- a/src/fastertransformer/utils/CMakeLists.txt +++ b/src/fastertransformer/utils/CMakeLists.txt @@ -57,7 +57,7 @@ add_library(mpi_utils STATIC mpi_utils.cc) set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_MULTI_GPU) - target_link_libraries(mpi_utils PUBLIC -lmpi logger) + target_link_libraries(mpi_utils PUBLIC -lmpi -lmpi_cxx logger) endif() add_library(nccl_utils STATIC nccl_utils.cc) diff --git a/src/fastertransformer/utils/llama_utils.h b/src/fastertransformer/utils/llama_utils.h new file mode 100644 index 000000000..deed71f2c --- /dev/null +++ b/src/fastertransformer/utils/llama_utils.h @@ -0,0 +1,182 @@ +#include +#include +#include + +namespace fastertransformer { + +template +static void _print_tensor1(T* out, int dim1, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + + std::cout << "["; + for (int i = start0; i < end0; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != dim1 - 1) + std::cout << " "; + } + if (end0 != start1) { + std::cout << "... "; + } + for (int i = start1; i < end1; ++i) { + std::cout << std::fixed << std::setw(7) << std::setprecision(4) << std::setfill(' ') << out[i]; + if (i != end1 - 1) + std::cout << " "; + } + std::cout << "]"; +} + +template +static void _print_tensor2(T* out, int dim1, int dim2, int stride, int indent) +{ + std::string ind(indent, ' '); + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n"; + } + if (end0 != start1) { + std::cout << ind; + std::cout << "...\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor1(&out[i * stride], dim2, indent + 1); + if (i != end1 - 1) + std::cout << "\n"; + } + std::cout << "]"; +} + +template +static void _print_tensor3(T* out, int dim1, int dim2, int dim3, int stride1, int stride2, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor2(&out[i * stride1], dim2, dim3, stride2, indent + 1); + if (i != end1 - 1) + std::cout << "\n\n"; + } + std::cout << "]"; +} + +template +static void +_print_tensor4(T* out, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int indent) +{ + std::string ind(indent, ' '); + + int start0 = 0; + int end0 = (dim1 < 3) ? dim1 : 3; + int start1 = (dim1 < 3) ? 0 : dim1 - 3; + int end1 = (dim1 < 3) ? 
0 : dim1; + std::cout << "["; + for (int i = start0; i < end0; ++i) { + if (i != start0) + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != dim1 - 1) + std::cout << "\n\n\n"; + } + if (start1 != end1) { + std::cout << ind; + std::cout << "...\n\n"; + } + for (int i = start1; i < end1; ++i) { + std::cout << ind; + _print_tensor3(&out[i * stride1], dim2, dim3, dim4, stride2, stride3, indent + 1); + if (i != end1 - 1) + std::cout << "\n\n\n"; + } + std::cout << "]"; +} + +template +static void print_tensor1(T* in, int dim1) +{ + T* out = (T*)malloc(sizeof(T) * dim1); + cudaMemcpy(out, in, sizeof(T) * dim1, cudaMemcpyDeviceToHost); + _print_tensor1(out, dim1, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor2(T* in, int dim1, int dim2, int stride1, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor2(&out[start], dim1, dim2, stride1, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor2(T* in, int dim1, int dim2) +{ + print_tensor2(in, dim1, dim2, dim2, dim1 * dim2, 0); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3, int stride1, int stride2, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor3(&out[start], dim1, dim2, dim3, stride1, stride2, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor3(T* in, int dim1, int dim2, int dim3) +{ + print_tensor3(in, dim1, dim2, dim3, dim2 * dim3, dim3, dim1 * dim2 * dim3, 0); +} + +template +static void +print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4, int stride1, int stride2, int stride3, int size, int start) +{ + T* out = (T*)malloc(sizeof(T) * size); + cudaMemcpy(out, in, sizeof(T) * size, cudaMemcpyDeviceToHost); + _print_tensor4(&out[start], dim1, dim2, dim3, dim4, stride1, stride2, stride3, 1); + std::cout << "\n"; + free(out); +} + +template +static void print_tensor4(T* in, int dim1, int dim2, int dim3, int dim4) +{ + print_tensor4(in, dim1, dim2, dim3, dim4, dim2 * dim3 * dim4, dim3 * dim4, dim4, dim1 * dim2 * dim3 * dim4, 0); +} + +} // namespace fastertransformer
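The `print_tensor*` helpers in `llama_utils.h` copy a device buffer back to the host and pretty-print it, truncating each dimension to its first and last three entries. Below is a minimal, illustrative usage sketch; the buffer name `d_hidden` and its dimensions are hypothetical and only demonstrate the calling convention of `print_tensor2`, not part of the FasterTransformer API.

```
// Hypothetical debugging snippet: allocate a small [num_tokens, hidden_units]
// buffer on the GPU and dump it with the print_tensor2 helper.
#include <cuda_runtime.h>
#include <vector>

#include "src/fastertransformer/utils/llama_utils.h"

int main()
{
    const int num_tokens   = 4;
    const int hidden_units = 8;

    // Fill a small host buffer with recognizable values and copy it to the device.
    std::vector<float> h_hidden(num_tokens * hidden_units);
    for (size_t i = 0; i < h_hidden.size(); ++i) {
        h_hidden[i] = 0.01f * static_cast<float>(i);
    }
    float* d_hidden = nullptr;
    cudaMalloc(&d_hidden, sizeof(float) * h_hidden.size());
    cudaMemcpy(d_hidden, h_hidden.data(), sizeof(float) * h_hidden.size(), cudaMemcpyHostToDevice);

    // Prints a [num_tokens, hidden_units] view; the helper itself copies the
    // data back to the host and truncates long dimensions for readability.
    fastertransformer::print_tensor2(d_hidden, num_tokens, hidden_units);

    cudaFree(d_hidden);
    return 0;
}
```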