diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh index c3c0a3d1a69..b8af5fe609f 100644 --- a/.ci/scripts/test_eval_llama_mmlu.sh +++ b/.ci/scripts/test_eval_llama_mmlu.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_eval_llama_wikitext.sh b/.ci/scripts/test_eval_llama_wikitext.sh index 77af12270ca..ba2b2ec6b30 100644 --- a/.ci/scripts/test_eval_llama_wikitext.sh +++ b/.ci/scripts/test_eval_llama_wikitext.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \ + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index ed2a9c2558b..49650364a7e 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m examples.models.llama.export_llama --model llama3 ${EXPORT_ARGS} # Create tokenizer.bin. echo "Creating tokenizer.bin" diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh index 537d835ba1c..769eb60142a 100644 --- a/.ci/scripts/test_llama_runner_eager.sh +++ b/.ci/scripts/test_llama_runner_eager.sh @@ -35,6 +35,7 @@ run_and_verify() { exit 1 fi $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \ + --model llama2 \ -c stories110M.pt \ -p params.json \ -t tokenizer.model \ diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 4e37d0ebaa3..a2608a03f0c 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -77,7 +77,7 @@ test_model() { # Install requirements for export_llama bash examples/models/llama/install_requirements.sh # Test export_llama script: python3 -m examples.models.llama.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json + "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model llama2 -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 2a4faacc0c8..1314a6503aa 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -58,6 +58,7 @@ partially lower the Llama model to Vulkan. 
```shell # The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ + --model llama3_2 --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index d928377ff28..90dc7dd0ad8 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure ```bash # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama.export_llama -t +python -m examples.models.llama.export_llama --model llama3 -t llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 8308da6d840..7d28288bfed 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. 
Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` @@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m examples.models.llama.export_llama --model llama3 -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 2a6ddbbfe09..4ee52bd1b99 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -87,7 +87,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -97,7 +97,7 @@ python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index eb3c244dee7..8aeed59cab9 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -45,9 +45,9 @@ Install the required packages to export the model sh examples/models/llama/install_requirements.sh ``` -Export the model +Export the model (Llama 3 in this case) ``` -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index 201a2934470..63dfd334a10 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -73,7 +73,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 1ae6796b575..d06c0c031d2 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -166,6 +166,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -187,6 +188,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -212,6 +214,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth LLAMA_PARAMS=path/to/qlora/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -qat \ @@ -237,9 +240,20 @@ You can export and run the original Llama 3 8B instruct model. 2. 
Export model and generate `.pte` file ``` - python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama.export_llama + --model llama3 + --checkpoint + -p + -kv + --use_sdpa_with_kv_cache + -X + -qmode 8da4w + --group_size 128 + -d fp32 + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + --embedding-quantize 4,32 + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. ## Step 3: Run on your computer to validate diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index c2ae26e4835..d26362e3853 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From `executorch` root: ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama.export_llama --model llama3 -c stories110M.pt -p params.json -X -kv ``` ## Smaller model delegated to other backends @@ -27,9 +27,9 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. 
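For the Android flow mentioned above, a typical sequence is to cross-compile the runner, push the lowered `.pte` and `tokenizer.bin` with `adb`, and launch the binary on the device. This is only a rough sketch: the file names are placeholders, and the binary location and flag names are assumptions borrowed from the main Llama runner example, so they may differ in your build.
```
# Assumes llama_main has already been cross-compiled for Android.
adb push stories110M_lowered.pte /data/local/tmp/
adb push tokenizer.bin /data/local/tmp/
adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/
adb shell "cd /data/local/tmp && ./llama_main --model_path stories110M_lowered.pte --tokenizer_path tokenizer.bin --prompt 'Once upon a time'"
```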
diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py index 285d2f874df..d9591d4ed1e 100644 --- a/examples/models/llama/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -191,7 +191,7 @@ def gen_eval_wrapper( pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # GPTFastEvalWrapper: Create a wrapper around a pre-exported model - manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) if len(quantizers) != 0: manager = manager.export().pt2e_quantize(quantizers) diff --git a/examples/models/llama/export_llama.py b/examples/models/llama/export_llama.py index 3d0d1b7bcfb..5f382bf50cf 100644 --- a/examples/models/llama/export_llama.py +++ b/examples/models/llama/export_llama.py @@ -20,10 +20,9 @@ def main() -> None: seed = 42 torch.manual_seed(seed) - modelname = "llama2" parser = build_args_parser() args = parser.parse_args() - export_llama(modelname, args) + export_llama(args) if __name__ == "__main__": diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index a0b44fb9652..7f71e009cc9 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -24,8 +24,6 @@ from executorch.devtools.etrecord import generate_etrecord -from executorch.examples.models.llama.llama_transformer import ModelArgs from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.export.partitioner_lib import ( @@ -79,6 +77,10 @@ verbosity_setting = None +EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] +TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision", "llama3_2_tt"] + + class WeightType(Enum): LLAMA = "LLAMA" FAIRSEQ2 = "FAIRSEQ2" @@ -114,11 +116,11 @@ def build_model( else: output_dir_path = "." - argString = f"--checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" + argString = f"--model {modelname} --checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" parser = build_args_parser() args = parser.parse_args(shlex.split(argString)) # pkg_name = resource_pkg_name - return export_llama(modelname, args) + return export_llama(args) def build_args_parser() -> argparse.ArgumentParser: @@ -128,6 +130,12 @@ def build_args_parser() -> argparse.ArgumentParser: # parser.add_argument( # "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file" # ) + parser.add_argument( + "--model", + default="llama3", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + help="The Llama model to export.
llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable given you provide the checkpoint file for the desired version.", + ) parser.add_argument( "-E", "--embedding-quantize", @@ -465,13 +473,13 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: return return_val -def export_llama(modelname, args) -> str: +def export_llama(args) -> str: if args.profile_path is not None: try: from executorch.util.python_profiler import CProfilerFlameGraph with CProfilerFlameGraph(args.profile_path): - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" @@ -482,14 +490,14 @@ def export_llama(modelname, args) -> str: ) return "" else: - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" return filename -def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: +def _prepare_for_llama_export(args) -> LLMEdgeManager: """ Helper function for export_llama. Loads the model from checkpoint and params, and sets up a LLMEdgeManager with initial transforms and dtype conversion. @@ -515,7 +523,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: return ( _load_llama_model( - modelname=modelname, + args.model, checkpoint=checkpoint_path, checkpoint_dir=checkpoint_dir, params_path=params_path, @@ -538,7 +546,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: args=args, ) .set_output_dir(output_dir_path) - .source_transform(_get_source_transforms(modelname, dtype_override, args)) + .source_transform(_get_source_transforms(args.model, dtype_override, args)) ) @@ -582,13 +590,13 @@ def _validate_args(args): raise ValueError("Model shard is only supported with qnn backend now.") -def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 +def _export_llama(args) -> LLMEdgeManager: # noqa: C901 _validate_args(args) pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # export_to_edge builder_exported_to_edge = ( - _prepare_for_llama_export(modelname, args) + _prepare_for_llama_export(args) .export() .pt2e_quantize(quantizers) .export_to_edge() @@ -731,16 +739,18 @@ def _load_llama_model_metadata( use_kv_cache: bool, use_sdpa_with_kv_cache: bool, enable_dynamic_shape: bool, - model_args: ModelArgs, + max_seq_len: int, + n_layers: int, + vocab_size: int, metadata_str: Optional[str] = None, ): is_fairseq2 = weight_type == WeightType.FAIRSEQ2 metadata = { "get_bos_id": 3 if is_fairseq2 else 1, "get_eos_ids": [3] if is_fairseq2 else [2], - "get_max_seq_len": model_args.max_seq_len, - "get_n_layers": model_args.n_layers, - "get_vocab_size": model_args.vocab_size, + "get_max_seq_len": max_seq_len, + "get_n_layers": n_layers, + "get_vocab_size": vocab_size, "use_kv_cache": use_kv_cache, "use_sdpa_with_kv_cache": use_sdpa_with_kv_cache, "enable_dynamic_shape": enable_dynamic_shape, @@ -756,8 +766,8 @@ def _load_llama_model_metadata( def _load_llama_model( + modelname: str, *, - modelname: str = "llama2", checkpoint: Optional[str] = None, checkpoint_dir: Optional[str] = None, params_path: str, @@ -785,27 +795,44 @@ def _load_llama_model( Returns: An instance of LLMEdgeManager which contains the eager mode model. 
""" + assert ( checkpoint or checkpoint_dir ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty" logging.info( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) - model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - module_name="llama", - model_class_name="Llama2Model", - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - input_prune_map_path=input_prune_map_path, - output_prune_map_path=output_prune_map_path, - args=args, + + if modelname in EXECUTORCH_DEFINED_MODELS: + # Set to llama2 because all models in EXECUTORCH_DEFINED_MODELS share the same archteciture as + # defined in example/models/llama2. + modelname = "llama" + model_class_name = "Llama2Model" + elif modelname in TORCHTUNE_DEFINED_MODELS: + if modelname == "llama3_2_vision": + model_class_name = "Llama3_2Decoder" + if modelname == "llama3_2_tt": + modelname = "llama3_2" + model_class_name = "Llama3_2" + else: + raise ValueError(f"{modelname} is not a valid Llama model.") + + model, example_inputs, example_kwarg_inputs, dynamic_shapes = ( + EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, + args=args, + ) ) if dtype_override: assert isinstance( @@ -837,12 +864,13 @@ def _load_llama_model( return LLMEdgeManager( model=model, modelname=modelname, - max_seq_len=model.params.max_seq_len, + max_seq_len=model.max_seq_len, dtype=dtype, use_kv_cache=use_kv_cache, generate_full_logits=generate_full_logits, example_inputs=example_inputs, example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, enable_dynamic_shape=enable_dynamic_shape, calibration_tasks=calibration_tasks, calibration_limit=calibration_limit, @@ -855,7 +883,9 @@ def _load_llama_model( use_kv_cache, use_sdpa_with_kv_cache, enable_dynamic_shape, - model.params, + model.max_seq_len, + model.n_layers, + model.vocab_size, metadata_str, ), args=args, diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index e116e08a099..5fdc2fae665 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -10,10 +10,10 @@ import torch -from examples.models.llama.llama_transformer import ModelArgs from executorch.examples.models.llama.export_llama_lib import ( _prepare_for_llama_export, build_args_parser as _build_args_parser, + TORCHTUNE_DEFINED_MODELS, ) from executorch.examples.models.llama.runner.generation import LlamaRunner from executorch.extension.llm.export import LLMEdgeManager @@ -27,18 +27,16 @@ class EagerLlamaRunner(LlamaRunner): def __init__(self, args): with open(args.params, "r") as f: params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( + super().__init__( + tokenizer_path=args.tokenizer_path, max_seq_len=args.max_seq_length, max_batch_size=1, use_kv_cache=args.use_kv_cache, - **params, - ) - 
super().__init__( - tokenizer_path=args.tokenizer_path, - model_args=model_args, + vocab_size=params["vocab_size"], + has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, device="cuda" if torch.cuda.is_available() else "cpu", ) - manager: LLMEdgeManager = _prepare_for_llama_export("llama", args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) self.model = manager.model.eval().to(device=self.device) def forward( diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index e332e0ebe2e..5b52c549e58 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -9,7 +9,6 @@ import torch -from executorch.examples.models.llama.llama_transformer import ModelArgs from executorch.extension.llm.tokenizer.utils import get_tokenizer @@ -51,11 +50,21 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int: class LlamaRunner(ABC): - def __init__(self, tokenizer_path: str, model_args: ModelArgs, device: str = "cpu"): - self.params = model_args + def __init__( + self, + tokenizer_path: str, + max_seq_len: int, + max_batch_size: int, + use_kv_cache: bool, + vocab_size: int, + device: str = "cpu", + ): + self.max_seq_len = max_seq_len + self.max_batch_size = max_batch_size + self.use_kv_cache = use_kv_cache self.tokenizer = get_tokenizer(tokenizer_path) - assert model_args.vocab_size == self.tokenizer.n_words self.device = device + assert vocab_size == self.tokenizer.n_words @abstractmethod def forward( @@ -72,12 +81,12 @@ def generate( # noqa: C901 top_p: float = 0.9, echo: bool = False, ) -> List[int]: - # prefill + # Prefill logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), input_pos=( torch.tensor([0], dtype=torch.long, device=self.device) - if self.params.use_kv_cache + if self.use_kv_cache else None ), ) @@ -85,8 +94,10 @@ def generate( # noqa: C901 current_token = next_token(logits, temperature, top_p) tokens = prompt_tokens + [current_token] - while len(tokens) < self.params.max_seq_len: - if self.params.use_kv_cache: + i = 0 + while len(tokens) < self.max_seq_len: + print(f"{i} out of {self.max_seq_len} max tokens generated") + if self.use_kv_cache: logits = self.forward( tokens=torch.tensor( [[current_token]], dtype=torch.long, device=self.device @@ -99,13 +110,16 @@ def generate( # noqa: C901 logits = self.forward( tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), ) + current_token = next_token(logits, temperature, top_p) if current_token == self.tokenizer.eos_id or ( hasattr(self.tokenizer, "stop_tokens") and current_token in self.tokenizer.stop_tokens ): break + tokens.append(current_token) + i += 1 return tokens if echo else tokens[len(prompt_tokens) :] @@ -132,12 +146,15 @@ def text_completion( This method generates text completion for the provided prompt, employing nucleus sampling to introduce controlled randomness. 
""" prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False) + print(f"Encoded prompt: {prompt_tokens}") + print("Generating") generation_tokens = self.generate( prompt_tokens=prompt_tokens, temperature=temperature, top_p=top_p, echo=echo, ) + print("Generated") return { "generation": self.tokenizer.decode(generation_tokens), "tokens": generation_tokens, diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py index 90e7fc46dd0..5b3117674d7 100644 --- a/examples/models/llama/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -10,18 +10,22 @@ import torch -from examples.models.llama.llama_transformer import ModelArgs +from executorch.examples.models.llama.export_llama_lib import ( + EXECUTORCH_DEFINED_MODELS, + TORCHTUNE_DEFINED_MODELS, +) + from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. from executorch.extension.pybindings import portable_lib # noqa # usort: skip +from executorch.examples.models.llama.runner.generation import LlamaRunner + # Note: import this after portable_lib -from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +# from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip from executorch.kernels import quantized # noqa -from .generation import LlamaRunner - class NativeLlamaRunner(LlamaRunner): """ @@ -31,13 +35,14 @@ class NativeLlamaRunner(LlamaRunner): def __init__(self, args): with open(args.params, "r") as f: params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( + super().__init__( + tokenizer_path=args.tokenizer, max_seq_len=args.max_len, max_batch_size=1, use_kv_cache=args.kv_cache, - **params, + vocab_size=params["vocab_size"], + has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS, ) - super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) self.model = _load_for_executorch(args.pte) def forward( @@ -45,16 +50,29 @@ def forward( tokens: Optional[torch.LongTensor] = None, input_pos: Optional[torch.LongTensor] = None, ) -> torch.Tensor: - return ( - self.model.forward((tokens, input_pos)) - if input_pos is not None - else self.model.forward((tokens,)) - )[0] + # TODO: in LlamaRunner there is a generate function that automatically generates + # input_pos tensor and inputs it into the model. Atm TorchTune models use + # kwargs for the input_pos, so we will need to make some changes. At least + # for the time being, we can run the non-kv cache version of the Torchtune + # model with just the tokens like below. + return (self.model.forward((tokens,)))[0] + # return ( + # self.model.forward((tokens, input_pos)) + # if input_pos is not None + # else self.model.forward((tokens,)) + # )[0] def build_args_parser() -> argparse.ArgumentParser: + # TODO: merge these with build_args_parser from export_llama_lib. parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + default="llama", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + ) + parser.add_argument( "-f", "--pte", @@ -89,7 +107,6 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "-kv", "--kv_cache", - default=True, action="store_true", ) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 92ddbf74d94..6e0b3794a74 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. 
Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. Create tokenizer.bin. ``` diff --git a/examples/models/llama3_2/__init__.py b/examples/models/llama3_2/__init__.py new file mode 100644 index 00000000000..22d584242cc --- /dev/null +++ b/examples/models/llama3_2/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .model import Llama3_2 + +__all__ = [Llama3_2] diff --git a/examples/models/llama3_2/model.py b/examples/models/llama3_2/model.py new file mode 100644 index 00000000000..ae6f362e4f4 --- /dev/null +++ b/examples/models/llama3_2/model.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import json +from typing import Any, Dict + +import torch +from executorch.examples.models.checkpoint import ( + get_checkpoint_dtype, + get_default_model_resource_dir, +) + +from executorch.examples.models.model_base import EagerModelBase +from torchtune.models.llama3_2._model_builders import llama3_2_1b +from torchtune.models.convert_weights import meta_to_tune + + +class Llama3_2(EagerModelBase): + """ + Llama3.2 as from TorchTune. + """ + + def __init__(self, **kwargs): + # Set member vars from kwargs. + self.max_seq_len = kwargs.get( + "max_seq_len", 8192 + ) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. + self.encoder_max_seq_len = kwargs.get( + "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) + ) # Same as above. + self.output_prune_map_path = kwargs.get("output_prune_map_path", None) + self.use_kv_cache = kwargs.get("use_kv_cache", False) + self.verbose = kwargs.get("verbose", False) + self.args = kwargs.get("args", None) + + ckpt_dir = get_default_model_resource_dir(__file__) + # Single checkpoint file. + checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth") + # Sharded checkpoint. + checkpoint_dir = kwargs.get("checkpoint_dir", None) + params_path = kwargs.get("params", ckpt_dir / "demo_config.json") + + self.causal_mask = torch.tril( + torch.ones( + size=(self.max_seq_len, self.max_seq_len), + dtype=torch.bool, + ) + ) + self.input_pos = torch.arange(self.max_seq_len) + + # Load checkpoint and params. + device = "cpu" + if checkpoint_dir is not None: + raise NotImplementedError( + "Sharded checkpoint not yet supported for Llama3_2Decoder." + ) + else: + checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) + checkpoint = meta_to_tune(checkpoint) + with open(params_path, "r") as f: + params = json.loads(f.read()) + + # Find dtype from checkpoint. (skip for now) + self.dtype = get_checkpoint_dtype(checkpoint) + + # Load model. + self.model_ = llama3_2_1b() + + # Save params for future use. + for param_name, param_val in params.items(): + setattr(self.model_, param_name, param_val) + + # Quantize. (skip for now) + + # Load checkpoint. 
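+        # strict=False tolerates keys that differ between the torchtune module and the
+        # converted Meta checkpoint; assign=True re-binds the checkpoint tensors directly
+        # instead of copying them into the randomly initialized parameters.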
+ missing, unexpected = self.model_.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + if kwargs.get("verbose", False): + print("============= missing keys ================") + print(missing) + print("============= /missing ================") + print("============= unexpected keys ================") + print(unexpected) + print("============= /unexpected ================") + + # Prune the output layer if output_prune_map is provided. + output_prune_map = None + if self.output_prune_map_path is not None: + from executorch.examples.models.llama2.source_transformation.prune_output import ( + prune_output_vocab, + ) + + with open(self.output_prune_map_path, "r") as f: + output_prune_map = json.load(f) + # Change keys from string to int (json only supports string keys) + output_prune_map = {int(k): v for (k, v) in output_prune_map.items()} + + self.model_ = prune_output_vocab(self.model_, output_prune_map) + + if self.use_kv_cache: + print("Setting up KV cache on the model...") + self.model_.setup_caches( + batch_size=1, + dtype=self.dtype, + decoder_max_seq_len=self.max_seq_len, + ) + + def get_eager_model(self) -> torch.nn.Module: + if self.dtype: + return self.model_.to(self.dtype) + else: + return self.model_.to(torch.float16) + + def get_example_inputs(self): + return (torch.ones(1, 32, dtype=torch.long),) + + def get_example_kwarg_inputs(self): + # For export we must use the prefill versions of the + # causal mask and input_pos. + if self.use_kv_cache: + return { + "input_pos": self.input_pos[None, :32], + "mask": self.causal_mask[None, :32], + } + else: + return None + + def get_dynamic_shapes(self): + batch_size = 1 + dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) + if self.use_kv_cache: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + "input_pos" : {0: batch_size, 1: dim_seq_len}, + "mask": {0: batch_size, 1: dim_seq_len, 2: None}, + } + else: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + } + return dynamic_shapes + diff --git a/examples/models/llama3_2/runner/eager.py b/examples/models/llama3_2/runner/eager.py new file mode 100644 index 00000000000..ea327ad6cc1 --- /dev/null +++ b/examples/models/llama3_2/runner/eager.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import json +from typing import Optional + +import torch + +from executorch.examples.models.llama.export_llama_lib import ( + _prepare_for_llama_export, + build_args_parser as _build_args_parser, + TORCHTUNE_DEFINED_MODELS, +) +from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner +from executorch.extension.llm.export import LLMEdgeManager + + +class EagerLlamaRunner(TorchTuneLlamaRunner): + """ + Runs llama in eager mode with provided checkpoint file. 
+ """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + super().__init__( + tokenizer_path=args.tokenizer_path, + max_seq_len=args.max_seq_length, + max_batch_size=1, + use_kv_cache=args.use_kv_cache, + vocab_size=params["vocab_size"], + device="cuda" if torch.cuda.is_available() else "cpu", + ) + manager: LLMEdgeManager = _prepare_for_llama_export(args) + self.model = manager.model.eval().to(device=self.device) + + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + mask: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask) + + +def build_args_parser() -> argparse.ArgumentParser: + parser = _build_args_parser() + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0, + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + + runner = EagerLlamaRunner(args) + result = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print( + "Response: \n{response}\n Tokens:\n {tokens}".format( + response=result["generation"], tokens=result["tokens"] + ) + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama3_2/runner/exported_runner.py b/examples/models/llama3_2/runner/exported_runner.py new file mode 100644 index 00000000000..399b8e8c845 --- /dev/null +++ b/examples/models/llama3_2/runner/exported_runner.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import json +from typing import Optional + +import torch + +from executorch.examples.models.llama.export_llama_lib import ( + _prepare_for_llama_export, + build_args_parser as _build_args_parser, + TORCHTUNE_DEFINED_MODELS, +) +from executorch.examples.models.llama3_2_vision.runner.generation import TorchTuneLlamaRunner +from executorch.extension.llm.export import LLMEdgeManager + + +class ExportedLlamaRunner(TorchTuneLlamaRunner): + """ + Runs a torch-exported .pt2 Llama. 
+ """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + super().__init__( + tokenizer_path=args.tokenizer_path, + max_seq_len=args.max_seq_length, + max_batch_size=1, + use_kv_cache=args.use_kv_cache, + vocab_size=params["vocab_size"], + device="cuda" if torch.cuda.is_available() else "cpu", + ) + print(f"Loading model from {args.pt2}") + self.model = torch.export.load(args.pt2).module() + print("Model loaded") + + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + mask: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + print("Forward") + if self.use_kv_cache: + return self.model(tokens, input_pos=input_pos, mask=mask) + else: + return self.model(tokens) + +def build_args_parser() -> argparse.ArgumentParser: + parser = _build_args_parser() + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--pt2", + type=str, + required=True, + ) + + parser.add_argument( + "--temperature", + type=float, + default=0, + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + + runner = ExportedLlamaRunner(args) + result = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print( + "Response: \n{response}\n Tokens:\n {tokens}".format( + response=result["generation"], tokens=result["tokens"] + ) + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama3_2/runner/generation.py b/examples/models/llama3_2/runner/generation.py new file mode 100644 index 00000000000..2a97a3b7f3b --- /dev/null +++ b/examples/models/llama3_2/runner/generation.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from abc import ABC, abstractmethod +from typing import List, Optional, TypedDict + +import torch + +from executorch.extension.llm.tokenizer.utils import get_tokenizer +from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token, sample_top_p + + +class TorchTuneLlamaRunner(LlamaRunner): + def __init__( + self, + tokenizer_path: str, + max_seq_len: int, + max_batch_size: int, + use_kv_cache: bool, + vocab_size: int, + device: str = "cpu", + ): + super().__init__( + tokenizer_path, + max_seq_len, + max_batch_size, + use_kv_cache, + vocab_size, + device, + ) + + self.causal_mask = torch.tril( + torch.ones( + size=(self.max_seq_len, self.max_seq_len), + dtype=torch.bool, + ) + ) + self.input_pos = torch.arange(self.max_seq_len) + + def generate( # noqa: C901 + self, + prompt_tokens: List[int], + temperature: float = 0.8, + top_p: float = 0.9, + echo: bool = False, + ) -> List[int]: + # Prefill + seq_len = len(prompt_tokens) + input_pos = self.input_pos[None, :seq_len] + mask = self.causal_mask[None, :seq_len] + if self.use_kv_cache: + logits = self.forward( + tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + input_pos=input_pos, + mask=mask, + ) + else: + logits = self.forward( + tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), + ) + + # Only need the last logit. 
+ current_token = next_token(logits[:, -1, :], temperature, top_p) + tokens = prompt_tokens + [current_token] + + i = 0 + while len(tokens) < self.max_seq_len: + print(f"{i} out of {self.max_seq_len} max tokens generated") + mask = self.causal_mask[None, seq_len, None, :] + input_pos = self.input_pos[None, seq_len, None] + if self.use_kv_cache: + logits = self.forward( + tokens=torch.tensor( + [[current_token]], dtype=torch.long, device=self.device + ), + input_pos=input_pos, + mask=mask, + ) + else: + logits = self.forward( + tokens=torch.tensor([tokens], dtype=torch.long, device=self.device), + ) + + # Only need the last logit. + current_token = next_token(logits[:, -1, :], temperature, top_p) + + if current_token == self.tokenizer.eos_id or ( + hasattr(self.tokenizer, "stop_tokens") + and current_token in self.tokenizer.stop_tokens + ): + break + + tokens.append(current_token) + i += 1 + seq_len += 1 + + return tokens if echo else tokens[len(prompt_tokens) :] + diff --git a/examples/models/llama3_2_vision/__init__.py b/examples/models/llama3_2_vision/__init__.py index e69de29bb2d..3c385703d72 100644 --- a/examples/models/llama3_2_vision/__init__.py +++ b/examples/models/llama3_2_vision/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .model import Llama3_2Decoder + +__all__ = [Llama3_2Decoder] diff --git a/examples/models/llama3_2_vision/model.py b/examples/models/llama3_2_vision/model.py new file mode 100644 index 00000000000..f735b3a3aee --- /dev/null +++ b/examples/models/llama3_2_vision/model.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import json +from typing import Any, Dict + +import torch +from executorch.examples.models.checkpoint import ( + get_checkpoint_dtype, + get_default_model_resource_dir, +) + +from executorch.examples.models.model_base import EagerModelBase +from torchtune.models.llama3_2_vision._component_builders import llama3_2_vision_decoder +from torchtune.models.llama3_2_vision._convert_weights import llama3_vision_meta_to_tune + + +def to_decoder_checkpoint(checkpoint: Dict[str, Any]) -> Dict[str, Any]: + """ + Extracts and formats the decoder-related weights from the checkpoint. The checkpoint contains + weight names prefixed with "encoder"/"decoder", such as "encoder.layer.etc" or "decoder.norm.scale". + To load the text decoder on its own, the "decoder" prefix needs to be removed. + """ + return { + ".".join(weight.split(".")[1:]): value + for weight, value in checkpoint.items() + if weight.startswith("decoder") + } + + +class Llama3_2Decoder(EagerModelBase): + """ + Just the text decoder portions of the Llama3.2 multimodal model. + """ + + def __init__(self, **kwargs): + # Set member vars from kwargs. + self.max_seq_len = kwargs.get( + "max_seq_len", 8192 + ) # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment. + self.encoder_max_seq_len = kwargs.get( + "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1) + ) # Same as above. 
+ self.generate_full_logits = kwargs.get("generate_full_logits", False) + self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) + self.output_prune_map_path = kwargs.get("output_prune_map_path", None) + self.use_kv_cache = kwargs.get("use_kv_cache", False) + self.verbose = kwargs.get("verbose", False) + self.args = kwargs.get("args", None) + + ckpt_dir = get_default_model_resource_dir(__file__) + # Single checkpoint file. + checkpoint_path = kwargs.get("checkpoint", ckpt_dir / "demo_rand_params.pth") + # Sharded checkpoint. + checkpoint_dir = kwargs.get("checkpoint_dir", None) + params_path = kwargs.get("params", ckpt_dir / "demo_config.json") + + self.causal_mask = torch.tril( + torch.ones( + size=(self.max_seq_len, self.max_seq_len), + dtype=torch.bool, + ) + ) + self.input_pos = torch.arange(self.max_seq_len) + + # Load checkpoint and params. + device = "cpu" + if checkpoint_dir is not None: + raise NotImplementedError( + "Sharded checkpoint not yet supported for Llama3_2Decoder." + ) + else: + checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) + checkpoint = llama3_vision_meta_to_tune(checkpoint) + checkpoint = to_decoder_checkpoint(checkpoint) + with open(params_path, "r") as f: + params = json.loads(f.read()) + + # Find dtype from checkpoint. (skip for now) + self.dtype = get_checkpoint_dtype(checkpoint) + + # Load model. + # Cannot use "with torch.device("meta"):" because it causes some exceptions during export, + # i.e. the model isn't fully initialized or something. + self.model_ = llama3_2_vision_decoder( + vocab_size=params["vocab_size"], + num_layers=params["n_layers"], + fusion_interval=params["fusion_interval"], + num_special_tokens=params["n_special_tokens"], + num_heads=params["n_heads"], + num_kv_heads=params["n_kv_heads"], + embed_dim=params["dim"], + max_seq_len=self.max_seq_len, + encoder_max_seq_len=self.encoder_max_seq_len, + rope_base=params["rope_theta"], + intermediate_dim=params["intermediate_dim"], + ) + # Save params for future use. + for param_name, param_val in params.items(): + setattr(self.model_, param_name, param_val) + + # Quantize. (skip for now) + + # Load checkpoint. + missing, unexpected = self.model_.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + if kwargs.get("verbose", False): + print("============= missing keys ================") + print(missing) + print("============= /missing ================") + print("============= unexpected keys ================") + print(unexpected) + print("============= /unexpected ================") + + # Prune the output layer if output_prune_map is provided. 
+ output_prune_map = None + if self.output_prune_map_path is not None: + from executorch.examples.models.llama2.source_transformation.prune_output import ( + prune_output_vocab, + ) + + with open(self.output_prune_map_path, "r") as f: + output_prune_map = json.load(f) + # Change keys from string to int (json only supports string keys) + output_prune_map = {int(k): v for (k, v) in output_prune_map.items()} + + self.model_ = prune_output_vocab(self.model_, output_prune_map) + + if self.use_kv_cache: + print("Setting up KV cache on the model...") + self.model_.setup_caches( + batch_size=1, + dtype=self.dtype, + decoder_max_seq_len=self.max_seq_len, + ) + + def get_eager_model(self) -> torch.nn.Module: + if self.dtype: + return self.model_.to(self.dtype) + else: + return self.model_.to(torch.float16) + + def get_example_inputs(self): + return (torch.ones(1, 32, dtype=torch.long),) + + def get_example_kwarg_inputs(self): + # For export we must use the prefill versions of the + # causal mask and input_pos. + if self.use_kv_cache: + return { + "input_pos": self.input_pos[None, :32], + "mask": self.causal_mask[None, :32], + # "encoder_input": None, + # "encoder_mask": None, + } + else: + return None + + def get_dynamic_shapes(self): + batch_size = 1 + dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) + if self.use_kv_cache: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + # "encoder_input": {0: 1, 1: dim_enc, 2: 4096}, + # "encoder_mask": {0: 1, 1: dim, 2: dim_enc}, + "mask": {0: batch_size, 1: dim_seq_len, 2: None}, + "input_pos" : {0: batch_size, 1: dim_seq_len}, + } + else: + dynamic_shapes = { + "tokens": {0: batch_size, 1: dim_seq_len}, + } + return dynamic_shapes diff --git a/examples/models/llama3_2_vision/params/demo_config.json b/examples/models/llama3_2_vision/params/demo_config.json new file mode 100644 index 00000000000..625524ad4c8 --- /dev/null +++ b/examples/models/llama3_2_vision/params/demo_config.json @@ -0,0 +1,18 @@ +{ + "dim": 4096, + "ffn_dim_multiplier": 1.3, + "fusion_interval": 4, + "intermediate_dim": 14336, + "multiple_of": 1024, + "n_heads": 32, + "n_kv_heads": 8, + "n_layers": 32, + "n_special_tokens": 8, + "norm_eps": 1e-05, + "rope_theta": 500000.0, + "use_scaled_rope": true, + "vision_chunk_size": 560, + "vision_max_num_chunks": 4, + "vocab_size": 128256, + "vision_num_cross_attention_layers": 8 +} \ No newline at end of file diff --git a/examples/models/model_factory.py b/examples/models/model_factory.py index 5abe5efe462..5b66aef8de7 100644 --- a/examples/models/model_factory.py +++ b/examples/models/model_factory.py @@ -44,7 +44,7 @@ def create_model( model = model_class(**kwargs) example_kwarg_inputs = None dynamic_shapes = None - if hasattr(model, "get_example_kwarg_inputs()"): + if hasattr(model, "get_example_kwarg_inputs"): example_kwarg_inputs = model.get_example_kwarg_inputs() if hasattr(model, "get_dynamic_shapes"): dynamic_shapes = model.get_dynamic_shapes() diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index bd12c374b51..9244c7dd797 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -194,6 +194,11 @@ def export(self) -> "LLMEdgeManager": strict=True, ).module() else: + print("Exporting with:") + print(f"inputs: {self.example_inputs}") + print(f"kwargs: {self.example_kwarg_inputs}") + print(f"dynamic shapes: {dynamic_shape}") + # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as # `Module`. 
self.pre_autograd_graph_module = export_for_training(