diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index a83d374ab0b..8bebc7be1bc 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -222,6 +222,7 @@ jobs:
             --preq_mode 8da4w_output_8da8w \
             --preq_group_size 32 \
             --max_seq_length 2048 \
+            --max_context_length 2048 \
             --output_name "${OUT_ET_MODEL_NAME}.pte" \
             -kv \
             -d fp32 \
@@ -253,6 +254,7 @@ jobs:
             --xnnpack-extended-ops \
             -d fp32 \
             --max_seq_length 2048 \
+            --max_context_length 2048 \
             --output_name "${OUT_ET_MODEL_NAME}.pte" \
             --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index f6424c4fa9a..ea88be441cb 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -233,6 +233,7 @@ jobs:
             --preq_mode 8da4w_output_8da8w \
             --preq_group_size 32 \
             --max_seq_length 2048 \
+            --max_context_length 2048 \
             --output_name "${OUT_ET_MODEL_NAME}.pte" \
             -kv \
             -d fp32 \
@@ -264,6 +265,7 @@ jobs:
             --xnnpack-extended-ops \
             -d fp32 \
             --max_seq_length 2048 \
+            --max_context_length 2048 \
             --output_name "${OUT_ET_MODEL_NAME}.pte" \
             --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
           ls -lh "${OUT_ET_MODEL_NAME}.pte"
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
index a2ac04ae93a..2b9bad21b7a 100644
--- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
@@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```
 
 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```
 
 ### For Llama 3.2 1B and 3B BF16 models
@@ -87,7 +87,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla
 * We prepared this model using the following command
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte"
+python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --max_context_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte"
 ```
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index 784ebe50f8a..b127bad10e2 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -51,14 +51,14 @@ sh examples/models/llama/install_requirements.sh
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```
 
 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --max_context_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```
 
 ### For Llama 3.2 1B and 3B BF16 models
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 7a8838fb016..57afba8b9cf 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -199,6 +199,7 @@ python -m examples.models.llama.export_llama \
   --preq_mode 8da4w_output_8da8w \
   --preq_group_size 32 \
   --max_seq_length 2048 \
+  --max_context_length 2048 \
   --output_name "llama3_2.pte" \
   -kv \
   -d fp32 \
@@ -230,6 +231,7 @@ python -m examples.models.llama.export_llama \
   --xnnpack-extended-ops \
   -d fp32 \
   --max_seq_length 2048 \
+  --max_context_length 2048 \
   --output_name "llama3_2.pte" \
   --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
 ```
@@ -397,6 +399,7 @@ python -m examples.models.llama.eval_llama \
   -kv \
   -d <checkpoint dtype> \
   --max_seq_len <max sequence length> \
+  --max_context_len <max context length> \
   --limit <number of samples>
 ```
 
@@ -411,6 +414,7 @@ python -m examples.models.llama.eval_llama \
   --tasks mmlu \
   --num_fewshot 5 \
-  --max_seq_len <max sequence length>
+  --max_seq_len <max sequence length> \
+  --max_context_len <max context length>
 ```
 
 See the [Llama utils page](./UTILS.md) for more advanced use-cases such as fine-tuning, running smaller models for educational purposes, and quick iteration and verification.
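
Every hunk in this patch follows the same pattern: wherever a command already pins `--max_seq_length` (or `--max_seq_len` for eval), it now also passes the new `--max_context_length` (`--max_context_len`) flag with the same value. For readers trying the change locally, here is a minimal sketch of one such export, using only flags that appear in the patch; the checkpoint and params paths and the output name are illustrative placeholders, not files referenced by this patch:

```
# Hypothetical local paths -- substitute your own downloaded Llama 3.2 files.
python -m examples.models.llama.export_llama \
  --model "llama3_2" \
  --checkpoint ~/llama3_2/consolidated.00.pth \
  --params ~/llama3_2/params.json \
  -kv -X -d fp32 \
  --max_seq_length 2048 \
  --max_context_length 2048 \
  --output_name "llama3_2.pte"
```

Keeping the two values equal matches the intent of the patch, which sets the context length explicitly everywhere a sequence length was already pinned, presumably to preserve existing behavior now that the two limits are configured independently.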