Commit dc889b9

Lunwen pr comments
1 parent 1431167 commit dc889b9

2 files changed: +13 −13 lines


examples/models/llama/README.md

Lines changed: 11 additions & 11 deletions

````diff
@@ -237,17 +237,17 @@ You can export and run the original Llama 3 8B instruct model.
 
 2. Export model and generate `.pte` file
 ```
-python -m examples.models.llama.export_llama
---checkpoint <consolidated.00.pth>
--p <params.json>
--kv
---use_sdpa_with_kv_cache
--X
--qmode 8da4w
---group_size 128
--d fp32
---metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
---embedding-quantize 4,32
+python -m examples.models.llama.export_llama \
+--checkpoint <consolidated.00.pth> \
+-p <params.json> \
+-kv \
+--use_sdpa_with_kv_cache \
+-X \
+-qmode 8da4w \
+--group_size 128 \
+-d fp32 \
+--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+--embedding-quantize 4,32 \
 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
 ```
 Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size.
````
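The vocabulary-size argument above can be sanity-checked with a back-of-the-envelope calculation. This is an illustrative sketch, not from the commit: it assumes Llama 3 8B shapes (vocab size 128256, embedding dim 4096) and assumes `--embedding-quantize 4,32` means 4-bit weights grouped in blocks of 32, with one fp16 scale per group.

```python
# Rough size estimate for the Llama 3 embedding table, fp32 vs 4-bit.
# Assumptions (not stated in the commit): Llama 3 8B uses a 128256-token
# vocabulary and 4096-dim embeddings; "4,32" = 4-bit weights, group size 32,
# one fp16 scale per group.

VOCAB, DIM = 128_256, 4_096
GROUP_SIZE = 32

params = VOCAB * DIM                       # number of embedding weights
fp32_bytes = params * 4                    # 4 bytes per fp32 weight
q4_bytes = params // 2                     # two 4-bit weights per byte
scale_bytes = (params // GROUP_SIZE) * 2   # one fp16 scale per group

print(f"fp32 embedding:            {fp32_bytes / 2**30:.2f} GiB")
print(f"4-bit embedding (+scales): {(q4_bytes + scale_bytes) / 2**30:.2f} GiB")
```

Under these assumptions the embedding table alone shrinks from roughly 2 GiB to under 0.3 GiB, which is why the flag matters more for Llama 3 than for the 32k-vocab Llama 2.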

examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -79,7 +79,7 @@
 verbosity_setting = None
 
 
-EXECUTORCH_DEFINED_MODELS = ["stories110m", "llama2", "llama3", "llama3.1", "llama3.2"]
+EXECUTORCH_DEFINED_MODELS = ["stories110m", "llama2", "llama3", "llama3_1", "llama3_2"]
 TORCHTUNE_DEFINED_MODELS = []
 
 
@@ -915,7 +915,7 @@ def _get_source_transforms(  # noqa
 ops that is not quantized.
 
 There are cases where this may be a no-op, namely, if all linears are
-quantizedpp in the checkpoint.
+quantized in the checkpoint.
 """
 modelname = f"{modelname}_q"
 transforms.append(
```
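The first hunk renames `llama3.1`/`llama3.2` to `llama3_1`/`llama3_2`, swapping dots for underscores so the names stay valid as Python identifiers and filename components. A hypothetical sketch (not ExecuTorch's actual CLI code) of how callers could stay compatible with either spelling:

```python
# Hypothetical helper, not from the commit: accept a dotted version like
# "llama3.1" and map it onto the underscore form defined in the module.
EXECUTORCH_DEFINED_MODELS = ["stories110m", "llama2", "llama3", "llama3_1", "llama3_2"]

def normalize_model_name(name: str) -> str:
    """Map 'llama3.1' -> 'llama3_1'; reject names not in the defined list."""
    candidate = name.replace(".", "_")
    if candidate not in EXECUTORCH_DEFINED_MODELS:
        raise ValueError(
            f"unknown model {name!r}; expected one of {EXECUTORCH_DEFINED_MODELS}"
        )
    return candidate
```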
