99"""
1010Configurations for exporting Llama.
1111
12- Uses dataclases , which integrate with OmegaConf and Hydra.
12+ Uses dataclasses , which integrate with OmegaConf and Hydra.
1313"""
1414
1515import ast
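
For readers unfamiliar with the dataclass-plus-OmegaConf pattern the module docstring mentions, here is a minimal sketch; `ExampleExportConfig` and its fields are hypothetical stand-ins, not the actual ExecuTorch definitions:

```python
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class ExampleExportConfig:
    # Hypothetical fields loosely mirroring the BaseConfig attributes described below.
    checkpoint: str = ""
    tokenizer_path: str = ""
    use_lora: int = 0


# Build a structured config from the dataclass schema, then merge overrides
# expressed as a dotlist, the way an OmegaConf/Hydra-style CLI typically does.
base = OmegaConf.structured(ExampleExportConfig)
overrides = OmegaConf.from_dotlist(["checkpoint=llama.pt", "use_lora=8"])
cfg = OmegaConf.merge(base, overrides)
print(OmegaConf.to_yaml(cfg))
```

Because the schema comes from a dataclass, the merge also type-checks overrides (e.g. `use_lora` must convert to an int).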
@@ -45,7 +45,7 @@ class PreqMode(str, Enum):
     If you are dealing with pre-quantized checkpoints, this used to
     be the way to specify them. Now you don't need to specify these
     options if you use a TorchAo-prequantized checkpoint, but they
-    are still around to preservce backward compatibility.
+    are still around to preserve backward compatibility.
     """
 
     PREQ_8DA4W = "8da4w"
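
As a quick illustration of why these enums subclass `str`: a plain string coming from a YAML or CLI config converts straight into an enum member. Only the `PREQ_8DA4W` member visible in the hunk above is reproduced here.

```python
from enum import Enum


class PreqMode(str, Enum):
    # Only the member shown in the hunk above; other members omitted.
    PREQ_8DA4W = "8da4w"


# A raw config string converts directly to the enum member,
# and the member still compares equal to the raw string.
mode = PreqMode("8da4w")
assert mode is PreqMode.PREQ_8DA4W
assert mode == "8da4w"
```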
@@ -65,17 +65,17 @@ class BaseConfig:
             If left empty will use defaults specified in model_args.py.
         checkpoint: Path to the checkpoint file.
             If left empty, the model will be initialized with random weights.
-        checkpoint_dir: Path to directory containt sharded checkpoint files.
+        checkpoint_dir: Path to directory containing sharded checkpoint files.
         tokenizer_path: Path to the tokenizer file.
-        metadata: Json string containining metadata information.
+        metadata: Json string containing metadata information.
             e.g. '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
         use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
         fairseq2: For legacy internal use cases, this is safe to ignore.
         preq_mode: Legacy option to specify how prequantized weights are loaded.
             Going forward, ExecuTorch supports loading weights prequantized through
             TorchAo as-is, without any special handling.
-        preq_group_size: Legacy option to specify the gropu size of prequantized weights.
-        preq_embedding_quantize: Legacy option to specify how prequanitzed embeddings
+        preq_group_size: Legacy option to specify the group size of prequantized weights.
+        preq_embedding_quantize: Legacy option to specify how prequantized embeddings
             are loaded.
     """
 
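
The `metadata` field is documented as a JSON-like string. A hedged sketch of turning the docstring's example into a dict with `ast.literal_eval`; the `import ast` at the top of the file suggests this, but the exact parsing code is not shown in this diff:

```python
import ast

# The example string from the docstring above.
metadata_str = '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
metadata = ast.literal_eval(metadata_str)
assert metadata["get_bos_id"] == 128000
assert metadata["get_eos_ids"] == [128009, 128001]
```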
@@ -124,10 +124,10 @@ class ModelConfig:
             token generation.
         use_shared_embeddings: whether the embedding/output weights should be
             shared. Only available with torchao kernels, e.g. when
-            qmode set to use a "torchao:8da(\d+)w" pattern.
-        use_sdpa_with_kv_cache: Whether to use flash attention by subtituting
+            qmode set to use a "torchao:8da(\\d+)w" pattern.
+        use_sdpa_with_kv_cache: Whether to use flash attention by substituting
             for our custom SDPA op. Note that the naming is poor and this
-            doesn't actually ahve anything to do with the kv_cache at the moment.
+            doesn't actually have anything to do with the kv_cache at the moment.
         expand_rope_table: Temporary workaround to expand sin/cos table in head
             dim to take vectorized path in optimized kernels.
         use_attention_sink: Whether to use attention sink to support multi-round
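
A small, hypothetical check for the `"torchao:8da(\d+)w"` qmode pattern referenced in the `use_shared_embeddings` description; the helper name is made up for illustration and is not part of the ExecuTorch API:

```python
import re

# The pattern quoted in the docstring above; fullmatch accepts e.g.
# "torchao:8da4w" while rejecting a bare "8da4w".
_TORCHAO_8DAXW = re.compile(r"torchao:8da(\d+)w")


def uses_torchao_8daxw(qmode: str) -> bool:
    # Hypothetical helper: True only for torchao 8da<N>w quantization modes.
    return _TORCHAO_8DAXW.fullmatch(qmode) is not None


assert uses_torchao_8daxw("torchao:8da4w")
assert not uses_torchao_8daxw("8da4w")
```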
@@ -140,7 +140,7 @@ class ModelConfig:
         quantize_kv_cache: Whether to perform int8 per token quantization on the KV cache.
         local_global_attention: List of integers specifying local and global attention pattern.
             e.g., [0, 16, 0, 16] to specify that every other layer is sliding window of 16.
-            [0, 16, 32] pattern specifes 2nd and 3rd layers have sliding windows of 16 and 32.
+            [0, 16, 32] pattern specifies 2nd and 3rd layers have sliding windows of 16 and 32.
             [16] pattern specifies all layers have a sliding window of 16.
     """
 
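
A hypothetical helper showing one way the `local_global_attention` examples above could expand into per-layer sliding-window sizes (0 meaning global attention); the actual ExecuTorch behavior may differ, so treat this purely as an illustration of the docstring examples:

```python
from typing import List


def expand_local_global_pattern(pattern: List[int], n_layers: int) -> List[int]:
    # 0 means global attention; a positive value is a sliding-window size.
    if len(pattern) == 1:
        # [16] -> every layer uses a sliding window of 16.
        return [pattern[0]] * n_layers
    # Repeat the pattern across layers: [0, 16, 0, 16] alternates global and
    # window-16 layers; [0, 16, 32] gives the 2nd and 3rd layers windows of
    # 16 and 32, then cycles for any remaining layers (an assumption here).
    return [pattern[i % len(pattern)] for i in range(n_layers)]


assert expand_local_global_pattern([16], 4) == [16, 16, 16, 16]
assert expand_local_global_pattern([0, 16, 0, 16], 4) == [0, 16, 0, 16]
```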