Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions examples/models/llama/export_llama_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1229,15 +1229,7 @@ def _load_llama_model(
checkpoint=checkpoint,
checkpoint_dir=checkpoint_dir,
params=params_path,
use_kv_cache=use_kv_cache,
use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
generate_full_logits=generate_full_logits,
fairseq2=weight_type == WeightType.FAIRSEQ2,
max_seq_len=max_seq_len,
max_context_len=max_context_len,
enable_dynamic_shape=enable_dynamic_shape,
input_prune_map_path=input_prune_map_path,
output_prune_map_path=output_prune_map_path,
dtype=torch_dtype,
llm_config=llm_config,
)
Expand Down
35 changes: 25 additions & 10 deletions examples/models/llama/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,30 @@ def __init__(self, **kwargs):
# Params file.
params_path = kwargs.get("params", None)

self.use_kv_cache = kwargs.get("use_kv_cache", False)
self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
self.generate_full_logits = kwargs.get("generate_full_logits", False)
self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False)
self.input_prune_map_path = kwargs.get("input_prune_map_path", None)
self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
self.max_seq_len = kwargs.get("max_seq_len", 128)
self.max_context_len = kwargs.get("max_context_len", 128)
self.llm_config = kwargs.get("llm_config", None)

# Set all parameters from llm_config if available; otherwise fall back to kwargs.
if self.llm_config:
self.use_kv_cache = self.llm_config.model.use_kv_cache
self.use_sdpa_with_kv_cache_op = self.llm_config.model.use_sdpa_with_kv_cache
self.generate_full_logits = self.llm_config.debug.generate_full_logits
self.enable_dynamic_shape = self.llm_config.model.enable_dynamic_shape
self.input_prune_map_path = self.llm_config.model.input_prune_map
self.output_prune_map_path = self.llm_config.model.output_prune_map
self.max_seq_len = self.llm_config.export.max_seq_length
self.max_context_len = self.llm_config.export.max_context_length
self.verbose = self.llm_config.debug.verbose
else:
# Fall back to kwargs for backward compatibility
self.use_kv_cache = kwargs.get("use_kv_cache", False)
self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
self.generate_full_logits = kwargs.get("generate_full_logits", False)
self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False)
self.input_prune_map_path = kwargs.get("input_prune_map_path", None)
self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
self.max_seq_len = kwargs.get("max_seq_len", 128)
self.max_context_len = kwargs.get("max_context_len", 128)
self.verbose = kwargs.get("verbose", False)

assert (
self.max_context_len >= self.max_seq_len
Expand Down Expand Up @@ -165,7 +180,7 @@ def __init__(self, **kwargs):
if model_name not in ["llama3", "llama3_1"]:
model_args.rope_scale_factor = 32

if kwargs.get("verbose", False):
if self.verbose:
print("============= weights ================")
print("{key} : {weights.numel()} : {weights.size()}")
for key, weights in checkpoint.items():
Expand Down Expand Up @@ -280,7 +295,7 @@ def __init__(self, **kwargs):
f"The provided checkpoint is missing the following weights that are expected by the model: {missing_weights}. Please fix the fqn's in your checkpoint to match."
)
if unexpected:
if kwargs.get("verbose", False):
if self.verbose:
print(f"Unexpected keys: {unexpected}")

# Prune the input layer if input_prune_map is provided
Expand Down
Loading