pytorch · jackzhxng · May 27, 2025
@@ -1229,15 +1229,7 @@ def _load_llama_model(
             checkpoint=checkpoint,
             checkpoint_dir=checkpoint_dir,
             params=params_path,
-            use_kv_cache=use_kv_cache,
-            use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
-            generate_full_logits=generate_full_logits,
             fairseq2=weight_type == WeightType.FAIRSEQ2,
-            max_seq_len=max_seq_len,
-            max_context_len=max_context_len,
-            enable_dynamic_shape=enable_dynamic_shape,
-            input_prune_map_path=input_prune_map_path,
-            output_prune_map_path=output_prune_map_path,
             dtype=torch_dtype,
             llm_config=llm_config,
         )

@@ -47,15 +47,30 @@ def __init__(self, **kwargs):
         # Params file.
         params_path = kwargs.get("params", None)
 
-        self.use_kv_cache = kwargs.get("use_kv_cache", False)
-        self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
-        self.generate_full_logits = kwargs.get("generate_full_logits", False)
-        self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False)
-        self.input_prune_map_path = kwargs.get("input_prune_map_path", None)
-        self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
-        self.max_seq_len = kwargs.get("max_seq_len", 128)
-        self.max_context_len = kwargs.get("max_context_len", 128)
         self.llm_config = kwargs.get("llm_config", None)
+
+        # Read the settings below from llm_config when it is set; otherwise fall back to kwargs
+        if self.llm_config:
+            self.use_kv_cache = self.llm_config.model.use_kv_cache
+            self.use_sdpa_with_kv_cache_op = self.llm_config.model.use_sdpa_with_kv_cache
+            self.generate_full_logits = self.llm_config.debug.generate_full_logits
+            self.enable_dynamic_shape = self.llm_config.model.enable_dynamic_shape
+            self.input_prune_map_path = self.llm_config.model.input_prune_map
+            self.output_prune_map_path = self.llm_config.model.output_prune_map
+            self.max_seq_len = self.llm_config.export.max_seq_length
+            self.max_context_len = self.llm_config.export.max_context_length
+            self.verbose = self.llm_config.debug.verbose
+        else:
+            # Fall back to kwargs for backward compatibility
+            self.use_kv_cache = kwargs.get("use_kv_cache", False)
+            self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
+            self.generate_full_logits = kwargs.get("generate_full_logits", False)
+            self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False)
+            self.input_prune_map_path = kwargs.get("input_prune_map_path", None)
+            self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
+            self.max_seq_len = kwargs.get("max_seq_len", 128)
+            self.max_context_len = kwargs.get("max_context_len", 128)
+            self.verbose = kwargs.get("verbose", False)
 
         assert (
             self.max_context_len >= self.max_seq_len
@@ -165,7 +180,7 @@ def __init__(self, **kwargs):
             if model_name not in ["llama3", "llama3_1"]:
                 model_args.rope_scale_factor = 32
 
-        if kwargs.get("verbose", False):
+        if self.verbose:
             print("============= weights ================")
             print("{key} : {weights.numel()} : {weights.size()}")
             for key, weights in checkpoint.items():
@@ -280,7 +295,7 @@ def __init__(self, **kwargs):
                     f"The provided checkpoint is missing the following weights that are expected by the model: {missing_weights}. Please fix the fqn's in your checkpoint to match."
                 )
         if unexpected:
-            if kwargs.get("verbose", False):
+            if self.verbose:
                 print(f"Unexpected keys: {unexpected}")
 
         # Prune the input layer if input_prune_map is provided