
Commit 5fff8f0

Add running E2E LoRA flow (NVIDIA#3648)
* add passing E2E LoRA flow
* add experimental feature
* fix llma_args definition
* decreased manually size of max loras to address OOM

Signed-off-by: Shahar Mor <[email protected]>
1 parent c4d86b2 commit 5fff8f0
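The commit threads a per-batch `lora_params` argument from the top-level model forward down through the decoder layers into the attention and MLP modules (see the per-file diffs below). The following is a minimal, self-contained sketch of that plumbing pattern only; the class and method names are illustrative stand-ins, not TensorRT-LLM APIs.

# Toy sketch of the kwargs plumbing this commit adds (illustrative names only).
from typing import Optional


class ToyAttention:
    def forward(self, hidden_states, lora_params: Optional[dict] = None):
        out = hidden_states
        if bool(lora_params):  # empty dict or None -> skip the LoRA path
            out = out + lora_params.get("attn_delta", 0)
        return out


class ToyDecoderLayer:
    def __init__(self):
        self.attn = ToyAttention()

    def forward(self, hidden_states, lora_params: Optional[dict] = None):
        # lora_params is forwarded untouched to every submodule
        return self.attn.forward(hidden_states, lora_params=lora_params)


class ToyModel:
    def __init__(self, num_layers: int = 2):
        self.layers = [ToyDecoderLayer() for _ in range(num_layers)]

    def forward(self, hidden_states, lora_params: Optional[dict] = None):
        for layer in self.layers:
            hidden_states = layer.forward(hidden_states, lora_params=lora_params)
        return hidden_states


model = ToyModel()
print(model.forward(1.0))                                    # no LoRA: 1.0
print(model.forward(1.0, lora_params={"attn_delta": 0.5}))   # with LoRA: 2.0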

18 files changed: +359 −79 lines changed

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 23 additions & 2 deletions
@@ -369,6 +369,7 @@ def __init__(
             bias=getattr(config, "mlp_bias", False),
             dtype=config.torch_dtype,
             config=model_config,
+            layer_idx=layer_idx,
         )

         # self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp(
@@ -519,6 +520,7 @@ def __init__(
             bias=config.mlp_bias,
             dtype=config.torch_dtype,
             config=model_config,
+            layer_idx=layer_idx,
         )
         self.input_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                        eps=config.rms_norm_eps,
@@ -555,7 +557,7 @@ def forward(
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, **kwargs)
         if spec_metadata is not None:
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
                                                       hidden_states, residual)
@@ -689,6 +691,7 @@ def forward(
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pipeline_interface: Optional[PipelineInterface] = None,
         spec_metadata: Optional[SpecMetadata] = None,
+        lora_params=None,
     ) -> torch.Tensor:
         if self.model_config.mapping.is_first_pp_rank():
             if (input_ids is None) ^ (inputs_embeds is not None):
@@ -716,6 +719,7 @@ def forward(
                 attn_metadata=attn_metadata,
                 residual=residual,
                 spec_metadata=spec_metadata,
+                lora_params=lora_params,
             )

         if self.model_config.mapping.is_last_pp_rank():
@@ -732,14 +736,29 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
         config = self.model_config.pretrained_config
         self.padding_idx = config.pad_token_id

+        vocab_size = config.vocab_size
+        # TODO smor- hack
+        if hasattr(model_config,
+                   'lora_config') and model_config.lora_config is not None:
+            from tensorrt_llm.lora_manager import HfLoraLoader
+            lora_loader = HfLoraLoader(model_config.lora_config.lora_dir)
+            weight = lora_loader.embed_tokens
+            # TODO smor - need to split tp matrix here
+            vocab_size = lora_loader.vocab_size
+
         self.embed_tokens = Embedding(
-            config.vocab_size,
+            vocab_size,
             config.hidden_size,
             dtype=config.torch_dtype,
             mapping=model_config.mapping,
             tensor_parallel_mode=TensorParallelMode.COLUMN,
             gather_output=True,
         )
+
+        if hasattr(model_config,
+                   'lora_config') and model_config.lora_config is not None:
+            self.embed_tokens.weight.value = weight.to(self.embed_tokens.dtype)
+
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
                 model_config,
@@ -758,6 +777,7 @@ def forward(
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pipeline_interface: Optional[PipelineInterface] = None,
         spec_metadata: Optional[SpecMetadata] = None,
+        lora_params=None,
     ) -> torch.Tensor:
         if self.model_config.mapping.is_first_pp_rank():
             if (input_ids is None) ^ (inputs_embeds is not None):
@@ -783,6 +803,7 @@ def forward(
                 attn_metadata=attn_metadata,
                 residual=residual,
                 spec_metadata=spec_metadata,
+                lora_params=lora_params,
             )

         if self.model_config.mapping.is_last_pp_rank():
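A minimal sketch of the embedding override pattern used above: when a lora_config is present, the vocab size and the embed_tokens table come from the LoRA checkpoint rather than the base config. ToyLoraLoader is a hypothetical stand-in for HfLoraLoader, and nn.Embedding stands in for the tensor-parallel Embedding module.

from typing import Optional

import torch
import torch.nn as nn


class ToyLoraLoader:
    """Hypothetical stand-in exposing the two fields the diff reads."""

    def __init__(self, vocab_size: int, hidden_size: int):
        self.vocab_size = vocab_size
        self.embed_tokens = torch.randn(vocab_size, hidden_size)


def build_embedding(base_vocab_size: int, hidden_size: int,
                    lora_loader: Optional[ToyLoraLoader] = None) -> nn.Embedding:
    vocab_size = base_vocab_size
    weight = None
    if lora_loader is not None:      # mirrors the lora_config check above
        vocab_size = lora_loader.vocab_size
        weight = lora_loader.embed_tokens
    embed = nn.Embedding(vocab_size, hidden_size)
    if weight is not None:           # overwrite with the checkpoint's table
        with torch.no_grad():
            embed.weight.copy_(weight.to(embed.weight.dtype))
    return embed


# Example: a LoRA checkpoint that extends the vocab from 32000 to 32008 tokens.
embed = build_embedding(32000, 64, ToyLoraLoader(32008, 64))
print(embed.weight.shape)  # torch.Size([32008, 64])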

tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 27 additions & 0 deletions
@@ -240,6 +240,7 @@ def forward(
         input_ids: torch.LongTensor = None,
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
+        lora_params: Optional = None,  # TODO smor add type hint
         **kwargs,
     ) -> torch.Tensor:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -257,6 +258,7 @@ def forward(
             position_ids=position_ids,
             hidden_states=hidden_states,
             attn_metadata=attn_metadata,
+            lora_params=lora_params,
         )

         hidden_states = self.norm(hidden_states)
@@ -355,6 +357,15 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
         else:
             # TODO(zhenhuanc): Currently lm_head Linear will not accept QuantConfig
             # will considering per layer QuantConfig in the future.
+
+            # TODO smor- hack
+            if hasattr(config,
+                       'lora_config') and config.lora_config is not None:
+                from tensorrt_llm.lora_manager import HfLoraLoader
+                lora_loader = HfLoraLoader(config.lora_config.lora_dir)
+                weight = lora_loader.lm_head
+                vocab_size = lora_loader.vocab_size
+
             self.lm_head = LMHead(
                 vocab_size,
                 hidden_size,
@@ -364,6 +375,12 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
                 gather_output=True,
             )

+            if hasattr(config,
+                       'lora_config') and config.lora_config is not None:
+                # TODO smor- figure out if it sticks
+                self.lm_head.weight.value = weight.to(
+                    self.lm_head.dtype).cuda()
+
         # use embedding weights in lm_head if tie word embedding is enabled
         if config.pretrained_config.tie_word_embeddings and not isinstance(
                 self.model.embed_tokens, MissingLayer):
@@ -450,6 +467,7 @@ def forward(
         pipeline_interface: Optional[PipelineInterface] = None,
         return_context_logits: bool = False,
         spec_metadata: Optional[SpecMetadata] = None,
+        lora_params: Optional = None,  # TODO smor add type hint
         **kwargs,
     ) -> torch.Tensor:
         if self._supports_pp and self.pp_size > 1:
@@ -466,12 +484,14 @@ def forward(
             if self.pp_rank < self.pp_size - 1:
                 return output
         else:
+
             output = self.model(
                 input_ids=input_ids,
                 attn_metadata=attn_metadata,
                 position_ids=position_ids,
                 inputs_embeds=inputs_embeds,
                 spec_metadata=spec_metadata,
+                lora_params=lora_params,
             )

         return self.logits_processor.forward(
@@ -506,6 +526,13 @@ def filter_weights(prefix, weights: Dict):
                     "lm_head"):
                 continue

+            # Skip loading weights for embedding and lm_head if LoRA is enabled
+            if hasattr(self.model_config, 'lora_config'
+                       ) and self.model_config.lora_config is not None and (
+                           name == "model.embed_tokens"
+                           or name == "lm_head"):
+                continue
+
             # Skip if parameter belongs to a missing layer
             if missing_layer_parameter(name, self):
                 continue
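A small sketch of the weight-loading skip added above: when LoRA supplies its own embedding and lm_head tables, the corresponding base-checkpoint entries are filtered out so they do not overwrite them. The loop below is a simplified stand-in for the real loader, not the TensorRT-LLM code path.

from typing import Optional


def modules_to_load(module_names, lora_config: Optional[object] = None):
    """Return the module names whose base weights should still be loaded."""
    skipped_with_lora = {"model.embed_tokens", "lm_head"}
    kept = []
    for name in module_names:
        if lora_config is not None and name in skipped_with_lora:
            continue  # the LoRA checkpoint already provided these weights
        kept.append(name)
    return kept


names = ["model.embed_tokens", "model.layers.0.self_attn.qkv_proj", "lm_head"]
print(modules_to_load(names))                        # all three, no LoRA
print(modules_to_load(names, lora_config=object()))  # only the qkv_proj entry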

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 2 deletions
@@ -171,7 +171,7 @@ def forward(
     ) -> torch.Tensor:
         qkv = self.qkv_proj(hidden_states)

-        if lora_params is not None:
+        if bool(lora_params):
             qkv_lora = self.splitted_qkv_lora(hidden_states, lora_params,
                                               self.layer_idx)
             if qkv_lora is not None:
@@ -204,7 +204,7 @@ def forward(

         attn_output = self.o_proj(attn_output,
                                   all_reduce_params=all_reduce_params)
-        if lora_params is not None:
+        if bool(lora_params):
             attn_lora_output = self.o_lora(attn_output, lora_params,
                                            self.layer_idx)
             if attn_lora_output is not None:
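The switch from `lora_params is not None` to `bool(lora_params)` above makes the LoRA branch skip not only when no dict is passed but also when an empty dict is passed, because an empty dict is falsy in Python. A tiny demonstration (the dict content below is a placeholder, not a real lora_params layout):

for lora_params in (None, {}, {"some_key": 1}):
    print(repr(lora_params),
          "| old guard (is not None):", lora_params is not None,
          "| new guard (bool):", bool(lora_params))
# None: both guards False; {}: old guard True but new guard False, so an
# empty dict no longer enters the LoRA path; non-empty dict: both True.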

tensorrt_llm/_torch/modules/gated_mlp.py

Lines changed: 18 additions & 0 deletions
@@ -114,9 +114,27 @@ def forward(

         if self.activation == F.silu:
             h1 = self.gate_up_proj(x)
+            if bool(lora_params):
+                assert self.layer_idx is not None, "layer_idx is required for lora"
+                h1_lora = self.splitted_gate_up_lora(x, lora_params,
+                                                     self.layer_idx)
+                if h1_lora is not None:
+                    h1 = h1 + h1_lora
+
+                h1_lora = self.fused_gate_up_lora(x, lora_params,
+                                                  self.layer_idx)
+
+                if h1_lora is not None:
+                    h1 = h1 + h1_lora
+
             h2 = swiglu(h1)
             output = self.down_proj(h2,
                                     all_reduce_params=final_all_reduce_params)
+            if bool(lora_params):
+                output_lora = self.down_lora(h2, lora_params, self.layer_idx)
+                if output_lora is not None:
+                    output = output + output_lora
+
             return output
         else:
             raise NotImplementedError(
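The `*_lora` calls above add a low-rank correction on top of the projection outputs. A generic sketch of that arithmetic (standard LoRA, y = Wx + B(Ax) * alpha/r), independent of the grouped-GEMM op TensorRT-LLM actually uses; all shapes and values here are illustrative:

import torch

torch.manual_seed(0)
hidden, out_dim, rank, alpha = 16, 32, 4, 8

x = torch.randn(3, hidden)            # 3 tokens
W = torch.randn(out_dim, hidden)      # base projection (e.g. gate_up or down)
A = torch.randn(rank, hidden)         # LoRA "in" weight
B = torch.zeros(out_dim, rank)        # LoRA "out" weight (zero-init at start)

base = x @ W.t()
lora_delta = (x @ A.t()) @ B.t() * (alpha / rank)
output = base + lora_delta            # mirrors `h1 = h1 + h1_lora` above

print(output.shape)                   # torch.Size([3, 32])
print(torch.allclose(output, base))   # True while B is still zero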

tensorrt_llm/_torch/peft/lora/layer.py

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ def forward(self, x, lora_params: Dict,
             True,  # transB
             max([r.max() for r in lora_ranks]),
             0,
-            lora_params["remove_input_padding"],
+            True,  # TODO smor- should be lora_params["remove_input_padding"], support in loraOp as well
         )
         if isinstance(lora_outputs, torch.Tensor):
             return lora_outputs

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 5 additions & 2 deletions
@@ -381,8 +381,8 @@ def create_py_executor_instance(dist,
         len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)

         # TODO smor- need to figure out how to set these values
-        max_loras = 4
-        max_cpu_loras = 4
+        max_loras = 2
+        max_cpu_loras = 2
         executor_config.peft_cache_config = tllm.executor.PeftCacheConfig(
             num_device_module_layer=max_lora_rank * num_lora_modules *
             max_loras,
@@ -394,6 +394,9 @@ def create_py_executor_instance(dist,
             peft_cache_config=executor_config.peft_cache_config,
             model_config=model_binding_config)
         resources["peft_cache_manager"] = peft_cache_manager
+        model_engine.set_lora_model_config(
+            lora_config.lora_target_modules,
+            lora_config.trtllm_modules_to_hf_modules)

    resource_manager = ResourceManager(resources)
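The OOM fix above halves max_loras/max_cpu_loras from 4 to 2, which halves the PEFT cache sizing derived from them. A quick back-of-the-envelope sketch using the formula in the hunk (max_lora_rank * num_lora_modules * max_loras); the concrete numbers below are illustrative only, not values taken from the repository:

max_lora_rank = 64
num_lora_modules = 7 * 32          # e.g. 7 target modules across 32 layers

for max_loras in (4, 2):           # before vs. after this commit
    num_device_module_layer = max_lora_rank * num_lora_modules * max_loras
    print(f"max_loras={max_loras}: "
          f"num_device_module_layer={num_device_module_layer}")
# max_loras=4: num_device_module_layer=57344
# max_loras=2: num_device_module_layer=28672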

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 7 additions & 3 deletions
@@ -50,6 +50,7 @@ def __init__(self, *args, client_id=None, **kwargs):
         self.py_draft_tokens = self.draft_tokens
         self.py_last_draft_tokens = None
         self.py_decoding_iter = 0
+        self.py_lora_task_layer_module_configs = None


 def convert_wordlist(word_list) -> List[List[int]]:
@@ -121,13 +122,16 @@ def executor_request_to_llm_request(req_id: int,
         is None else executor_request.prompt_tuning_config.embedding_table,
         prompt_vocab_size=None if executor_request.prompt_tuning_config is None
         else executor_request.prompt_tuning_config.embedding_table.shape[0],
+        lora_task_id=executor_request.lora_config.task_id
+        if executor_request.lora_config is not None else None,
+        lora_weights=executor_request.lora_config.weights
+        if executor_request.lora_config is not None else None,
+        lora_config=executor_request.lora_config.config
+        if executor_request.lora_config is not None else None,
         mrope_rotary_cos_sin=None if executor_request.mrope_config is None else
         executor_request.mrope_config.mrope_rotary_cos_sin,
         mrope_position_deltas=None if executor_request.mrope_config is None else
         executor_request.mrope_config.mrope_position_deltas,
-        lora_task_id=None,
-        lora_weights=None,
-        lora_config=None,
         lookahead_config=None,
         return_log_probs=False,
         return_context_logits=executor_request.output_config.
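The request-conversion hunk above replaces the hard-coded None LoRA fields with values pulled from executor_request.lora_config when one is attached. A small sketch of that guarded mapping, with toy dataclasses standing in for the executor bindings:

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class ToyLoraConfig:           # stand-in for the executor LoRA config binding
    task_id: int
    weights: Any = None
    config: Any = None


@dataclass
class ToyExecutorRequest:      # stand-in for the executor request binding
    lora_config: Optional[ToyLoraConfig] = None


def lora_fields(req: ToyExecutorRequest) -> dict:
    lc = req.lora_config
    return {
        "lora_task_id": lc.task_id if lc is not None else None,
        "lora_weights": lc.weights if lc is not None else None,
        "lora_config": lc.config if lc is not None else None,
    }


print(lora_fields(ToyExecutorRequest()))                          # all None
print(lora_fields(ToyExecutorRequest(ToyLoraConfig(task_id=7))))  # task_id=7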
