gdengk
diff --git a/‎nemo/collections/llm/api.py‎
Lines changed: 23 additions & 8 deletions b/‎nemo/collections/llm/api.py‎
Lines changed: 23 additions & 8 deletions
diff --git a/‎nemo/collections/llm/gpt/data/chat.py‎
Lines changed: 49 additions & 0 deletions b/‎nemo/collections/llm/gpt/data/chat.py‎
Lines changed: 49 additions & 0 deletions
@@ -65,7 +65,7 @@
 
 @run.cli.entrypoint(namespace="llm")
 def train(
-    model: pl.LightningModule,
+    model: Union[pl.LightningModule, AnyPath],
     data: pl.LightningDataModule,
     trainer: Trainer,
     log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
@@ -79,7 +79,7 @@ def train(
     Trains a model using the specified data and trainer, with optional tokenizer, source, and export.
 
     Args:
-        model (pl.LightningModule): The model to be trained.
+        model (Union[pl.LightningModule, AnyPath]): The model to be trained or a path to the NeMo 2 checkpoint.
         data (pl.LightningDataModule): The data module containing training data.
         trainer (Trainer): The trainer instance configured with a MegatronStrategy.
         log (NeMoLogger): A nemologger instance.
@@ -106,6 +106,8 @@ def train(
         >>> llm.train(model, data, trainer, tokenizer="data")
         PosixPath('/path/to/log_dir')
     """
+    model = _load_model_from_path(model)
+
     # [ModelOpt]: If modelopt_state exists, overwrite transformer_layer_spec to modelopt spec
     if resume:
         if resume.restore_config and resume.restore_config.path:
@@ -131,7 +133,7 @@ def train(
 
 @run.cli.entrypoint(namespace="llm")
 def pretrain(
-    model: pl.LightningModule,
+    model: Union[pl.LightningModule, AnyPath],
     data: pl.LightningDataModule,
     trainer: Trainer,
     log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
@@ -145,7 +147,7 @@ def pretrain(
     Note, by default it will use the tokenizer from the model.
 
     Args:
-        model (pl.LightningModule): The model to be pretrained.
+        model (Union[pl.LightningModule, AnyPath]): The model to be pretrained or a path to the NeMo 2 checkpoint.
         data (pl.LightningDataModule): The data module containing pretraining data.
         trainer (Trainer): The trainer instance configured with a MegatronStrategy.
         log (NeMoLogger): A nemologger instance.
@@ -166,6 +168,7 @@ def pretrain(
         >>> llm.pretrain(model, data, trainer)
         PosixPath('/path/to/log_dir')
     """
+    model = _load_model_from_path(model)
     _validate_config(model, data, trainer, log=log, resume=resume, optim=optim)
 
     return train(
@@ -181,28 +184,33 @@ def pretrain(
 
 @run.cli.entrypoint(namespace="llm")
 def finetune(
-    model: pl.LightningModule,
+    model: Union[pl.LightningModule, AnyPath],
     data: pl.LightningDataModule,
     trainer: Trainer,
     log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
     resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None,
     optim: Optional[OptimizerModule] = None,
     peft: Optional[Union[PEFT, ModelTransform, Callable]] = None,
+    tokenizer: Optional[TokenizerType] = "model",
 ) -> Path:
     """
     Finetunes a model using the specified data and trainer, with optional logging, resuming, and PEFT.
 
     Note, by default it will use the tokenizer from the model.
 
     Args:
-        model (pl.LightningModule): The model to be finetuned.
+        model (Union[pl.LightningModule, AnyPath]): The model to be finetuned or a path to the NeMo 2 checkpoint.
         data (pl.LightningDataModule): The data module containing finetuning data.
         trainer (Trainer): The trainer instance configured with a MegatronStrategy.
         log (NeMoLogger): A nemologger instance.
         resume (Optional[AutoResume]): Resume training from a checkpoint.
         optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default
             optimizer from the model will be used.
         peft (Optional[PEFT]): A PEFT (Parameter-Efficient Fine-Tuning) configuration to be applied.
+        tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model'
+            or an instance of TokenizerSpec. If 'data' uses the data loader's tokenizer instead of the tokenizer
+            from the model checkpoint, which is useful for expanding vocabulary or adding special tokens
+            (such as chat template tokens).
 
     Returns:
         Path: The directory path where finetuning artifacts are saved.
@@ -217,7 +225,7 @@ def finetune(
         >>> llm.finetune(model, data, trainer, peft=llm.peft.LoRA())
         PosixPath('/path/to/log_dir')
     """
-
+    model = _load_model_from_path(model)
     _validate_config(model, data, trainer, log=log, resume=resume, optim=optim, model_transform=peft)
     return train(
         model=model,
@@ -226,7 +234,7 @@ def finetune(
         log=log,
         resume=resume,
         optim=optim,
-        tokenizer="model",
+        tokenizer=tokenizer,
         model_transform=peft,
     )
 
@@ -630,6 +638,7 @@ def deploy(
             the trtllm backend).
     """
     import os
+
     import uvicorn
 
     from nemo.deploy import DeployPyTriton
@@ -1345,3 +1354,9 @@ def _build_directory_tree(path, tree=None, root_name=None):
                 tree.add(f"[white]{item.name}[/white]")
 
     return tree
+
+
+def _load_model_from_path(model: Union[pl.LightningModule, AnyPath]):
+    """Resolve ``model`` to a model object, loading it from a path when one is given.
+
+    Args:
+        model (Union[pl.LightningModule, AnyPath]): Either an already-constructed model or a path.
+
+    Returns:
+        ``model`` unchanged when it is not an ``AnyPath`` instance; otherwise the object returned by
+        ``io.load_context`` for ``subpath="model"`` at the location ``ckpt_to_context_subdir(model)``.
+    """
+    # NOTE(review): this check presumes ``AnyPath`` is a valid ``isinstance`` target; its definition
+    # is not visible here (a ``typing.Union`` alias would need Python 3.10+) -- confirm.
+    if isinstance(model, AnyPath):
+        model = io.load_context(ckpt_to_context_subdir(model), subpath="model")
+    return model
@@ -13,10 +13,16 @@
 # limitations under the License.
 
 from functools import lru_cache
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from nemo.collections.llm.gpt.data.core import create_sft_dataset
 from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
 
+if TYPE_CHECKING:
+    from nemo.collections.common.tokenizers import TokenizerSpec
+    from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
+
 
 class ChatDataModule(FineTuningDataModule):
     """
@@ -26,6 +32,48 @@ class ChatDataModule(FineTuningDataModule):
     See base class `FineTuningDataModule` for more details.
     """
 
+    def __init__(
+        self,
+        dataset_root: Union[str, Path],
+        seq_length: int = 2048,
+        tokenizer: Optional["TokenizerSpec"] = None,
+        micro_batch_size: int = 4,
+        global_batch_size: int = 8,
+        rampup_batch_size: Optional[List[int]] = None,
+        seed: int = 1234,
+        memmap_workers: int = 1,
+        num_workers: int = 8,
+        pin_memory: bool = True,
+        persistent_workers: bool = False,
+        packed_sequence_specs: Optional["PackedSequenceSpecs"] = None,
+        dataset_kwargs: Optional[Dict[str, Any]] = None,
+        use_hf_tokenizer_chat_template: bool = False,
+    ):
+        """Data module for finetuning on chat datasets.
+
+        Every argument except ``use_hf_tokenizer_chat_template`` is passed straight through to the
+        base class; see `FineTuningDataModule` for more details of those arguments.
+
+        Args:
+            use_hf_tokenizer_chat_template: Whether to use the chat template from the HuggingFace tokenizer. If True,
+                uses the tokenizer's built-in chat template. If False, uses default chat template from
+                GPTSFTChatDataset. Defaults to False.
+        """
+        # Arguments are forwarded positionally, so their order here has to line up with the
+        # parameter order of FineTuningDataModule.__init__ -- TODO confirm against the base class.
+        super().__init__(
+            dataset_root,
+            seq_length,
+            tokenizer,
+            micro_batch_size,
+            global_batch_size,
+            rampup_batch_size,
+            seed,
+            memmap_workers,
+            num_workers,
+            pin_memory,
+            persistent_workers,
+            packed_sequence_specs,
+            dataset_kwargs,
+        )
+        # Kept on the instance so _create_dataset can pass it on as the keyword argument of the same name.
+        self.use_hf_tokenizer_chat_template = use_hf_tokenizer_chat_template
+
+
     @lru_cache
     def _create_dataset(self, path, pack_metadata_path=None, is_test=False, **kwargs):
         # pylint: disable=C0115,C0116
@@ -39,5 +87,6 @@ def _create_dataset(self, path, pack_metadata_path=None, is_test=False, **kwargs
             is_test=is_test,
             pack_metadata_file_path=None,  # packing is not supported
             pad_cu_seqlens=False,
+            use_hf_tokenizer_chat_template=self.use_hf_tokenizer_chat_template,
             **kwargs,
         )