Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6ef9392
push current code
cuichenx Jun 16, 2025
abf6aa6
fix tiktoken
cuichenx Jul 2, 2025
9343c5e
modelopt changes
cuichenx Jul 7, 2025
b6a259b
add recipe
cuichenx Jul 9, 2025
cd7b3c4
inference updates
cuichenx Jul 9, 2025
9cb2ffb
rename model, modify importer for latest checkpoint
cuichenx Jul 10, 2025
67d960a
add 20b recipe
cuichenx Jul 16, 2025
931cff7
minor fixes
cuichenx Jul 16, 2025
9fc2fb0
correct dropout values
cuichenx Jul 17, 2025
e4b9667
Add flops calculator for gpt_oss
guyueh1 Jul 21, 2025
31bff1b
misc
cuichenx Jul 23, 2025
3009f7c
change activation function name and refactor glu +1
cuichenx Jul 31, 2025
f34d3c1
enable (weighted) bias activation fusion
cuichenx Aug 1, 2025
f523a3b
unify te and local naming
cuichenx Aug 2, 2025
5e411ea
update importer exporter
cuichenx Aug 5, 2025
4ccef27
update importer
cuichenx Aug 5, 2025
8f7b826
dequantize in nemo instead of hf for now
cuichenx Aug 5, 2025
6afc481
Rm force using MCORE DPA; add support for gpus_per_node for cw
guyueh1 Aug 6, 2025
7d423d6
update recipe
cuichenx Aug 7, 2025
63d4561
enable lora merge for te grouped linear
cuichenx Aug 7, 2025
847dfc9
Revert back to MCoreDPA
guyueh1 Aug 7, 2025
7ab8aa8
add clamp value as a config
cuichenx Aug 8, 2025
6a66d13
turn off async save for sft
cuichenx Aug 11, 2025
8e78b5a
Apply isort and black reformatting
cuichenx Aug 11, 2025
d0d6dd1
remove gpt-oss specific values in tiktoken
cuichenx Aug 11, 2025
7b7d830
linting
cuichenx Aug 11, 2025
1ed123a
Apply isort and black reformatting
cuichenx Aug 11, 2025
be624e5
linting
cuichenx Aug 11, 2025
d9b3167
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 11, 2025
a90f960
address comments
cuichenx Aug 12, 2025
9b70c7c
Merge branch 'main' into chcui/gpt_oss
cuichenx Aug 12, 2025
d794762
address comments
cuichenx Aug 13, 2025
2b8b6b5
Merge branch 'main' into chcui/gpt_oss
cuichenx Aug 14, 2025
e204f59
add megatron version guard
cuichenx Aug 18, 2025
03231fd
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 18, 2025
8cd04bb
Apply isort and black reformatting
cuichenx Aug 18, 2025
9888bd6
add megatron version guard
cuichenx Aug 18, 2025
9a8e767
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 18, 2025
4a9a68d
guard window_attn_skip_freq
cuichenx Aug 18, 2025
6434914
fix tests
cuichenx Aug 18, 2025
bcb7ffc
Apply isort and black reformatting
cuichenx Aug 18, 2025
b4ddd57
fix tests
cuichenx Aug 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 97 additions & 41 deletions nemo/collections/common/tokenizers/tiktoken_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,78 +62,134 @@ def reload_mergeable_ranks(
return ranks


# pylint: disable=C0301
PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17 # 131072
SPECIAL_TOKENS = ["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]
SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"


class TiktokenTokenizer(TokenizerSpec):
# pylint: disable=C0115,C0116
"""
TiktokenTokenizer https://github.com/openai/tiktoken.

Usage 1 (vocab_file-based):
tokenizer = TiktokenTokenizer(
vocab_file="path/to/vocab.json",
vocab_size=131072,
num_special_tokens=1000,
special_tokens=["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"],
)

Usage 2 (encoding_name-based):
tokenizer = TiktokenTokenizer(
encoding_name="o200_harmony",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
pad_token="<|endoftext|>",
)

Args:
model_path: path to tokenizer vocabulary
vocab_file: path to tokenizer vocabulary
encoding_name: name of the encoding to use
pattern: Regex pattern to split the text
vocab_size: size of the vocabulary
num_special_tokens: number of special tokens to generate
special_tokens: template for user-defined special tokens
pattern: Regex pattern to split the text
bos_token: beginning of sentence token
eos_token: end of sentence token
pad_token: padding token (default is eos_token)
"""

def __init__(
self,
vocab_file: str,
vocab_file: Optional[str] = None,
encoding_name: Optional[str] = None,
pattern: str = PATTERN_TIKTOKEN,
vocab_size: int = DEFAULT_TIKTOKEN_MAX_VOCAB, # 131072
num_special_tokens: int = 1000,
special_tokens: Optional[List[str]] = None,
bos_token: str = "<|startoftext|>",
eos_token: str = "<|endoftext|>",
pad_token: str = "<|endoftext|>",
):
if not vocab_file or not os.path.exists(vocab_file):
raise ValueError(f"vocab_file: {vocab_file} is invalid")

if special_tokens is None:
special_tokens = SPECIAL_TOKENS.copy()

assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}"
assert len(special_tokens) <= num_special_tokens < vocab_size
assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}"

self._unk_id = special_tokens.index("<unk>")
self._bos_id = special_tokens.index("<s>")
self._eos_id = special_tokens.index("</s>")
self._mask_id = special_tokens.index("<mask>")
self._pad_id = special_tokens.index("<pad>")
self._cls_id = special_tokens.index("<cls>")
self._sep_id = special_tokens.index("<sep>")

self._vocab_size = vocab_size
print(f'{self._vocab_size = }')
self.num_special_tokens = num_special_tokens
special_filler = [SPECIAL_TOKEN_TEMPLATE.format(id=i) for i in range(len(special_tokens), num_special_tokens)]
self.special_filler = special_filler
if special_filler:
print(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}")
self.special_tokens = special_tokens + special_filler
assert len(set(self.special_tokens)) == len(self.special_tokens) == num_special_tokens, self.special_tokens
self.inner_vocab_size = vocab_size - num_special_tokens

# reload vocab
self.token2id = reload_mergeable_ranks(vocab_file, max_vocab=self.inner_vocab_size)
self.id2token = {v: k for k, v in self.token2id.items()}
assert set(range(self.inner_vocab_size)) == set(self.id2token.keys())
if not encoding_name:
if not vocab_file or not os.path.exists(vocab_file):
raise ValueError(f"vocab_file: {vocab_file} is invalid")

if vocab_file is not None:
if special_tokens is None:
special_tokens = SPECIAL_TOKENS.copy()

assert len(special_tokens) == len(
set(special_tokens)
), f"Special tokens should be unique: {special_tokens}"
assert len(special_tokens) <= num_special_tokens < vocab_size
assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}"

self._unk_id = special_tokens.index("<unk>")
self._bos_id = special_tokens.index("<s>")
self._eos_id = special_tokens.index("</s>")
self._mask_id = special_tokens.index("<mask>")
self._pad_id = special_tokens.index("<pad>")
self._cls_id = special_tokens.index("<cls>")
self._sep_id = special_tokens.index("<sep>")

# reload vocab
self._vocab_size = vocab_size
self.inner_vocab_size = self._vocab_size - num_special_tokens
self.token2id = reload_mergeable_ranks(vocab_file, max_vocab=self.inner_vocab_size)
tokenizer_name = Path(vocab_file).parent.name

print(f'{self._vocab_size = }')
self.num_special_tokens = num_special_tokens
special_filler = [
SPECIAL_TOKEN_TEMPLATE.format(id=i) for i in range(len(special_tokens), num_special_tokens)
]
self.special_filler = special_filler
if special_filler:
print(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}")
self.special_tokens = special_tokens + special_filler
assert len(set(self.special_tokens)) == len(self.special_tokens) == num_special_tokens, self.special_tokens
encoding_special_tokens = {} # special tokens are handled manually
self.allowed_special = set()
else:
tokenizer_base = tiktoken.get_encoding(encoding_name)
self.token2id = tokenizer_base._mergeable_ranks
pattern = tokenizer_base._pat_str
tokenizer_name = encoding_name
self.inner_vocab_size = len(tokenizer_base._mergeable_ranks) + len(tokenizer_base._special_tokens)
self.num_special_tokens = 0 # special tokens handled inside tiktoken
self._vocab_size = self.inner_vocab_size
self.special_filler = []
self.special_tokens = []
self._bos_id = tokenizer_base.encode(bos_token, allowed_special="all")
self._eos_id = tokenizer_base.encode(eos_token, allowed_special="all")
self._pad_id = tokenizer_base.encode(pad_token, allowed_special="all")
self._unk_id = -1
self._mask_id = -1
self._cls_id = -1
self._sep_id = -1
self.allowed_special = "all"
encoding_special_tokens = tokenizer_base._special_tokens

id2token = {v: k for k, v in self.token2id.items()}
assert set(range(self.inner_vocab_size)) == set(id2token.keys())

self.shifted_id2token = {i: tok for i, tok in enumerate(self.special_tokens)}
for key, value in self.id2token.items():
for key, value in id2token.items():
self.shifted_id2token[key + self.num_special_tokens] = value.decode('utf-8', errors='replace')

self.tokenizer = tiktoken.Encoding(
name=Path(vocab_file).parent.name,
name=tokenizer_name,
pat_str=pattern,
mergeable_ranks=self.token2id,
special_tokens={}, # special tokens are handled manually
special_tokens=encoding_special_tokens,
)

def text_to_tokens(self, text: str):
token_ids = self.tokenizer.encode(text)
token_ids = self.tokenizer.encode(text, allowed_special=self.allowed_special)
return [self.tokenizer.decode_single_token_bytes(token) for token in token_ids]

def tokens_to_text(self, tokens: List[int]):
Expand Down Expand Up @@ -165,7 +221,7 @@ def ids_to_tokens(self, token_ids):
return tokens

def text_to_ids(self, text: str):
tokens = self.tokenizer.encode(text)
tokens = self.tokenizer.encode(text, allowed_special=self.allowed_special)
tokens = [t + self.num_special_tokens for t in tokens]
return tokens

Expand Down
8 changes: 8 additions & 0 deletions nemo/collections/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@
GPTConfig126M,
GPTConfig175B,
GPTModel,
GPTOSSConfig,
GPTOSSConfig20B,
GPTOSSConfig120B,
GPTOSSModel,
HFAutoModelForCausalLM,
Hyena1bConfig,
Hyena7bARCLongContextConfig,
Expand Down Expand Up @@ -308,6 +312,10 @@
"CodeLlamaConfig70B",
"LlamaModel",
"LlamaNemotronModel",
"GPTOSSConfig",
"GPTOSSConfig120B",
"GPTOSSConfig20B",
"GPTOSSModel",
"GemmaConfig",
"GemmaConfig2B",
"GemmaConfig7B",
Expand Down
26 changes: 13 additions & 13 deletions nemo/collections/llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,12 +603,6 @@ def deploy(
enable_flash_decode: bool = True,
legacy_ckpt: bool = False,
):
warnings.warn(
"The 'deploy' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
"""
Deploys nemo model on a PyTriton server either "in-framework" or by converting to trtllm depending on the backend.
This deploy method is intended to be used for evaluation.
Expand Down Expand Up @@ -656,6 +650,12 @@ def deploy(
the trtllm backend).
legacy_ckpt (bool): Indicates whether the checkpoint is in the legacy format. Default: False
"""
warnings.warn(
"The 'deploy' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
import os

import uvicorn
Expand Down Expand Up @@ -797,12 +797,6 @@ def evaluate(
eval_cfg: EvaluationConfig = EvaluationConfig(type="gsm8k"),
adapter_cfg: AdapterConfig | None = None,
) -> dict:
warnings.warn(
"The 'evaluate' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
"""
Evaluates nemo model deployed on PyTriton server using nvidia-lm-eval

Expand All @@ -813,6 +807,12 @@ def evaluate(
adapter_cfg (AdapterConfig): configuration for adapters, the object between benchmark and endpoint.
Default: None.
"""
warnings.warn(
"The 'evaluate' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
from nemo.collections.llm.evaluation.base import _legacy_evaluate, find_framework, wait_for_fastapi_server

if target_cfg.api_endpoint.nemo_checkpoint_path is not None:
Expand Down Expand Up @@ -1184,7 +1184,7 @@ def generate(
)

if trainer.strategy.expert_model_parallel_size > 1:
gathered_results = results_on_this_dp_rank
gathered_results = [r.generated_text if text_only else r for r in results_on_this_dp_rank]
else:
gathered_results = [None] * dp_size

Expand Down
5 changes: 5 additions & 0 deletions nemo/collections/llm/gpt/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
Gemma3Config27B,
Gemma3Model,
)
from nemo.collections.llm.gpt.model.gpt_oss import GPTOSSConfig, GPTOSSConfig20B, GPTOSSConfig120B, GPTOSSModel
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
from nemo.collections.llm.gpt.model.hf_llama_embedding import get_llama_bidirectional_hf_model
from nemo.collections.llm.gpt.model.hyena import (
Expand Down Expand Up @@ -236,6 +237,10 @@
"LlamaEmbeddingModel",
"Llama32EmbeddingConfig1B",
"Llama32EmbeddingConfig3B",
"GPTOSSConfig",
"GPTOSSConfig120B",
"GPTOSSConfig20B",
"GPTOSSModel",
"Phi3Config",
"Phi3ConfigMini",
"Phi3Model",
Expand Down
6 changes: 6 additions & 0 deletions nemo/collections/llm/gpt/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer.dot_product_attention import DotProductAttention as MCoreDotProductAttention
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import get_batch_on_this_cp_rank
Expand Down Expand Up @@ -384,6 +386,10 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None, vp_sta
kwargs = {"mtp_block_spec": mtp_block_spec(self, vp_stage=vp_stage)}
else:
kwargs = {}

if self.attention_backend == AttnBackend.local:
if hasattr(transformer_layer_spec, 'submodules'):
transformer_layer_spec.submodules.self_attention.submodules.core_attention = MCoreDotProductAttention
with model_init_device_context():
model = MCoreGPTModel(
self,
Expand Down
Loading
Loading