Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6ef9392
push current code
cuichenx Jun 16, 2025
abf6aa6
fix tiktoken
cuichenx Jul 2, 2025
9343c5e
modelopt changes
cuichenx Jul 7, 2025
b6a259b
add recipe
cuichenx Jul 9, 2025
cd7b3c4
inference updates
cuichenx Jul 9, 2025
9cb2ffb
rename model, modify importer for latest checkpoint
cuichenx Jul 10, 2025
67d960a
add 20b recipe
cuichenx Jul 16, 2025
931cff7
minor fixes
cuichenx Jul 16, 2025
9fc2fb0
correct dropout values
cuichenx Jul 17, 2025
e4b9667
Add flops calculator for gpt_oss
guyueh1 Jul 21, 2025
31bff1b
misc
cuichenx Jul 23, 2025
3009f7c
change activation function name and refactor glu +1
cuichenx Jul 31, 2025
f34d3c1
enable (weighted) bias activation fusion
cuichenx Aug 1, 2025
f523a3b
unify te and local naming
cuichenx Aug 2, 2025
5e411ea
update importer exporter
cuichenx Aug 5, 2025
4ccef27
update importer
cuichenx Aug 5, 2025
8f7b826
dequantize in nemo instead of hf for now
cuichenx Aug 5, 2025
6afc481
Rm force using MCORE DPA; add support for gpus_per_node for cw
guyueh1 Aug 6, 2025
7d423d6
update recipe
cuichenx Aug 7, 2025
63d4561
enable lora merge for te grouped linear
cuichenx Aug 7, 2025
847dfc9
Revert back to MCoreDPA
guyueh1 Aug 7, 2025
7ab8aa8
add clamp value as a config
cuichenx Aug 8, 2025
6a66d13
turn off async save for sft
cuichenx Aug 11, 2025
8e78b5a
Apply isort and black reformatting
cuichenx Aug 11, 2025
d0d6dd1
remove gpt-oss specific values in tiktoken
cuichenx Aug 11, 2025
7b7d830
linting
cuichenx Aug 11, 2025
1ed123a
Apply isort and black reformatting
cuichenx Aug 11, 2025
be624e5
linting
cuichenx Aug 11, 2025
d9b3167
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 11, 2025
a90f960
address comments
cuichenx Aug 12, 2025
9b70c7c
Merge branch 'main' into chcui/gpt_oss
cuichenx Aug 12, 2025
d794762
address comments
cuichenx Aug 13, 2025
2b8b6b5
Merge branch 'main' into chcui/gpt_oss
cuichenx Aug 14, 2025
e204f59
add megatron version guard
cuichenx Aug 18, 2025
03231fd
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 18, 2025
8cd04bb
Apply isort and black reformatting
cuichenx Aug 18, 2025
9888bd6
add megatron version guard
cuichenx Aug 18, 2025
9a8e767
Merge remote-tracking branch 'origin/chcui/gpt_oss' into chcui/gpt_oss
cuichenx Aug 18, 2025
4a9a68d
guard window_attn_skip_freq
cuichenx Aug 18, 2025
6434914
fix tests
cuichenx Aug 18, 2025
bcb7ffc
Apply isort and black reformatting
cuichenx Aug 18, 2025
b4ddd57
fix tests
cuichenx Aug 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 97 additions & 41 deletions nemo/collections/common/tokenizers/tiktoken_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,78 +62,134 @@ def reload_mergeable_ranks(
return ranks


# pylint: disable=C0301
PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17 # 131072
SPECIAL_TOKENS = ["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]
SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"


class TiktokenTokenizer(TokenizerSpec):
# pylint: disable=C0115,C0116
"""
TiktokenTokenizer https://github.com/openai/tiktoken.

Usage 1 (vocab_file-based):
tokenizer = TiktokenTokenizer(
vocab_file="path/to/vocab.json",
vocab_size=131072,
num_special_tokens=1000,
special_tokens=["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"],
)

Usage 2 (encoding_name-based):
tokenizer = TiktokenTokenizer(
encoding_name="o200_harmony",
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
pad_token="<|endoftext|>",
)

Args:
model_path: path to tokenizer vocabulary
vocab_file: path to tokenizer vocabulary
encoding_name: name of the encoding to use
pattern: Regex pattern to split the text
vocab_size: size of the vocabulary
num_special_tokens: number of special tokens to generate
special_tokens: template for user-defined special tokens
pattern: Regex pattern to split the text
bos_token: beginning of sentence token
eos_token: end of sentence token
pad_token: padding token (default is eos_token)
"""

def __init__(
self,
vocab_file: str,
vocab_file: Optional[str] = None,
encoding_name: Optional[str] = None,
pattern: str = PATTERN_TIKTOKEN,
vocab_size: int = DEFAULT_TIKTOKEN_MAX_VOCAB, # 131072
num_special_tokens: int = 1000,
special_tokens: Optional[List[str]] = None,
bos_token: str = "<|startoftext|>",
eos_token: str = "<|endoftext|>",
pad_token: str = "<|endoftext|>",
):
if not vocab_file or not os.path.exists(vocab_file):
raise ValueError(f"vocab_file: {vocab_file} is invalid")

if special_tokens is None:
special_tokens = SPECIAL_TOKENS.copy()

assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}"
assert len(special_tokens) <= num_special_tokens < vocab_size
assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}"

self._unk_id = special_tokens.index("<unk>")
self._bos_id = special_tokens.index("<s>")
self._eos_id = special_tokens.index("</s>")
self._mask_id = special_tokens.index("<mask>")
self._pad_id = special_tokens.index("<pad>")
self._cls_id = special_tokens.index("<cls>")
self._sep_id = special_tokens.index("<sep>")

self._vocab_size = vocab_size
print(f'{self._vocab_size = }')
self.num_special_tokens = num_special_tokens
special_filler = [SPECIAL_TOKEN_TEMPLATE.format(id=i) for i in range(len(special_tokens), num_special_tokens)]
self.special_filler = special_filler
if special_filler:
print(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}")
self.special_tokens = special_tokens + special_filler
assert len(set(self.special_tokens)) == len(self.special_tokens) == num_special_tokens, self.special_tokens
self.inner_vocab_size = vocab_size - num_special_tokens

# reload vocab
self.token2id = reload_mergeable_ranks(vocab_file, max_vocab=self.inner_vocab_size)
self.id2token = {v: k for k, v in self.token2id.items()}
assert set(range(self.inner_vocab_size)) == set(self.id2token.keys())
if not encoding_name:
if not vocab_file or not os.path.exists(vocab_file):
raise ValueError(f"vocab_file: {vocab_file} is invalid")

if vocab_file is not None:
if special_tokens is None:
special_tokens = SPECIAL_TOKENS.copy()

assert len(special_tokens) == len(
set(special_tokens)
), f"Special tokens should be unique: {special_tokens}"
assert len(special_tokens) <= num_special_tokens < vocab_size
assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}"

self._unk_id = special_tokens.index("<unk>")
self._bos_id = special_tokens.index("<s>")
self._eos_id = special_tokens.index("</s>")
self._mask_id = special_tokens.index("<mask>")
self._pad_id = special_tokens.index("<pad>")
self._cls_id = special_tokens.index("<cls>")
self._sep_id = special_tokens.index("<sep>")

# reload vocab
self._vocab_size = vocab_size
self.inner_vocab_size = self._vocab_size - num_special_tokens
self.token2id = reload_mergeable_ranks(vocab_file, max_vocab=self.inner_vocab_size)
tokenizer_name = Path(vocab_file).parent.name

print(f'{self._vocab_size = }')
self.num_special_tokens = num_special_tokens
special_filler = [
SPECIAL_TOKEN_TEMPLATE.format(id=i) for i in range(len(special_tokens), num_special_tokens)
]
self.special_filler = special_filler
if special_filler:
print(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}")
self.special_tokens = special_tokens + special_filler
assert len(set(self.special_tokens)) == len(self.special_tokens) == num_special_tokens, self.special_tokens
encoding_special_tokens = {} # special tokens are handled manually
self.allowed_special = set()
else:
tokenizer_base = tiktoken.get_encoding(encoding_name)
self.token2id = tokenizer_base._mergeable_ranks
pattern = tokenizer_base._pat_str
tokenizer_name = encoding_name
self.inner_vocab_size = len(tokenizer_base._mergeable_ranks) + len(tokenizer_base._special_tokens)
self.num_special_tokens = 0 # special tokens handled inside tiktoken
self._vocab_size = self.inner_vocab_size
self.special_filler = []
self.special_tokens = []
self._bos_id = tokenizer_base.encode(bos_token, allowed_special="all")
self._eos_id = tokenizer_base.encode(eos_token, allowed_special="all")
self._pad_id = tokenizer_base.encode(pad_token, allowed_special="all")
self._unk_id = -1
self._mask_id = -1
self._cls_id = -1
self._sep_id = -1
self.allowed_special = "all"
encoding_special_tokens = tokenizer_base._special_tokens

id2token = {v: k for k, v in self.token2id.items()}
assert set(range(self.inner_vocab_size)) == set(id2token.keys())

self.shifted_id2token = {i: tok for i, tok in enumerate(self.special_tokens)}
for key, value in self.id2token.items():
for key, value in id2token.items():
self.shifted_id2token[key + self.num_special_tokens] = value.decode('utf-8', errors='replace')

self.tokenizer = tiktoken.Encoding(
name=Path(vocab_file).parent.name,
name=tokenizer_name,
pat_str=pattern,
mergeable_ranks=self.token2id,
special_tokens={}, # special tokens are handled manually
special_tokens=encoding_special_tokens,
)

def text_to_tokens(self, text: str):
token_ids = self.tokenizer.encode(text)
token_ids = self.tokenizer.encode(text, allowed_special=self.allowed_special)
return [self.tokenizer.decode_single_token_bytes(token) for token in token_ids]

def tokens_to_text(self, tokens: List[int]):
Expand Down Expand Up @@ -165,7 +221,7 @@ def ids_to_tokens(self, token_ids):
return tokens

def text_to_ids(self, text: str):
tokens = self.tokenizer.encode(text)
tokens = self.tokenizer.encode(text, allowed_special=self.allowed_special)
tokens = [t + self.num_special_tokens for t in tokens]
return tokens

Expand Down
8 changes: 8 additions & 0 deletions nemo/collections/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@
GPTConfig126M,
GPTConfig175B,
GPTModel,
GPTOSSConfig,
GPTOSSConfig20B,
GPTOSSConfig120B,
GPTOSSModel,
HFAutoModelForCausalLM,
Hyena1bConfig,
Hyena7bARCLongContextConfig,
Expand Down Expand Up @@ -308,6 +312,10 @@
"CodeLlamaConfig70B",
"LlamaModel",
"LlamaNemotronModel",
"GPTOSSConfig",
"GPTOSSConfig120B",
"GPTOSSConfig20B",
"GPTOSSModel",
"GemmaConfig",
"GemmaConfig2B",
"GemmaConfig7B",
Expand Down
26 changes: 13 additions & 13 deletions nemo/collections/llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,12 +603,6 @@ def deploy(
enable_flash_decode: bool = True,
legacy_ckpt: bool = False,
):
warnings.warn(
"The 'deploy' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
"""
Deploys nemo model on a PyTriton server either "in-framework" or by converting to trtllm depending on the backend.
This deploy method is intended to be used for evaluation.
Expand Down Expand Up @@ -656,6 +650,12 @@ def deploy(
the trtllm backend).
legacy_ckpt (bool): Indicates whether the checkpoint is in the legacy format. Default: False
"""
warnings.warn(
"The 'deploy' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
import os

import uvicorn
Expand Down Expand Up @@ -797,12 +797,6 @@ def evaluate(
eval_cfg: EvaluationConfig = EvaluationConfig(type="gsm8k"),
adapter_cfg: AdapterConfig | None = None,
) -> dict:
warnings.warn(
"The 'evaluate' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
"""
Evaluates nemo model deployed on PyTriton server using nvidia-lm-eval

Expand All @@ -813,6 +807,12 @@ def evaluate(
adapter_cfg (AdapterConfig): configuration for adapters, the object between benchmark and endpoint.
Default: None.
"""
warnings.warn(
"The 'evaluate' function is deprecated and will be removed in NeMo FW 25.09 container release. "
"For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Eval",
DeprecationWarning,
stacklevel=2,
)
from nemo.collections.llm.evaluation.base import _legacy_evaluate, find_framework, wait_for_fastapi_server

if target_cfg.api_endpoint.nemo_checkpoint_path is not None:
Expand Down Expand Up @@ -1184,7 +1184,7 @@ def generate(
)

if trainer.strategy.expert_model_parallel_size > 1:
gathered_results = results_on_this_dp_rank
gathered_results = [r.generated_text if text_only else r for r in results_on_this_dp_rank]
else:
gathered_results = [None] * dp_size

Expand Down
5 changes: 5 additions & 0 deletions nemo/collections/llm/gpt/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
Gemma3Config27B,
Gemma3Model,
)
from nemo.collections.llm.gpt.model.gpt_oss import GPTOSSConfig, GPTOSSConfig20B, GPTOSSConfig120B, GPTOSSModel
from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
from nemo.collections.llm.gpt.model.hf_llama_embedding import get_llama_bidirectional_hf_model
from nemo.collections.llm.gpt.model.hyena import (
Expand Down Expand Up @@ -236,6 +237,10 @@
"LlamaEmbeddingModel",
"Llama32EmbeddingConfig1B",
"Llama32EmbeddingConfig3B",
"GPTOSSConfig",
"GPTOSSConfig120B",
"GPTOSSConfig20B",
"GPTOSSModel",
"Phi3Config",
"Phi3ConfigMini",
"Phi3Model",
Expand Down
6 changes: 6 additions & 0 deletions nemo/collections/llm/gpt/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer.dot_product_attention import DotProductAttention as MCoreDotProductAttention
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import get_batch_on_this_cp_rank
Expand Down Expand Up @@ -384,6 +386,10 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None, vp_sta
kwargs = {"mtp_block_spec": mtp_block_spec(self, vp_stage=vp_stage)}
else:
kwargs = {}

if self.attention_backend == AttnBackend.local:
if hasattr(transformer_layer_spec, 'submodules'):
transformer_layer_spec.submodules.self_attention.submodules.core_attention = MCoreDotProductAttention
with model_init_device_context():
model = MCoreGPTModel(
self,
Expand Down
Loading
Loading