1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -1,4 +1,5 @@
.github/ @nvidia-nemo/automation
codecov.yml @nvidia-nemo/automation
docker/ @nvidia-nemo/automation
pyproject.toml @nvidia-nemo/automation

54 changes: 27 additions & 27 deletions nemo_automodel/_transformers/auto_tokenizer.py
@@ -17,13 +17,32 @@

from transformers import AutoConfig, AutoTokenizer

from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
from nemo_automodel._transformers.tokenization.registry import TokenizerRegistry

logger = logging.getLogger(__name__)


class NeMoAutoTokenizer:
def _get_model_type(pretrained_model_name_or_path: str, trust_remote_code: bool = False) -> Optional[str]:
"""
Determine the model type from the config.

Args:
pretrained_model_name_or_path: Model identifier or path
trust_remote_code: Whether to trust remote code

Returns:
The model_type string, or None if it cannot be determined
"""
try:
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
return getattr(config, "model_type", None)
except Exception as e:
logger.debug(f"Could not load config to determine model type: {e}")
return None


class NeMoAutoTokenizer(AutoTokenizer):
"""
Auto tokenizer class that dispatches to appropriate tokenizer implementations.

@@ -32,7 +51,7 @@ class NeMoAutoTokenizer:

The dispatch logic is:
1. If a custom tokenizer is registered for the model type, use it
2. Otherwise, fall back to AutoTokenizerWithBosEosEnforced
2. Otherwise, fall back to NeMoAutoTokenizerWithBosEosEnforced

Example:
>>> # Will use MistralCommonBackend if available for Mistral models
@@ -77,7 +96,7 @@ def from_pretrained(

Args:
pretrained_model_name_or_path: Model identifier or path
force_default: If True, always use AutoTokenizerWithBosEosEnforced
force_default: If True, always use NeMoAutoTokenizerWithBosEosEnforced
force_hf: If True, return the raw HF AutoTokenizer without any wrapping
trust_remote_code: Whether to trust remote code when loading config
**kwargs: Additional arguments passed to the tokenizer's from_pretrained
@@ -87,45 +106,26 @@
"""
# If force_hf, just use the base HF AutoTokenizer
if force_hf:
return AutoTokenizer.from_pretrained(
return super().from_pretrained(
pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
)

# Try to determine model type from config
model_type = cls._get_model_type(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
model_type = _get_model_type(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)

if model_type and cls._registry.has_custom_tokenizer(model_type):
tokenizer_cls = cls._registry.get_tokenizer_cls(model_type)
logger.info(f"Using custom tokenizer {tokenizer_cls.__name__} for model type '{model_type}'")
return tokenizer_cls.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

# Fall back to default BOS/EOS enforced tokenizer
return AutoTokenizerWithBosEosEnforced.from_pretrained(
return NeMoAutoTokenizerWithBosEosEnforced.from_pretrained(
pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
)

@classmethod
def _get_model_type(cls, pretrained_model_name_or_path: str, trust_remote_code: bool = False) -> Optional[str]:
"""
Determine the model type from the config.

Args:
pretrained_model_name_or_path: Model identifier or path
trust_remote_code: Whether to trust remote code

Returns:
The model_type string, or None if it cannot be determined
"""
try:
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
return getattr(config, "model_type", None)
except Exception as e:
logger.debug(f"Could not load config to determine model type: {e}")
return None


__all__ = [
"NeMoAutoTokenizer",
"AutoTokenizerWithBosEosEnforced",
"NeMoAutoTokenizerWithBosEosEnforced",
"TokenizerRegistry",
]
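
A minimal usage sketch of the dispatch behavior described in the NeMoAutoTokenizer docstring above; the checkpoint name is a placeholder and the actual dispatch depends on what is registered in the tokenizer registry.

# Sketch only: "google/gemma-2-9b-it" is an illustrative checkpoint, not part of this PR.
from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer

# force_hf=True bypasses all wrapping and returns the plain HF AutoTokenizer result.
hf_tok = NeMoAutoTokenizer.from_pretrained("google/gemma-2-9b-it", force_hf=True)

# Default path: with no custom tokenizer registered for the model type,
# this falls back to NeMoAutoTokenizerWithBosEosEnforced.
tok = NeMoAutoTokenizer.from_pretrained("google/gemma-2-9b-it")
ids = tok.encode("Hello")  # BOS/EOS enforcement applies unless add_special_tokens=False
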
56 changes: 13 additions & 43 deletions nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py
@@ -16,7 +16,7 @@
from transformers.tokenization_utils_base import BatchEncoding


class AutoTokenizerWithBosEosEnforced(AutoTokenizer):
class NeMoAutoTokenizerWithBosEosEnforced(AutoTokenizer):
"""
A wrapper around HuggingFace's AutoTokenizer that ensures consistent BOS/EOS token handling.

@@ -25,59 +25,29 @@ class AutoTokenizerWithBosEosEnforced(AutoTokenizer):
"""

@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path, *args, force_hf=False, add_bos_token=True, add_eos_token=True, **kwargs
):
def from_pretrained(cls, pretrained_model_name_or_path, *args, add_bos_token=True, add_eos_token=True, **kwargs):
"""
Load the HF tokenizer class via AutoTokenizer and (optionally) wrap it to add BOS/EOS.

Args:
pretrained_model_name_or_path: Model identifier or path
force_hf: If True, return the raw HF tokenizer without wrapping
add_bos_token: Whether to add BOS token (default: True)
add_eos_token: Whether to add EOS token (default: True)
"""
hf_tok = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
if force_hf:
return hf_tok

return cls(hf_tok, add_bos_token=add_bos_token, add_eos_token=add_eos_token)

def __init__(self, base_tokenizer, *, add_bos_token: bool, add_eos_token: bool):
self._base_tokenizer = base_tokenizer
self._add_bos = bool(add_bos_token)
self._add_eos = bool(add_eos_token)

@property
def add_bos_token(self):
return self._add_bos

@property
def add_eos_token(self):
return self._add_eos

def __getattr__(self, name):
base = object.__getattribute__(self, "_base_tokenizer")
return getattr(base, name)

def __setattr__(self, name, value):
# Route writes to the underlying tokenizer when appropriate
internal_fields = {"_base_tokenizer", "_add_bos", "_add_eos"}
if name in internal_fields:
return object.__setattr__(self, name, value)
base = self.__dict__.get("_base_tokenizer", None)
if base is not None and hasattr(base, name):
return setattr(base, name, value)
return object.__setattr__(self, name, value)
tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
tokenizer.add_bos_token = add_bos_token
tokenizer.add_eos_token = add_eos_token
tokenizer.__class__ = type("NeMoAutoTokenizerWithBosEosEnforced", (cls, type(tokenizer)), {})
return tokenizer

def __call__(self, *args, **kwargs):
tokenized = self._base_tokenizer(*args, **kwargs)
tokenized = super().__call__(*args, **kwargs)
if not kwargs.get("add_special_tokens", True):
return tokenized
if isinstance(tokenized, BatchEncoding):
_tokenized_keys = {"input_ids", "attention_mask", "assistant_masks"}
add_bos_ids = self._add_bos and (getattr(self, "bos_token_id", None) is not None)
add_eos_ids = self._add_eos and (getattr(self, "eos_token_id", None) is not None)
add_bos_ids = self.add_bos_token and (getattr(self, "bos_token_id", None) is not None)
add_eos_ids = self.add_eos_token and (getattr(self, "eos_token_id", None) is not None)
if not "input_ids" in tokenized:
return tokenized
if add_bos_ids:
@@ -95,13 +65,13 @@ def __call__(self, *args, **kwargs):
return tokenized

def encode(self, *args, **kwargs):
encoded = self._base_tokenizer.encode(*args, **kwargs)
encoded = super().encode(*args, **kwargs)
if not kwargs.get("add_special_tokens", True):
return encoded
if self._add_bos:
if self.add_bos_token:
if encoded and (getattr(self, "bos_token_id", None) is not None) and encoded[0] != self.bos_token_id:
encoded = [self.bos_token_id] + encoded
if self._add_eos:
if self.add_eos_token:
if encoded and (getattr(self, "eos_token_id", None) is not None) and encoded[-1] != self.eos_token_id:
encoded = encoded + [self.eos_token_id]
return encoded
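
A short sketch of the BOS/EOS enforcement that the rewritten __call__ and encode provide, assuming the underlying tokenizer defines both bos_token_id and eos_token_id; the model name is illustrative.

# Sketch: assumes the loaded tokenizer has bos_token_id and eos_token_id set;
# "meta-llama/Llama-3.2-1B" is an illustrative checkpoint.
from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import (
    NeMoAutoTokenizerWithBosEosEnforced,
)

tok = NeMoAutoTokenizerWithBosEosEnforced.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    add_bos_token=True,
    add_eos_token=True,
)
ids = tok.encode("Hello world")
assert ids[0] == tok.bos_token_id   # prepended when missing
assert ids[-1] == tok.eos_token_id  # appended when missing

raw = tok.encode("Hello world", add_special_tokens=False)  # enforcement skipped
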
4 changes: 2 additions & 2 deletions nemo_automodel/_transformers/tokenization/registry.py
@@ -16,7 +16,7 @@
from dataclasses import dataclass, field
from typing import Callable, Dict, Type, Union

from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced

logger = logging.getLogger(__name__)

@@ -33,7 +33,7 @@ class _TokenizerRegistry:
model_type_to_tokenizer: Dict[str, Union[Type, Callable]] = field(default_factory=dict)

# Default tokenizer class when no custom implementation is found
default_tokenizer_cls: Type = AutoTokenizerWithBosEosEnforced
default_tokenizer_cls: Type = NeMoAutoTokenizerWithBosEosEnforced

def register(self, model_type: str, tokenizer_cls: Union[Type, Callable]) -> None:
"""
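
A hedged registration sketch based on the register() method shown above; MyCustomTokenizer and the "mistral" model type are hypothetical, and it assumes NeMoAutoTokenizer._registry is the registry instance consulted in auto_tokenizer.py.

# Hypothetical example: only register(), has_custom_tokenizer() and get_tokenizer_cls()
# come from this PR; MyCustomTokenizer is a stand-in class.
from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer

class MyCustomTokenizer:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        ...  # custom loading logic

NeMoAutoTokenizer._registry.register("mistral", MyCustomTokenizer)
# NeMoAutoTokenizer.from_pretrained(...) will now dispatch to MyCustomTokenizer
# for checkpoints whose config reports model_type == "mistral".
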
3 changes: 1 addition & 2 deletions nemo_automodel/components/datasets/llm/squad.py
@@ -50,8 +50,7 @@ def _formatting_prompts_func_with_chat_template(
answer = example["answers"]["text"][0].strip()

formatted_text = [
{"role": "system", "content": context},
{"role": "user", "content": question},
{"role": "user", "content": f"Context: {context} Question: {question} Answer: "},
{"role": "assistant", "content": answer},
]
return format_chat_template(
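
For illustration, the updated SQuAD formatting collapses the former system/user pair into a single user turn, which also suits chat templates (such as Gemma-2's) that reject a system role; the field values below are invented.

# Illustrative output of the new formatting for a made-up record.
formatted_text = [
    {"role": "user", "content": "Context: The Eiffel Tower is in Paris. Question: Where is the Eiffel Tower? Answer: "},
    {"role": "assistant", "content": "Paris"},
]
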
14 changes: 1 addition & 13 deletions nemo_automodel/recipes/base_recipe.py
@@ -30,8 +30,6 @@
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils import PreTrainedTokenizerBase

from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
from nemo_automodel.components.checkpoint.checkpointing import save_config
from nemo_automodel.components.config.loader import ConfigNode
from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler
@@ -77,17 +75,7 @@ def is_tokenizer(object):
Returns:
bool: returns True if object is a tokenizer or VLM processor.
"""
# Note: some NeMo flows wrap HF tokenizers (e.g., BOS/EOS enforcement wrapper). Those
# wrappers still implement `save_pretrained()` via delegation and should be checkpointed.
return isinstance(
object,
(
PreTrainedTokenizerBase,
ProcessorMixin,
NeMoAutoTokenizer,
AutoTokenizerWithBosEosEnforced,
),
)
return isinstance(object, (PreTrainedTokenizerBase, ProcessorMixin))


def is_lr_scheduler(object):
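
Why the extra isinstance targets can be dropped (reasoning, not part of the PR): the new NeMoAutoTokenizerWithBosEosEnforced.from_pretrained rebuilds the returned object's class on top of the concrete HF tokenizer class, so the result remains a PreTrainedTokenizerBase subclass and the plain check still covers it. A minimal sketch, assuming a placeholder checkpoint:

# Sketch: the checkpoint name is a placeholder.
from transformers.tokenization_utils import PreTrainedTokenizerBase
from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer

tok = NeMoAutoTokenizer.from_pretrained("google/gemma-2-9b-it")
assert isinstance(tok, PreTrainedTokenizerBase)  # dynamic class keeps the HF base class
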
92 changes: 92 additions & 0 deletions tests/functional_tests/datasets/llm/test_nemo_autotokenizer.py
@@ -0,0 +1,92 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from jinja2.exceptions import TemplateError

from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer

GEMMA_TOKENIZER_PATH = "/home/TestData/automodel/tokenizers/gemma-2-9b-it"


@pytest.fixture
def conversation_with_system_role():
"""A conversation that includes a system role message."""
return [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello, how are you?"},
{"role": "assistant", "content": "I'm doing well, thank you!"},
{"role": "user", "content": "What is the capital of France?"},
]

@pytest.fixture
def conversation_with_multiple_system_roles():
"""A conversation that includes multiple system role messages."""
return [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello, how are you?"},
{"role": "system", "content": "You are a helpful assistant."},
]


@pytest.mark.parametrize("force_hf", [True, False])
def test_gemma_tokenizer_system_role_handling(force_hf, conversation_with_system_role):
"""
Test that NeMoAutoTokenizer handles system role correctly for Gemma-2.

- force_hf=True: Returns raw HF tokenizer which raises TemplateError on system role
- force_hf=False: Returns NeMoAutoTokenizer wrapper which maps system->assistant
"""
tokenizer = NeMoAutoTokenizer.from_pretrained(GEMMA_TOKENIZER_PATH, force_hf=force_hf)

if force_hf:
assert not isinstance(tokenizer, NeMoAutoTokenizer)
# Raw HF tokenizer should raise TemplateError for system role
with pytest.raises(TemplateError, match="System role not supported"):
tokenizer.apply_chat_template(
conversation_with_system_role,
tokenize=True,
add_generation_prompt=True,
)
else:
assert isinstance(tokenizer, NeMoAutoTokenizer)
# NeMoAutoTokenizer should handle system role gracefully (maps to assistant, then drops)
result = tokenizer.apply_chat_template(
conversation_with_system_role,
tokenize=True,
add_generation_prompt=True,
)
assert result is not None
assert isinstance(result, list)
assert len(result) > 0

@pytest.mark.parametrize("force_hf", [True, False])
def test_gemma_tokenizer_system_role_handling_with_multiple_system_roles(force_hf, conversation_with_multiple_system_roles):
"""
Test that NeMoAutoTokenizer handles system role correctly for Gemma-2.

- force_hf=True: Returns raw HF tokenizer which raises TemplateError on system role
- force_hf=False: Returns NeMoAutoTokenizer wrapper which maps system->assistant
"""
tokenizer = NeMoAutoTokenizer.from_pretrained(GEMMA_TOKENIZER_PATH, force_hf=force_hf)

if force_hf:
assert not isinstance(tokenizer, NeMoAutoTokenizer)
# Raw HF tokenizer should raise TemplateError for system role
with pytest.raises(TemplateError, match="System role not supported"):
tokenizer.apply_chat_template(conversation_with_multiple_system_roles, tokenize=True, add_generation_prompt=True)
else:
assert isinstance(tokenizer, NeMoAutoTokenizer)
with pytest.raises(ValueError, match="System role appeared in multiple messages."):
tokenizer.apply_chat_template(conversation_with_multiple_system_roles, tokenize=True, add_generation_prompt=True)
@@ -96,14 +96,14 @@ def test_format_prompt_completion_options(seq_length, padding, truncation):

# Attention mask should have zeros only in padded tail (if any)
if isinstance(seq_length, int):
# From the end, once we see a 0, the rest must be 0
seen_zero = False
# From the end, once we see a non-zero, no zeros should appear (right padding)
seen_nonzero = False
for v in reversed(out["attention_mask"]):
if v == 0:
seen_zero = True
if v != 0:
seen_nonzero = True
else:
if seen_zero:
pytest.fail("Non-zero attention_mask value after padded zeros.")
if seen_nonzero:
pytest.fail("Zero attention_mask value before non-padded tokens (padding not only in tail). ")


@pytest.mark.parametrize(
@@ -170,12 +170,13 @@ def test_format_chat_template_options(seq_length, padding, truncation):

# Attention mask padded tail zeros, if padded
if isinstance(seq_length, int) and truncation == False:
seen_zero = False
# From the end, once we see a non-zero, no zeros should appear (right padding)
seen_nonzero = False
for v in reversed(out["attention_mask"]):
if v == 0:
seen_zero = True
if v != 0:
seen_nonzero = True
else:
if seen_zero:
pytest.fail("Non-zero attention_mask value after padded zeros.")
if seen_nonzero:
pytest.fail("Zero attention_mask value before non-padded tokens (padding not only in tail).")