diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 32dbaa298..49a25ef84 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,4 +1,5 @@
 .github/ @nvidia-nemo/automation
+codecov.yml @nvidia-nemo/automation
 docker/ @nvidia-nemo/automation
 pyproject.toml @nvidia-nemo/automation
diff --git a/nemo_automodel/_transformers/auto_tokenizer.py b/nemo_automodel/_transformers/auto_tokenizer.py
index 328e7e3f1..e9ccf6fa9 100644
--- a/nemo_automodel/_transformers/auto_tokenizer.py
+++ b/nemo_automodel/_transformers/auto_tokenizer.py
@@ -17,13 +17,32 @@
 from transformers import AutoConfig, AutoTokenizer
 
-from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
+from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
 from nemo_automodel._transformers.tokenization.registry import TokenizerRegistry
 
 logger = logging.getLogger(__name__)
 
 
-class NeMoAutoTokenizer:
+def _get_model_type(pretrained_model_name_or_path: str, trust_remote_code: bool = False) -> Optional[str]:
+    """
+    Determine the model type from the config.
+
+    Args:
+        pretrained_model_name_or_path: Model identifier or path
+        trust_remote_code: Whether to trust remote code
+
+    Returns:
+        The model_type string, or None if it cannot be determined
+    """
+    try:
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
+        return getattr(config, "model_type", None)
+    except Exception as e:
+        logger.debug(f"Could not load config to determine model type: {e}")
+        return None
+
+
+class NeMoAutoTokenizer(AutoTokenizer):
     """
     Auto tokenizer class that dispatches to appropriate tokenizer implementations.
@@ -32,7 +51,7 @@ class NeMoAutoTokenizer:
 
     The dispatch logic is:
     1. If a custom tokenizer is registered for the model type, use it
-    2. Otherwise, fall back to AutoTokenizerWithBosEosEnforced
+    2. Otherwise, fall back to NeMoAutoTokenizerWithBosEosEnforced
 
     Example:
         >>> # Will use MistralCommonBackend if available for Mistral models
@@ -77,7 +96,7 @@ def from_pretrained(
 
         Args:
             pretrained_model_name_or_path: Model identifier or path
-            force_default: If True, always use AutoTokenizerWithBosEosEnforced
+            force_default: If True, always use NeMoAutoTokenizerWithBosEosEnforced
             force_hf: If True, return the raw HF AutoTokenizer without any wrapping
             trust_remote_code: Whether to trust remote code when loading config
             **kwargs: Additional arguments passed to the tokenizer's from_pretrained
@@ -87,12 +106,12 @@ def from_pretrained(
         """
         # If force_hf, just use the base HF AutoTokenizer
        if force_hf:
-            return AutoTokenizer.from_pretrained(
+            return super().from_pretrained(
                 pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
             )
 
         # Try to determine model type from config
-        model_type = cls._get_model_type(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
+        model_type = _get_model_type(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
 
         if model_type and cls._registry.has_custom_tokenizer(model_type):
             tokenizer_cls = cls._registry.get_tokenizer_cls(model_type)
@@ -100,32 +119,13 @@ def from_pretrained(
             return tokenizer_cls.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         # Fall back to default BOS/EOS enforced tokenizer
-        return AutoTokenizerWithBosEosEnforced.from_pretrained(
+        return NeMoAutoTokenizerWithBosEosEnforced.from_pretrained(
             pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
         )
 
-    @classmethod
-    def _get_model_type(cls, pretrained_model_name_or_path: str, trust_remote_code: bool = False) -> Optional[str]:
-        """
-        Determine the model type from the config.
-
-        Args:
-            pretrained_model_name_or_path: Model identifier or path
-            trust_remote_code: Whether to trust remote code
-
-        Returns:
-            The model_type string, or None if it cannot be determined
-        """
-        try:
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
-            return getattr(config, "model_type", None)
-        except Exception as e:
-            logger.debug(f"Could not load config to determine model type: {e}")
-            return None
-
 
 __all__ = [
     "NeMoAutoTokenizer",
-    "AutoTokenizerWithBosEosEnforced",
+    "NeMoAutoTokenizerWithBosEosEnforced",
     "TokenizerRegistry",
 ]
diff --git a/nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py b/nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py
index 848f70f23..5fd85320f 100644
--- a/nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py
+++ b/nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py
@@ -16,7 +16,7 @@
 from transformers.tokenization_utils_base import BatchEncoding
 
 
-class AutoTokenizerWithBosEosEnforced(AutoTokenizer):
+class NeMoAutoTokenizerWithBosEosEnforced(AutoTokenizer):
     """
     A wrapper around HuggingFace's AutoTokenizer that ensures consistent BOS/EOS token handling.
@@ -25,59 +25,29 @@ class NeMoAutoTokenizerWithBosEosEnforced(AutoTokenizer):
     """
 
     @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path, *args, force_hf=False, add_bos_token=True, add_eos_token=True, **kwargs
-    ):
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, add_bos_token=True, add_eos_token=True, **kwargs):
         """
         Load the HF tokenizer class via AutoTokenizer and (optionally) wrap it to add BOS/EOS.
 
         Args:
             pretrained_model_name_or_path: Model identifier or path
-            force_hf: If True, return the raw HF tokenizer without wrapping
             add_bos_token: Whether to add BOS token (default: True)
             add_eos_token: Whether to add EOS token (default: True)
         """
-        hf_tok = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        if force_hf:
-            return hf_tok
-
-        return cls(hf_tok, add_bos_token=add_bos_token, add_eos_token=add_eos_token)
-
-    def __init__(self, base_tokenizer, *, add_bos_token: bool, add_eos_token: bool):
-        self._base_tokenizer = base_tokenizer
-        self._add_bos = bool(add_bos_token)
-        self._add_eos = bool(add_eos_token)
-
-    @property
-    def add_bos_token(self):
-        return self._add_bos
-
-    @property
-    def add_eos_token(self):
-        return self._add_eos
-
-    def __getattr__(self, name):
-        base = object.__getattribute__(self, "_base_tokenizer")
-        return getattr(base, name)
-
-    def __setattr__(self, name, value):
-        # Route writes to the underlying tokenizer when appropriate
-        internal_fields = {"_base_tokenizer", "_add_bos", "_add_eos"}
-        if name in internal_fields:
-            return object.__setattr__(self, name, value)
-        base = self.__dict__.get("_base_tokenizer", None)
-        if base is not None and hasattr(base, name):
-            return setattr(base, name, value)
-        return object.__setattr__(self, name, value)
+        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        tokenizer.add_bos_token = add_bos_token
+        tokenizer.add_eos_token = add_eos_token
+        tokenizer.__class__ = type("NeMoAutoTokenizerWithBosEosEnforced", (cls, type(tokenizer)), {})
+        return tokenizer
 
     def __call__(self, *args, **kwargs):
-        tokenized = self._base_tokenizer(*args, **kwargs)
+        tokenized = super().__call__(*args, **kwargs)
         if not kwargs.get("add_special_tokens", True):
             return tokenized
 
         if isinstance(tokenized, BatchEncoding):
             _tokenized_keys = {"input_ids", "attention_mask", "assistant_masks"}
-            add_bos_ids = self._add_bos and (getattr(self, "bos_token_id", None) is not None)
-            add_eos_ids = self._add_eos and (getattr(self, "eos_token_id", None) is not None)
+            add_bos_ids = self.add_bos_token and (getattr(self, "bos_token_id", None) is not None)
+            add_eos_ids = self.add_eos_token and (getattr(self, "eos_token_id", None) is not None)
             if not "input_ids" in tokenized:
                 return tokenized
             if add_bos_ids:
@@ -95,13 +65,13 @@ def __call__(self, *args, **kwargs):
         return tokenized
 
     def encode(self, *args, **kwargs):
-        encoded = self._base_tokenizer.encode(*args, **kwargs)
+        encoded = super().encode(*args, **kwargs)
         if not kwargs.get("add_special_tokens", True):
             return encoded
-        if self._add_bos:
+        if self.add_bos_token:
             if encoded and (getattr(self, "bos_token_id", None) is not None) and encoded[0] != self.bos_token_id:
                 encoded = [self.bos_token_id] + encoded
-        if self._add_eos:
+        if self.add_eos_token:
             if encoded and (getattr(self, "eos_token_id", None) is not None) and encoded[-1] != self.eos_token_id:
                 encoded = encoded + [self.eos_token_id]
         return encoded
diff --git a/nemo_automodel/_transformers/tokenization/registry.py b/nemo_automodel/_transformers/tokenization/registry.py
index db6311117..65ddaf0dc 100644
--- a/nemo_automodel/_transformers/tokenization/registry.py
+++ b/nemo_automodel/_transformers/tokenization/registry.py
@@ -16,7 +16,7 @@
 from dataclasses import dataclass, field
 from typing import Callable, Dict, Type, Union
 
-from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
+from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
 
 logger = logging.getLogger(__name__)
 
@@ -33,7 +33,7 @@ class _TokenizerRegistry:
     model_type_to_tokenizer: Dict[str, Union[Type, Callable]] = field(default_factory=dict)
 
     # Default tokenizer class when no custom implementation is found
-    default_tokenizer_cls: Type = AutoTokenizerWithBosEosEnforced
+    default_tokenizer_cls: Type = NeMoAutoTokenizerWithBosEosEnforced
 
     def register(self, model_type: str, tokenizer_cls: Union[Type, Callable]) -> None:
         """
diff --git a/nemo_automodel/components/datasets/llm/squad.py b/nemo_automodel/components/datasets/llm/squad.py
index 08d59a710..436e07a78 100644
--- a/nemo_automodel/components/datasets/llm/squad.py
+++ b/nemo_automodel/components/datasets/llm/squad.py
@@ -50,8 +50,7 @@ def _formatting_prompts_func_with_chat_template(
     answer = example["answers"]["text"][0].strip()
 
     formatted_text = [
-        {"role": "system", "content": context},
-        {"role": "user", "content": question},
+        {"role": "user", "content": f"Context: {context} Question: {question} Answer: "},
         {"role": "assistant", "content": answer},
     ]
     return format_chat_template(
diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py
index b5d73a77e..5b4222e04 100644
--- a/nemo_automodel/recipes/base_recipe.py
+++ b/nemo_automodel/recipes/base_recipe.py
@@ -30,8 +30,6 @@
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils import PreTrainedTokenizerBase
 
-from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
-from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import AutoTokenizerWithBosEosEnforced
 from nemo_automodel.components.checkpoint.checkpointing import save_config
 from nemo_automodel.components.config.loader import ConfigNode
 from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler
@@ -77,17 +75,7 @@ def is_tokenizer(object):
     Returns:
         bool: returns True if object is a tokenizer or VLM processor.
     """
-    # Note: some NeMo flows wrap HF tokenizers (e.g., BOS/EOS enforcement wrapper). Those
-    # wrappers still implement `save_pretrained()` via delegation and should be checkpointed.
-    return isinstance(
-        object,
-        (
-            PreTrainedTokenizerBase,
-            ProcessorMixin,
-            NeMoAutoTokenizer,
-            AutoTokenizerWithBosEosEnforced,
-        ),
-    )
+    return isinstance(object, (PreTrainedTokenizerBase, ProcessorMixin))
 
 
 def is_lr_scheduler(object):
diff --git a/tests/functional_tests/datasets/llm/test_nemo_autotokenizer.py b/tests/functional_tests/datasets/llm/test_nemo_autotokenizer.py
new file mode 100644
index 000000000..28cbef7d8
--- /dev/null
+++ b/tests/functional_tests/datasets/llm/test_nemo_autotokenizer.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from jinja2.exceptions import TemplateError
+
+from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
+
+GEMMA_TOKENIZER_PATH = "/home/TestData/automodel/tokenizers/gemma-2-9b-it"
+
+
+@pytest.fixture
+def conversation_with_system_role():
+    """A conversation that includes a system role message."""
+    return [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello, how are you?"},
+        {"role": "assistant", "content": "I'm doing well, thank you!"},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+
+@pytest.fixture
+def conversation_with_multiple_system_roles():
+    """A conversation that includes multiple system role messages."""
+    return [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello, how are you?"},
+        {"role": "system", "content": "You are a helpful assistant."},
+    ]
+
+
+@pytest.mark.parametrize("force_hf", [True, False])
+def test_gemma_tokenizer_system_role_handling(force_hf, conversation_with_system_role):
+    """
+    Test that NeMoAutoTokenizer handles system role correctly for Gemma-2.
+
+    - force_hf=True: Returns raw HF tokenizer which raises TemplateError on system role
+    - force_hf=False: Returns NeMoAutoTokenizer wrapper which maps system->assistant
+    """
+    tokenizer = NeMoAutoTokenizer.from_pretrained(GEMMA_TOKENIZER_PATH, force_hf=force_hf)
+
+    if force_hf:
+        assert not isinstance(tokenizer, NeMoAutoTokenizer)
+        # Raw HF tokenizer should raise TemplateError for system role
+        with pytest.raises(TemplateError, match="System role not supported"):
+            tokenizer.apply_chat_template(
+                conversation_with_system_role,
+                tokenize=True,
+                add_generation_prompt=True,
+            )
+    else:
+        assert isinstance(tokenizer, NeMoAutoTokenizer)
+        # NeMoAutoTokenizer should handle system role gracefully (maps to assistant, then drops)
+        result = tokenizer.apply_chat_template(
+            conversation_with_system_role,
+            tokenize=True,
+            add_generation_prompt=True,
+        )
+        assert result is not None
+        assert isinstance(result, list)
+        assert len(result) > 0
+
+
+@pytest.mark.parametrize("force_hf", [True, False])
+def test_gemma_tokenizer_system_role_handling_with_multiple_system_roles(force_hf, conversation_with_multiple_system_roles):
+    """
+    Test that NeMoAutoTokenizer handles system role correctly for Gemma-2.
+
+    - force_hf=True: Returns raw HF tokenizer which raises TemplateError on system role
+    - force_hf=False: Returns NeMoAutoTokenizer wrapper which maps system->assistant
+    """
+    tokenizer = NeMoAutoTokenizer.from_pretrained(GEMMA_TOKENIZER_PATH, force_hf=force_hf)
+
+    if force_hf:
+        assert not isinstance(tokenizer, NeMoAutoTokenizer)
+        # Raw HF tokenizer should raise TemplateError for system role
+        with pytest.raises(TemplateError, match="System role not supported"):
+            tokenizer.apply_chat_template(conversation_with_multiple_system_roles, tokenize=True, add_generation_prompt=True)
+    else:
+        assert isinstance(tokenizer, NeMoAutoTokenizer)
+        with pytest.raises(ValueError, match="System role appeared in multiple messages."):
+            tokenizer.apply_chat_template(conversation_with_multiple_system_roles, tokenize=True, add_generation_prompt=True)
\ No newline at end of file
diff --git a/tests/functional_tests/hf_transformer/test_formatting_utils_options.py b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
index 8327b0c17..9627309b4 100644
--- a/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
+++ b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
@@ -96,14 +96,14 @@ def test_format_prompt_completion_options(seq_length, padding, truncation):
 
     # Attention mask should have zeros only in padded tail (if any)
     if isinstance(seq_length, int):
-        # From the end, once we see a 0, the rest must be 0
-        seen_zero = False
+        # From the end, once we see a non-zero, no zeros should appear (right padding)
+        seen_nonzero = False
         for v in reversed(out["attention_mask"]):
-            if v == 0:
-                seen_zero = True
+            if v != 0:
+                seen_nonzero = True
             else:
-                if seen_zero:
-                    pytest.fail("Non-zero attention_mask value after padded zeros.")
+                if seen_nonzero:
+                    pytest.fail("Zero attention_mask value before non-padded tokens (padding not only in tail).")
 
 
 @pytest.mark.parametrize(
@@ -170,12 +170,13 @@ def test_format_chat_template_options(seq_length, padding, truncation):
 
     # Attention mask padded tail zeros, if padded
     if isinstance(seq_length, int) and truncation == False:
-        seen_zero = False
+        # From the end, once we see a non-zero, no zeros should appear (right padding)
+        seen_nonzero = False
         for v in reversed(out["attention_mask"]):
-            if v == 0:
-                seen_zero = True
+            if v != 0:
+                seen_nonzero = True
             else:
-                if seen_zero:
-                    pytest.fail("Non-zero attention_mask value after padded zeros.")
+                if seen_nonzero:
+                    pytest.fail("Zero attention_mask value before non-padded tokens (padding not only in tail).")
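
A minimal usage sketch of the tokenizer dispatch introduced by this patch (not part of the diff itself; the model path below is a placeholder):

    from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer

    # Default dispatch: use a registered custom tokenizer for the model type if one
    # exists, otherwise fall back to the BOS/EOS-enforced wrapper.
    tok = NeMoAutoTokenizer.from_pretrained("<model-id-or-path>")
    ids = tok.encode("hello world")  # BOS/EOS ids are enforced when the tokenizer defines them

    # force_hf=True skips all wrapping and returns the raw HF AutoTokenizer.
    raw = NeMoAutoTokenizer.from_pretrained("<model-id-or-path>", force_hf=True)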