
Commit 07195df

Merge branch 'main' into akoumparouli/feat_backport_devstral_to_v4
2 parents a337a4d + b7031bb commit 07195df

File tree

10 files changed: +242 -173 lines changed

.github/CODEOWNERS

Lines changed: 4 additions & 3 deletions
@@ -2,6 +2,7 @@
 docker/ @nvidia-nemo/automation
 pyproject.toml @nvidia-nemo/automation
 
-nemo_automodel @akoumpa @HuiyingLi @adil-a @hemildesai @ybabakhin @shan-nvidia
-examples @akoumpa @HuiyingLi @adil-a @hemildesai @ybabakhin @shan-nvidia
-README.md @akoumpa @HuiyingLi
+docs @akoumpa @jgerh
+nemo_automodel @akoumpa @HuiyingLi @adil-a @hemildesai @ybabakhin @shan-nvidia @rnyak @oliverholworthy @gabrielspmoreira
+examples @akoumpa @HuiyingLi @adil-a @hemildesai @ybabakhin @shan-nvidia @rnyak @oliverholworthy @gabrielspmoreira
+README.md @akoumpa @HuiyingLi @snowmanwwg

.github/workflows/cicd-main.yml

Lines changed: 4 additions & 3 deletions
@@ -45,7 +45,7 @@ jobs:
       - name: Set up UV
         uses: astral-sh/setup-uv@v1
         with:
-          version: 0.7.2
+          version: 0.8.22
       - name: Install ruff
         env:
           UV_PROJECT_ENVIRONMENT: ./venv
@@ -60,8 +60,9 @@ jobs:
       - name: Run ruff
         run: |
           source ./venv/bin/activate
-          uv run ruff check . --verbose
-          uv run ruff format --check . --verbose
+          uv run --active ruff --version
+          uv run --active ruff check --verbose .
+          uv run --active ruff format --check --verbose .
 
   import_linting:
     runs-on: ubuntu-latest

docs/guides/dataset-overview.md

Lines changed: 136 additions & 11 deletions
@@ -1,6 +1,6 @@
 # Dataset Overview: LLM and VLM Datasets in NeMo Automodel
 
-This page summarizes the datasets already supported in NeMo Automodel for LLM and VLM, and shows how to plug in your own datasets using simple Python functions or directly through YAML using the `_target_` mechanism.
+This page summarizes the datasets supported in NeMo Automodel for LLM and VLM and shows how to plug in your own datasets using Python functions or the YAML `_target_` mechanism.
 
 - See also: [LLM datasets](llm/dataset.md) and [VLM datasets](vlm/dataset.md) for deeper, task-specific guides.
 
@@ -23,7 +23,7 @@ dataset:
   split: train
 ```
 
-- **SQuAD-style QA (instruction SFT)**
+- **SQuAD-style Question Answering (QA) (instruction SFT)**
   - Factory: `nemo_automodel.components.datasets.llm.squad.make_squad_dataset`
   - Use case: instruction/QA tuning with either prompt+answer formatting or chat-template formatting
   - Notes:
@@ -57,7 +57,133 @@ dataset:
   answer_only_loss_mask: true
   start_of_turn_token: "<|assistant|>"
 ```
-- See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information.
+  See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information.
+
+- **ChatDataset (multi-turn conversations and tool calling)**
+  - Class: `nemo_automodel.components.datasets.llm.ChatDataset`
+  - Use case: multi-turn conversations and tool calling in OpenAI chat format
+  - Sources: local JSON/JSONL or Hugging Face Hub dataset ID
+  - Key args:
+    - `path_or_dataset_id`: path to local file(s) or Hugging Face dataset ID
+    - `tokenizer`: tokenizer instance (required; must have chat-template support)
+    - `split`: dataset split (e.g., "train", "validation")
+    - `name`: dataset configuration/subset name
+    - `seq_length`: maximum sequence length for padding/truncation
+    - `padding`: padding strategy ("do_not_pad", "max_length", etc.)
+    - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.)
+    - `start_of_turn_token`: token marking the start of the assistant response (for answer-only loss)
+    - `chat_template`: optional override for the tokenizer's chat template
+  - Notes:
+    - Requires a tokenizer with chat-template support
+    - Supports both single-turn and multi-turn tool calling
+    - Tool definitions are provided in a `tools` field at the conversation level
+    - Tool calls appear in assistant messages via the `tool_calls` field
+    - Tool responses use the `tool` role
+  - Example YAML:
+```yaml
+dataset:
+  _target_: nemo_automodel.components.datasets.llm.ChatDataset
+  path_or_dataset_id: Salesforce/xlam-function-calling-60k
+  split: train
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: google/functiongemma-270m-it
+  seq_length: 2048
+  start_of_turn_token: "<start_of_turn>"
+```
+  - Expected data format (OpenAI messages format):
+```json
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "What's the weather in Seattle?"
+    },
+    {
+      "role": "assistant",
+      "content": "",
+      "tool_calls": [
+        {
+          "id": "call_1",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": "{\"city\": \"Seattle\"}"
+          }
+        }
+      ]
+    },
+    {
+      "role": "tool",
+      "tool_call_id": "call_1",
+      "content": "{\"temperature\": 65, \"condition\": \"cloudy\"}"
+    },
+    {
+      "role": "assistant",
+      "content": "It's 65°F and cloudy in Seattle."
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get current weather for a city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {"type": "string"}
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ]
+}
+```
+  - For single-turn tool calling (one tool call per conversation), omit the tool response and final assistant message:
+```json
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "Book a table for two at 7pm in Seattle."
+    },
+    {
+      "role": "assistant",
+      "content": "",
+      "tool_calls": [
+        {
+          "id": "call_1",
+          "type": "function",
+          "function": {
+            "name": "book_table",
+            "arguments": "{\"party_size\": 2, \"time\": \"19:00\", \"city\": \"Seattle\"}"
+          }
+        }
+      ]
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "book_table",
+        "description": "Book a restaurant table",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "party_size": {"type": "integer"},
+            "time": {"type": "string"},
+            "city": {"type": "string"}
+          }
+        }
+      }
+    }
+  ]
+}
+```
+  See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example with FunctionGemma.
 
 - **NanoGPT Binary Shards (pretraining)**
   - Class: `nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset`
@@ -69,7 +195,7 @@ dataset:
 - **Megatron (pretraining; interoperable with pre-tokenized Megatron data)**
   - Class: `nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining`
   - Use case: large-scale LM pretraining over Megatron-LM formatted tokenized corpora
-  - Interoperability: if your corpus has already been tokenized/indexed for Megatron (i.e., `.bin`/`.idx` pairs), you can point Automodel to those assets directly; no re-tokenization required
+  - Interoperability: If your corpus has already been tokenized/indexed for Megatron (i.e., `.bin`/`.idx` pairs), you can point Automodel to those assets directly. No re-tokenization required.
   - Key args: `paths` (single path, glob, weighted list, or per-split dict), `seq_length`, `tokenizer`, `split`, `index_mapping_dir`, `splits_to_build`
   - Example YAML:
 ```yaml
@@ -84,9 +210,7 @@ dataset:
   split: "0.99, 0.01, 0.00" # train, validation, test
   splits_to_build: "train"
 ```
-- See the detailed pretraining guide, [Megatron Core Dataset Pretraining](llm/pretraining.md), which uses MegatronPretraining data.
-
-> ⚠️ Note: Multi-turn conversational and tool-calling/function-calling dataset support is coming soon.
+  See the detailed [pretraining guide](llm/pretraining.md), which uses MegatronPretraining data.
 
 ## Packed Sequence Support
 To reduce padding and improve throughput with variable-length sequences:
@@ -111,9 +235,10 @@ VLM datasets are represented as conversations (message lists) that combine text
 
 Built-in dataset makers (return lists of `conversation` dicts):
 - **RDR items**: `nemo_automodel.components.datasets.vlm.datasets.make_rdr_dataset` (HF: `quintend/rdr-items`)
-- **CORD-V2 receipts**: `nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset` (HF: `naver-clova-ix/cord-v2`)
-- **MedPix-VQA (medical)**: `nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset`
-- **CommonVoice 17 (audio)**: `nemo_automodel.components.datasets.vlm.datasets.make_cv17_dataset`
+- **CORD-V2 receipts (Consolidated Receipt Dataset for Post-OCR Parsing)**: `nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset` (HF: `naver-clova-ix/cord-v2`)
+- **MedPix-VQA (Medical Pixel Question Answering)**: `nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset`
+- **CommonVoice 17 (CV17) (audio)**: `nemo_automodel.components.datasets.vlm.datasets.make_cv17_dataset`
+
 
 Each example follows the conversation schema expected by `apply_chat_template`, e.g.:
 ```python
@@ -188,7 +313,7 @@ dataset:
 Where `build_my_dataset` returns either a `datasets.Dataset` or a list/iterator of conversation dicts (for VLM).
 
 ### 3) Use ColumnMappedTextInstructionDataset for most instruction datasets (LLM)
-- Ideal when your data has columns like `instruction`, `input`, `output` but with arbitrary names
+- Ideal when your data has columns like `instruction`, `input`, or `output` but with arbitrary names
 - Supports local JSON/JSONL and HF Hub
 ```yaml
 dataset:
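
For readers tracking the `ChatDataset` addition in `docs/guides/dataset-overview.md` above, here is a minimal Python sketch mirroring the YAML example from the diff. It is illustrative only: the argument names come from the documented key args, and anything beyond them (defaults, return type) is an assumption rather than a confirmed API contract.

```python
# Hypothetical usage sketch for the ChatDataset entry documented above.
# Arguments mirror the YAML example; defaults and return types are assumptions.
from transformers import AutoTokenizer

from nemo_automodel.components.datasets.llm import ChatDataset

# The tokenizer must ship a chat template (per the Notes in the docs above).
tokenizer = AutoTokenizer.from_pretrained("google/functiongemma-270m-it")

dataset = ChatDataset(
    path_or_dataset_id="Salesforce/xlam-function-calling-60k",  # or a local JSON/JSONL path
    tokenizer=tokenizer,
    split="train",
    seq_length=2048,
    start_of_turn_token="<start_of_turn>",  # enables answer-only loss masking
)
```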

examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ rng:
   ranked: true
 
 model:
-  _target_: nemo_automodel.components.models.llama.model.build_llama_model
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
   pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
   torch_dtype: bf16
 
@@ -87,4 +87,4 @@ optimizer:
 
 lr_scheduler:
   lr_decay_style: cosine
-  min_lr: 1.0e-6
+  min_lr: 1.0e-6

examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ rng:
   ranked: true
 
 model:
-  _target_: nemo_automodel.components.models.llama.model.build_llama_model
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
   pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
   torch_dtype: bf16
 
@@ -87,4 +87,4 @@ optimizer:
 
 lr_scheduler:
   lr_decay_style: cosine
-  min_lr: 1.0e-6
+  min_lr: 1.0e-6

nemo_automodel/_transformers/auto_model.py

Lines changed: 47 additions & 10 deletions
@@ -18,6 +18,7 @@
 import logging
 import os
 import types
+from contextlib import contextmanager
 from typing import List, Optional, Union
 
 import torch
@@ -36,10 +37,7 @@
 import nemo_automodel.components.distributed.utils as dist_utils
 from nemo_automodel import __version__
 from nemo_automodel._transformers.registry import ModelRegistry
-from nemo_automodel.components.distributed.init_utils import (
-    get_local_world_size_preinit,
-    get_world_size_safe,
-)
+from nemo_automodel.components.distributed.init_utils import get_local_world_size_preinit, get_world_size_safe
 from nemo_automodel.components.utils.model_utils import resolve_trust_remote_code
 from nemo_automodel.shared.import_utils import safe_import
 from nemo_automodel.shared.utils import dtype_from_str
@@ -49,6 +47,33 @@
 logger = logging.getLogger(__name__)
 
 
+@contextmanager
+def local_torch_dtype(
+    dtype: torch.dtype, model_class_name: str | None = None, default_dtype: torch.dtype = torch.bfloat16
+):
+    """
+    Locally change the torch default dtype to `dtype`, and restore the old one upon exiting the context.
+    If `model_class_name` is provided, it is used to give a more helpful error message if `dtype` is not valid.
+    """
+    # Raise a more helpful error before `torch.set_default_dtype`, which would crash in this case
+    if isinstance(dtype, str):
+        dtype = default_dtype
+    if not dtype.is_floating_point:
+        if model_class_name is not None:
+            error_message = (
+                f"{model_class_name} cannot be instantiated under `dtype={dtype}` as it's not a floating-point dtype"
+            )
+        else:
+            error_message = f"Cannot set `{dtype}` as torch's default as it's not a floating-point dtype"
+        raise ValueError(error_message)
+    original_dtype = torch.get_default_dtype()
+    try:
+        torch.set_default_dtype(dtype)
+        yield
+    finally:
+        torch.set_default_dtype(original_dtype)
+
+
 def _assert_same_signature(original, patched):
     """
     Raise AssertionError if the two call signatures differ.
@@ -157,15 +182,17 @@ def _get_next_fallback_attn(attn_implementation: str) -> str:
     return priorities[0]
 
 
-def _prepare_hf_config_and_flag(pretrained_model_name_or_path, force_hf, kwargs):
+def _prepare_hf_config_and_flag(pretrained_model_name_or_path, force_hf, kwargs, attn_implementation):
     """
     Resolve trust_remote_code default, fetch HF config and determine if model is HF-based.
     """
     kwargs["trust_remote_code"] = kwargs.get(
         "trust_remote_code", resolve_trust_remote_code(pretrained_model_name_or_path)
    )
     hf_config = kwargs.pop("config", None) or AutoConfig.from_pretrained(
-        pretrained_model_name_or_path, trust_remote_code=kwargs["trust_remote_code"]
+        pretrained_model_name_or_path,
+        **kwargs,
+        attn_implementation=attn_implementation,
     )
     architectures = getattr(hf_config, "architectures", None) or []
     is_hf_model = (not architectures or architectures[0] not in ModelRegistry.model_arch_name_to_cls) or force_hf
@@ -358,7 +385,9 @@ def from_pretrained(
         `use_liger_kernel=False` or `use_sdpa_patching=False`
     """
     torch_dtype = dtype_from_str(torch_dtype) if torch_dtype != "auto" else torch_dtype
-    hf_config, is_hf_model = _prepare_hf_config_and_flag(pretrained_model_name_or_path, force_hf, kwargs)
+    hf_config, is_hf_model = _prepare_hf_config_and_flag(
+        pretrained_model_name_or_path, force_hf, kwargs, attn_implementation=attn_implementation
+    )
     tp_size, cp_size, has_packed_sequence = _pop_tp_cp_has_packed(kwargs)
     attn_implementation, use_liger_kernel = _apply_preload_overrides(
         is_hf_model, tp_size, cp_size, has_packed_sequence, attn_implementation, use_liger_kernel
@@ -400,7 +429,10 @@ def _retry(**override):
         _download_model_weights(hf_config, pretrained_model_name_or_path)
         logger.info(f"Using custom model implementation for {architectures[0]}")
         kwargs.pop("trust_remote_code", None)
-        return ModelRegistry.model_arch_name_to_cls[architectures[0]](hf_config, *model_args, **kwargs)
+        # TODO(@akoumpa): restore weights after initialization.
+        model_cls = ModelRegistry.model_arch_name_to_cls[architectures[0]]
+        with local_torch_dtype(torch_dtype, model_cls.__name__):
+            return model_cls(hf_config)
 
     # 3. fallback to parent class
     model = None
@@ -533,7 +565,11 @@ def _retry(**override):
 
     # handle model_id passed as config
     if isinstance(config, str):
-        config = AutoConfig.from_pretrained(config, trust_remote_code=kwargs.get("trust_remote_code", False))
+        config = AutoConfig.from_pretrained(
+            config,
+            trust_remote_code=kwargs.get("trust_remote_code", False),
+            attn_implementation=attn_implementation,
+        )
     # 1. if force_hf is True, we will use the parent class to load and return the model as is
     if force_hf:
         return cls._from_config_parent_class(
@@ -547,7 +583,8 @@ def _retry(**override):
     # 2. If we have a custom model implementation available, we prioritize that over HF
     architectures = get_architectures(config)
     if len(architectures) > 0 and architectures[0] in ModelRegistry.model_arch_name_to_cls:
-        return ModelRegistry.model_arch_name_to_cls[architectures[0]](config, *model_args, **kwargs)
+        with local_torch_dtype(torch_dtype, ModelRegistry.model_arch_name_to_cls[architectures[0]].__name__):
+            return ModelRegistry.model_arch_name_to_cls[architectures[0]](config)
 
     # 3. fallback to parent class
     model = None
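
The new `local_torch_dtype` context manager is small enough to demonstrate in isolation. Below is a self-contained sketch of the same pattern using only the public torch API; it keeps just the dtype swap and restore, dropping the string fallback and model-class error message of the actual helper.

```python
from contextlib import contextmanager

import torch


@contextmanager
def local_torch_dtype(dtype: torch.dtype):
    # Same core behavior as the helper added in this commit: swap torch's
    # global default dtype for the duration of the block, then restore it
    # even if the body raises.
    original_dtype = torch.get_default_dtype()
    try:
        torch.set_default_dtype(dtype)
        yield
    finally:
        torch.set_default_dtype(original_dtype)


# Modules constructed inside the block allocate their parameters in bf16
# without an explicit dtype argument, which is why from_pretrained/from_config
# wrap the custom-model constructors with it.
with local_torch_dtype(torch.bfloat16):
    layer = torch.nn.Linear(4, 4)

assert layer.weight.dtype == torch.bfloat16
assert torch.get_default_dtype() == torch.float32  # assuming the usual float32 default
```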

nemo_automodel/components/models/llama/__init__.py

Lines changed: 0 additions & 6 deletions
@@ -11,9 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-"""Custom Llama model implementation for NeMo Automodel."""
-
-from nemo_automodel.components.models.llama.model import LlamaForCausalLM, build_llama_model
-
-__all__ = ["LlamaForCausalLM", "build_llama_model"]
