Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions docker/common/uv-pytorch.lock

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions examples/vlm_finetune/nemotron/nemotron_parse_v1_1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Example fine-tuning recipe for NVIDIA Nemotron-Parse v1.1 on the CORD-v2 dataset.

step_scheduler:
  global_batch_size: 16
  local_batch_size: 2
  ckpt_every_steps: 100
  val_every_steps: 100 # will run every x number of gradient steps
  max_steps: 1000

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  # Use the same hub ID as the processor below so the example runs on any
  # machine. To train from a local checkpoint instead, point this at the
  # checkpoint directory and add `local_files_only: True`.
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-Parse-v1.1
  torch_dtype: torch.bfloat16
  use_liger_kernel: false
  attn_implementation: eager
  trust_remote_code: True

processor:
  _target_: transformers.AutoProcessor.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-Parse-v1.1
  trust_remote_code: True

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/
  model_save_format: safetensors
  save_consolidated: true

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: train

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  pin_memory: true
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.nemotron_parse_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: validation

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader

optimizer:
  _target_: torch.optim.AdamW
  lr: 1e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# wandb:
# project: <your_project_name>
# entity: <your_entity_name>
# name: <your_experiment_name>
83 changes: 83 additions & 0 deletions nemo_automodel/components/datasets/vlm/collate_fns.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,88 @@ def has_data(modality_list):
return batch


def nemotron_parse_collate_fn(
    examples: Sequence[Dict[str, Any]],
    processor,
    task_prompt: str = "</s><s><predict_bbox><predict_classes><output_markdown>",
) -> Dict[str, torch.Tensor]:
    """
    Collate function for NVIDIA Nemotron-Parse models.

    The Nemotron-Parse processor does not expose a chat template, so the
    training text is built manually as ``task_prompt + assistant answer``,
    while image preprocessing stays with the processor. Labels are produced
    by ``build_labels`` (presumably masking the prompt tokens -- confirm
    against that helper).

    Args:
        examples: Samples that each carry a ``"conversation"`` list of message
            dicts with ``"role"`` and ``"content"`` entries. The first image
            item found in a user message and the first non-empty assistant
            text are used; an example without an image contributes ``None``.
        processor: Callable invoked as
            ``processor(images=..., text=..., padding=True, return_tensors="pt")``.
            Its ``tokenizer`` attribute (or the processor itself when that
            attribute is absent) supplies the special token ids.
        task_prompt: Prefix prepended to every assistant answer.

    Returns:
        The processor output extended with ``labels``, ``decoder_input_ids``
        and ``decoder_attention_mask``. ``labels`` drops its first position,
        while ``decoder_input_ids`` and every tensor shaped like ``input_ids``
        drop their last position, so all of them end up one step shorter than
        the padded input.

    Raises:
        ValueError: If the tokenizer provides no ``pad_token_id``, or none of
            ``decoder_start_token_id`` / ``bos_token_id`` / ``eos_token_id``.
    """
    conversations = [example["conversation"] for example in examples]

    images: List[Any] = []
    targets: List[str] = []
    for conversation in conversations:
        image = None
        assistant_text = ""

        for message in conversation:
            role = message.get("role")
            content = message.get("content")

            if role == "user":
                if isinstance(content, list):
                    for item in content:
                        if isinstance(item, dict) and item.get("type") == "image":
                            image = item.get("image")
                            break
            elif role == "assistant" and not assistant_text:
                assistant_text = _extract_assistant_text(message)

            # Stop scanning once both pieces of the sample are known.
            if image is not None and assistant_text:
                break

        images.append(image)
        targets.append(assistant_text)

    texts = [f"{task_prompt}{target}" for target in targets]

    batch = processor(images=images, text=texts, padding=True, return_tensors="pt")

    if "pixel_values" in batch:
        batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16)

    labels = build_labels(
        batch["input_ids"],
        conversations,
        processor,
    )

    # Drop the first label so labels[t] lines up with decoder_input_ids[t] below.
    batch["labels"] = labels[:, 1:]

    tokenizer = getattr(processor, "tokenizer", processor)
    pad_token_id = getattr(tokenizer, "pad_token_id", None)

    # Resolve the start token with explicit ``is not None`` checks: token id 0
    # is valid, and a truthiness test (``a or b``) would treat it as missing.
    decoder_start_token_id = None
    for attr_name in ("decoder_start_token_id", "bos_token_id", "eos_token_id"):
        candidate = getattr(tokenizer, attr_name, None)
        if candidate is not None:
            decoder_start_token_id = candidate
            break

    if pad_token_id is None or decoder_start_token_id is None:
        raise ValueError("Nemotron-Parse collate_fn requires pad_token_id and decoder_start_token_id.")

    # Shift right: position 0 holds the start token, the rest is input_ids[:-1].
    decoder_input_ids = batch["input_ids"].clone()
    decoder_input_ids[:, 0] = decoder_start_token_id
    decoder_input_ids[:, 1:] = batch["input_ids"][:, :-1]

    decoder_attention_mask = (decoder_input_ids != pad_token_id).long()

    # NOTE(review): slicing off column 0 removes the start token written above,
    # so the decoder inputs equal input_ids[:, :-1] -- confirm this is intended.
    batch["decoder_input_ids"] = decoder_input_ids[:, 1:]
    batch["decoder_attention_mask"] = decoder_attention_mask[:, 1:]

    # Trim every tensor that still has the full padded shape (e.g. input_ids)
    # so its length matches the shifted labels / decoder tensors. The shape is
    # captured first because input_ids itself is replaced inside the loop.
    input_shape = batch["input_ids"].shape
    for key, value in list(batch.items()):
        if isinstance(value, torch.Tensor) and value.shape == input_shape:
            batch[key] = value[:, :-1]

    return batch


def default_collate_fn(
examples: Sequence[Dict[str, Any]],
processor,
Expand Down Expand Up @@ -297,5 +379,6 @@ def default_collate_fn(
# Registry of collate functions keyed by name, with "default" as the generic
# entry. Presumably the keys are processor class names used to select the
# matching collate function -- verify against the lookup site.
COLLATE_FNS = {
"Qwen2_5_VLProcessor": qwen2_5_collate_fn,
"Qwen3OmniMoeProcessor": qwen3_omni_collate_fn,
"NemotronParseProcessor": nemotron_parse_collate_fn,
"default": default_collate_fn,
}
Loading
Loading