Skip to content

Commit ff022b2

Browse files
authored
feat: support nemotron_parse vlm (#999)
Signed-off-by: HuiyingLi <willwin.lee@gmail.com> Signed-off-by: HuiyingLi <HuiyingLi@users.noreply.github.com> Co-authored-by: HuiyingLi <HuiyingLi@users.noreply.github.com>
1 parent 51b5f4e commit ff022b2

File tree

8 files changed

+1359
-0
lines changed

8 files changed

+1359
-0
lines changed

docker/common/uv-pytorch.lock

Lines changed: 214 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,96 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# Fine-tuning recipe for NVIDIA Nemotron-Parse v1.1 on the CORD-v2 dataset.

step_scheduler:
  global_batch_size: 16
  local_batch_size: 2
  ckpt_every_steps: 100
  val_every_steps: 100  # will run every x number of gradient steps
  max_steps: 1000

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  # Use the public hub id (same one the processor section uses) rather than a
  # user-specific filesystem path, so the recipe is runnable on any machine.
  # To train from a local copy, replace this with the directory path.
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-Parse-v1.1
  torch_dtype: torch.bfloat16
  use_liger_kernel: false
  attn_implementation: eager
  trust_remote_code: True
  # Set to true only when the weights are already cached locally or the path
  # above is a local directory; otherwise loading fails without a download.
  local_files_only: false

processor:
  _target_: transformers.AutoProcessor.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-Parse-v1.1
  trust_remote_code: True

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/
  model_save_format: safetensors
  save_consolidated: true

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: train

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  pin_memory: true
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.nemotron_parse_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: validation

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader

optimizer:
  _target_: torch.optim.AdamW
  lr: 1e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# wandb:
#   project: <your_project_name>
#   entity: <your_entity_name>
#   name: <your_experiment_name>

nemo_automodel/components/datasets/vlm/collate_fns.py

Lines changed: 83 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -253,6 +253,88 @@ def has_data(modality_list):
253253
return batch
254254

255255

256+
def nemotron_parse_collate_fn(
    examples: Sequence[Dict[str, Any]],
    processor,
    task_prompt: str = "</s><s><predict_bbox><predict_classes><output_markdown>",
) -> Dict[str, torch.Tensor]:
    """
    Collate function for NVIDIA Nemotron-Parse models.

    The Nemotron-Parse processor does not expose a chat template, so we build the
    prompt + answer string manually and keep the image preprocessing handled by
    the processor. Labels come from ``build_labels`` (presumably where prompt
    tokens are masked -- confirm against that helper).

    Args:
        examples: Samples, each carrying a ``"conversation"`` list of message
            dicts with ``"role"`` and ``"content"`` entries.
        processor: Callable invoked as
            ``processor(images=..., text=..., padding=True, return_tensors="pt")``.
            Its ``tokenizer`` attribute (or the processor itself when that
            attribute is absent) supplies the special token ids.
        task_prompt: Prefix prepended to every assistant answer.

    Returns:
        The processor output extended with ``labels``, ``decoder_input_ids`` and
        ``decoder_attention_mask``. Those three, and every tensor that had the
        same shape as ``input_ids``, are one position shorter than the
        processor's ``input_ids`` along dim 1.

    Raises:
        ValueError: If the tokenizer has no ``pad_token_id``, or none of
            ``decoder_start_token_id`` / ``bos_token_id`` / ``eos_token_id``.
    """
    conversations = [example["conversation"] for example in examples]

    images: List[Any] = []
    targets: List[str] = []
    for conversation in conversations:
        image = None
        assistant_text = ""

        for message in conversation:
            role = message.get("role")
            content = message.get("content")

            if role == "user":
                if isinstance(content, list):
                    # Take the first image item of this user message.
                    for item in content:
                        if isinstance(item, dict) and item.get("type") == "image":
                            image = item.get("image")
                            break
            elif role == "assistant" and not assistant_text:
                # Only the first non-empty assistant answer is used as target.
                assistant_text = _extract_assistant_text(message)

            # Stop scanning once both halves of the pair are known.
            if image is not None and assistant_text:
                break

        images.append(image)
        targets.append(assistant_text)

    texts = [f"{task_prompt}{target}" for target in targets]

    batch = processor(images=images, text=texts, padding=True, return_tensors="pt")

    if "pixel_values" in batch:
        batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16)

    labels = build_labels(
        batch["input_ids"],
        conversations,
        processor,
    )

    # Drop the first position so labels line up with the truncated decoder inputs.
    batch["labels"] = labels[:, 1:]

    tokenizer = getattr(processor, "tokenizer", processor)
    pad_token_id = getattr(tokenizer, "pad_token_id", None)

    # Resolve the start token with explicit ``is None`` checks: token id 0 is a
    # valid id and must not be skipped the way a truthiness-based ``or`` would.
    decoder_start_token_id = None
    for attr_name in ("decoder_start_token_id", "bos_token_id", "eos_token_id"):
        decoder_start_token_id = getattr(tokenizer, attr_name, None)
        if decoder_start_token_id is not None:
            break

    if pad_token_id is None or decoder_start_token_id is None:
        raise ValueError("Nemotron-Parse collate_fn requires pad_token_id and decoder_start_token_id.")

    # Shift right by one with the start token in column 0 ...
    decoder_input_ids = batch["input_ids"].clone()
    decoder_input_ids[:, 0] = decoder_start_token_id
    decoder_input_ids[:, 1:] = batch["input_ids"][:, :-1]

    decoder_attention_mask = (decoder_input_ids != pad_token_id).long()

    # ... then drop column 0 again, so the decoder inputs end up equal to
    # input_ids[:, :-1] (the start token itself is not part of the output).
    batch["decoder_input_ids"] = decoder_input_ids[:, 1:]
    batch["decoder_attention_mask"] = decoder_attention_mask[:, 1:]

    # Trim every remaining full-length tensor (e.g. input_ids and anything else
    # shaped like it) by one position to match the tensors set above.
    input_shape = batch["input_ids"].shape
    for key, value in list(batch.items()):
        if isinstance(value, torch.Tensor) and value.shape == input_shape:
            batch[key] = value[:, :-1]

    return batch
336+
337+
256338
def default_collate_fn(
257339
examples: Sequence[Dict[str, Any]],
258340
processor,
@@ -297,5 +379,6 @@ def default_collate_fn(
297379
# Registry mapping a string key to the collate function to use, plus a
# "default" entry.
# NOTE(review): the keys look like processor class names -- confirm against the
# code that performs the lookup.
COLLATE_FNS = {
    "Qwen2_5_VLProcessor": qwen2_5_collate_fn,
    "Qwen3OmniMoeProcessor": qwen3_omni_collate_fn,
    "NemotronParseProcessor": nemotron_parse_collate_fn,
    "default": default_collate_fn,
}

0 commit comments

Comments
 (0)