feat: Omni dataloader for HF models #1639
base: main
Changes from 14 commits
New file (diff `@@ -0,0 +1,14 @@`):

```yaml
defaults:
  - sft_vlm_3B.yaml

policy:
  tokenizer:
    video:
      num_frames: 16

data:
  dataset_name: GeneralConversationsJsonlDataset
  train_data_path: /lustre/fsw/portfolios/llmservice/users/yuanhangs/codes/megatron-lm-omcat/megatron-lm-vlm2/examples/multimodal/avlm/test/datasets/miradata_bat1_filtered_vision_5min_10000.jsonl
  val_data_path: /lustre/fsw/portfolios/llmservice/users/yuanhangs/codes/megatron-lm-omcat/megatron-lm-vlm2/examples/multimodal/avlm/test/datasets/miradata_bat1_filtered_vision_5min_100.jsonl
  train_media_data_dir: /lustre/fsw/portfolios/edgeai/projects/edgeai_riva_rivamlops/data/videomme/MiraData/video/batch1/5min
  val_media_data_dir: /lustre/fsw/portfolios/edgeai/projects/edgeai_riva_rivamlops/data/videomme/MiraData/video/batch1/5min
```
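For context, a record in the conversations jsonl file referenced above plausibly looks like the following. This is a hedged sketch inferred from the sender names (`human`/`gpt`) and media tags handled by the loader code in this PR, not the authoritative schema:

```python
import json

# A hypothetical single line of the conversations jsonl file.
# Field names are inferred from the loader code in this PR, not from docs.
record_line = json.dumps({
    "video": "clip_0001.mp4",  # resolved relative to train_media_data_dir
    "conversations": [
        {"from": "human", "value": "<video>\nDescribe what happens in the clip."},
        {"from": "gpt", "value": "A person walks along a beach at sunset."},
    ],
})

record = json.loads(record_line)

# The PR maps raw sender names onto the allowed chat roles:
sender_mapping = {"human": "user", "gpt": "assistant", "agent": "assistant"}
roles = [sender_mapping[m["from"]] for m in record["conversations"]]
print(roles)  # ['user', 'assistant']
```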
New file (diff `@@ -0,0 +1,131 @@`):

```python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import io
import copy
import warnings
import dataclasses
from PIL import Image
from pathlib import Path
from collections import defaultdict
from typing import Any, Dict, Callable, Optional

from nemo_rl.data import multimodal_utils


# map the senders from the sample to the allowed ones
conversation_sender_mapping_sample_to_allowed = {
    'human': 'user',
    'gpt': 'assistant',
    'agent': 'assistant',
}


def convert_metadata(metadata: Dict[str, Any], return_inplace=False):
    data = metadata
    if not return_inplace:
        data = metadata.copy()

    for tag in multimodal_utils.media_tags_to_allowed:
        if tag in data:
            tag_mapped = multimodal_utils.media_tags_to_allowed[tag]
            if tag_mapped not in data:
                data[tag_mapped] = data[tag]
                del data[tag]
            else:
                warnings.warn(
                    f"Trying to map {tag} to {tag_mapped}, but {tag_mapped} already exists in the raw data. Mapping is not carried out."
                )

    for idx, message in enumerate(data["conversations"]):
        msg_str = message["value"]
        for tag in multimodal_utils.media_tags_to_allowed:
            tag_str = '<' + tag + '>'
            if tag_str in msg_str:
                tag_str_mapped = multimodal_utils.media_tags[
                    multimodal_utils.media_tags_to_allowed[tag]
                ]
                msg_str = msg_str.replace(tag_str, tag_str_mapped)
        message["value"] = msg_str
        data["conversations"][idx] = message

    if not return_inplace:
        return data


def conversation_process_message(
    metadata: Dict[str, Any],
    message: Dict[str, str],
    media_index: dict,
    raw: Dict[str, Any] = {},
    allow_empty_text: bool = False,
    check_if_media_file_exist: bool = True,
    tried_default_extensions: set = set(),
    tags_mapping_sample_to_allowed: Dict[str, str] = multimodal_utils.media_tags_to_allowed,
    process_message_fragment: Callable = lambda tag, fragment: [{tag: fragment}],
) -> list[Dict[str, Any]]:
    """
    Args:
        raw: dictionary with all webdataset compliant keys of a sample.
            Empty for jsonl dataset, non-empty otherwise.
        metadata:
    """
    fragments = []
    parts = re.split(multimodal_utils.media_tag_pattern, message["value"])

    # Convert the parts to message fragments
    empty_text = True
    for i, part in enumerate(parts):
        if part in multimodal_utils.media_tags.values():
            # process multimodal tags
            tag = multimodal_utils.media_tags_reversed[part]
            if not isinstance(metadata[tag], list):
                metadata[tag] = [metadata[tag]]

            # try to extract the media object from the shard
            ext = os.path.basename(metadata[tag][media_index[tag]]).split('.', 1)[1]
            if raw and ext not in raw and \
                    tag not in tried_default_extensions and \
                    tag in multimodal_utils.default_media_extensions:
                # try the default extension
                for ext in multimodal_utils.default_media_extensions[tag]:
                    if ext in raw:
                        tried_default_extensions.add(ext)
                        break
            media_file = None
            if ext in raw:
                media_file = ext
            elif isinstance(metadata[tag][media_index[tag]], str) and \
                    os.path.isfile(metadata[tag][media_index[tag]]):
                # if cannot get it from the shard files, try to find the local file
                media_file = metadata[tag][media_index[tag]]
            elif check_if_media_file_exist:
                sample_to_print = raw if raw else metadata
                raise ValueError(f"Cannot find the media file {metadata[tag][media_index[tag]]} from {sample_to_print} or locally.")
            else:
                media_file = metadata[tag][media_index[tag]]
            media_index[tag] += 1
            fragments += process_message_fragment(tag, media_file)
        else:
            # process text
            if part.strip():
                fragments += process_message_fragment('text', part)
                empty_text = False

    if not allow_empty_text and empty_text:
        fragments += process_message_fragment('text', ' ')

    return fragments
```
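To make the fragment-splitting behavior above concrete, here is a minimal self-contained sketch. The tag table and regex below are stand-ins for the real `multimodal_utils` definitions, which are not shown in this diff, so treat the exact names and pattern as assumptions:

```python
import re

# Stand-ins for multimodal_utils (assumed shapes, not the real module)
media_tags = {"image": "<image>", "video": "<video>", "audio": "<audio>"}
media_tags_reversed = {v: k for k, v in media_tags.items()}
# Capturing group so re.split keeps the matched tags in the output
media_tag_pattern = r"(<image>|<video>|<audio>)"

def split_message(value: str) -> list[dict]:
    """Split a message string into interleaved media/text fragments."""
    fragments = []
    for part in re.split(media_tag_pattern, value):
        if part in media_tags_reversed:
            # media placeholder; the real code resolves an actual file here
            fragments.append({media_tags_reversed[part]: None})
        elif part.strip():
            fragments.append({"text": part})
    return fragments

print(split_message("<video> What is shown here?"))
# [{'video': None}, {'text': ' What is shown here?'}]
```

The capturing group in the pattern is what makes `re.split` return the tags themselves as parts, which is what lets the loop above dispatch on media vs. text.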
Replace hardcoded user-specific paths with placeholders.
The example config contains absolute paths to user-specific directories on an internal filesystem (/lustre/fsw/portfolios/llmservice/users/yuanhangs/...). These paths won't work for other users and expose internal infrastructure details. Consider using placeholder paths that users can customize.
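One possible shape for such placeholders, sketched here as an illustration (the actual committable suggestion is collapsed above, and any placeholder scheme would do):

```yaml
data:
  dataset_name: GeneralConversationsJsonlDataset
  train_data_path: <PATH_TO_DATASETS>/train_conversations.jsonl
  val_data_path: <PATH_TO_DATASETS>/val_conversations.jsonl
  train_media_data_dir: <PATH_TO_MEDIA>/videos
  val_media_data_dir: <PATH_TO_MEDIA>/videos
```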
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree with coderabbit's comment here about not putting our internal paths in the config. Is this data also available publicly somewhere? It may be useful to provide details (e.g. under https://github.com/NVIDIA-NeMo/RL/tree/main/docs/guides) for how someone could obtain and prepare this dataset. If this data isn't available publicly, is there an alternative publicly available dataset in a similar format we can use as an example?
🌐 Web query result:
Short answer: MiraData's metadata and code are publicly released, but the raw video files are not hosted directly — the project publishes meta CSVs (versions: 330K, 93K, 42K, 9K), download scripts, and instructions so users can fetch the original videos themselves; the repo notes videos come from Internet sources and are subject to copyright and license restrictions (GPL‑3.0). [1][2][3]
🌐 Web query result:
Here are common public video (conversation / video‑QA) datasets and the typical QA annotation formats they use:
AVSD (Audio‑Visual Scene‑Aware Dialog) — dialog-style QA: each sample contains video_id, caption/summary and a Dialog array of 10 rounds (question, answer). Also a multiple‑choice “options” variant with answer_options and gt_index. [1]
TVQA — clip-level multiple‑choice QA for video+subtitles: each item has video clip id, start/end (localized moment), question, four answer candidates and the correct answer index. Frames and subtitles provided alongside. [2][3]
TGIF‑QA — short GIFs, multiple QA tasks (FrameQA, Count, Action, Transition): annotations provided as CSV/TSV per task (video_name, question, answer, etc.). [4][5]
MSVD‑QA / MSRVTT‑QA / ActivityNet‑QA — open‑ended QA: JSON files with entries like {video_id / video_name, question, answer, id}; train/val/test splits. Many repos supply CSV variants for model input. [6][7]
HowToVQA / HowToVQA69M / HowTo100M‑derived — large narrated video QA for pretraining: usually distributed as CSV/PKL lists (video_id, timestamp, question, answer) and separate feature files (S3D/S3D features). [8]
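As a concrete illustration of the open-ended `{video_id, question, answer, id}` shape described for MSVD-QA / MSRVTT-QA / ActivityNet-QA above, a single annotation entry might look like this (the concrete values are invented for illustration):

```python
import json

# One open-ended VideoQA annotation entry in the shape described above;
# the values here are made up, not taken from any real split.
entry = {
    "video_id": "video7010",
    "question": "what is the man riding?",
    "answer": "bicycle",
    "id": 42,
}
line = json.dumps(entry)
print(json.loads(line)["answer"])  # bicycle
```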
Sources: AVSD (dataset page/readme) [1]; TVQA (papers & dataset page) [2][3]; TGIF‑QA repo & description [4][5]; MSVD/MSRVTT/TGIF formats and examples from repos [6][7]; HowToVQA/HowTo100M details [8].
@yfw Based on my research, MiraData's metadata and code are publicly released, but the raw video files are not hosted directly — users must download the original videos themselves using provided scripts. The videos are subject to copyright and license restrictions.
For a better user experience, I'd recommend:
1. Use placeholder paths in the example config
Replace the hardcoded paths with placeholders as CodeRabbit suggested.
2. Create documentation for VLM datasets
I notice there's no guide for VLM dataset preparation in `docs/guides/`. Consider creating a guide (e.g., `docs/guides/vlm-datasets.md`) that explains the `GeneralConversationsJsonlDataset` format (the docstring in `nemo_rl/data/datasets/response_datasets/general_conversations_dataset.py`, lines 30-73, already provides a good example).
3. Consider a small public example
If MiraData requires complex setup, consider including a smaller, fully public example dataset (or instructions to create one) that users can quickly test with.
Would you like me to create an issue to track the documentation work for VLM dataset preparation?
@yfw Good catch. Let me find some publicly available video datasets. Feel free to suggest as well.