update fake inputs generation, initialize distributed for Nemotron models

Edwardf0t1 · Edwardf0t1 · commit f40501d7de11 · 2025-09-19T22:38:30.000Z
Signed-off-by: Zhiyu Cheng &lt;zhiyuc@nvidia.com&gt;
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -83,17 +83,26 @@ def _create_fake_vl_inputs(model, fake_input_ids):
     Returns:
         dict: Dictionary of fake inputs for the VL model
     """
+    import inspect
+
     device = fake_input_ids.device
     batch_size = fake_input_ids.shape[0]
 
+    # Get the model's forward method signature to see what parameters it accepts
+    forward_signature = inspect.signature(model.forward)
+    accepted_params = set(forward_signature.parameters.keys())
+
     # Create fake inputs based on common VL model patterns
-    fake_inputs = {
-        "input_ids": fake_input_ids,
-        "attention_mask": torch.ones_like(fake_input_ids),
-    }
+    fake_inputs = {}
+
+    # Always include basic text inputs if accepted
+    if "input_ids" in accepted_params:
+        fake_inputs["input_ids"] = fake_input_ids
+    if "attention_mask" in accepted_params:
+        fake_inputs["attention_mask"] = torch.ones_like(fake_input_ids)
 
-    # Add vision-specific inputs based on model configuration
-    if hasattr(model.config, "vision_config"):
+    # Add vision-specific inputs based on model configuration and accepted parameters
+    if hasattr(model.config, "vision_config") and "pixel_values" in accepted_params:
         vision_config = model.config.vision_config
         # Create fake pixel values based on vision config
         if hasattr(vision_config, "image_size"):
@@ -111,16 +120,34 @@ def _create_fake_vl_inputs(model, fake_input_ids):
             [batch_size, num_channels, image_size, image_size], dtype=torch.float32, device=device
         )
 
-    # Handle Nemotron-specific inputs
+    # Handle Nemotron-specific inputs based on testing results
     model_name = getattr(model, "name_or_path", "").lower()
     if "nemotron" in model_name:
-        # Nemotron models may need specific image flags
-        fake_inputs["image_flags"] = torch.zeros([batch_size, 1], dtype=torch.long, device=device)
+        if "pixel_values" in accepted_params:
+            # Based on testing, Nemotron expects pixel_values with shape [14, 3, 512, 512]
+            # This represents 14 image patches, each 512x512 pixels with 3 channels
+            num_patches = 14
+            patch_size = 512
+            num_channels = 3
+
+            # Override any previous pixel_values with the correct Nemotron format
+            # Use small random values instead of zeros to avoid NoneType issues
+            fake_inputs["pixel_values"] = (
+                torch.randn(
+                    [num_patches, num_channels, patch_size, patch_size],
+                    dtype=torch.float32,
+                    device=device,
+                )
+                * 0.1
+            )  # Small values to avoid extreme activations
 
-        # Some VL models need aspect ratio information
-        fake_inputs["aspect_ratio_ids"] = None
-        fake_inputs["aspect_ratio_mask"] = None
-        fake_inputs["cross_attention_mask"] = None
+        if "image_flags" in accepted_params:
+            # Based on testing, image_flags should have shape [14] (no batch dimension)
+            # to match the [14, 256, 4096] tensor it's used to mask
+            num_patches = 14  # From pixel_values shape [14, 3, 512, 512]
+            fake_inputs["image_flags"] = torch.zeros(
+                [num_patches], dtype=torch.long, device=device
+            )  # Shape [14] to match vision tensor dimensions
 
     return fake_inputs
 
@@ -202,6 +229,31 @@ def _output_hook(module, input, output):
         elif is_vl_model:
             # For VL models, create proper fake vision inputs
             print("Detected VL model during export - creating fake vision inputs")
+
+            # Pre-emptively initialize distributed for Nemotron models that require it
+            model_name = getattr(model, "name_or_path", "").lower()
+            if "nemotron" in model_name:
+                import os
+
+                import torch.distributed as dist
+
+                if not dist.is_available() or not dist.is_initialized():
+                    print("Pre-initializing distributed processing for Nemotron VL model")
+                    # Set up minimal distributed environment
+                    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+                    os.environ.setdefault("MASTER_PORT", "29500")
+                    os.environ.setdefault("RANK", "0")
+                    os.environ.setdefault("WORLD_SIZE", "1")
+
+                    if dist.is_available() and not dist.is_initialized():
+                        try:
+                            dist.init_process_group(
+                                backend="nccl" if torch.cuda.is_available() else "gloo",
+                                rank=0,
+                                world_size=1,
+                            )
+                        except Exception as dist_e:
+                            print(f"Failed to initialize distributed processing: {dist_e}")
             try:
                 # Try to create proper fake vision inputs for the VL model
                 fake_kwargs = _create_fake_vl_inputs(model, fake_input)
@@ -219,8 +271,47 @@ def _output_hook(module, input, output):
                 # For encoder-decoder models, we need to pass both the encoder and decoder input ids
                 model(fake_input, decoder_input_ids=decoder_fake_input)
             elif is_vl_model:
-                # For VL models, use the fake vision inputs
-                model(**fake_kwargs)
+                # For VL models, try to run optimization on just the language model part
+                language_model = None
+                if hasattr(model, "language_model"):
+                    language_model = model.language_model
+                    print(
+                        "Found language_model attribute - running optimization on language model only"
+                    )
+                elif hasattr(model, "model") and hasattr(model.model, "language_model"):
+                    language_model = model.model.language_model
+                    print(
+                        "Found language_model in model.model - running optimization on language model only"
+                    )
+
+                if language_model is not None:
+                    # Run optimization on just the language model with the same input format as regular LLMs
+                    # Use the same fake_input tensor that regular LLMs use
+                    print(
+                        f"Running optimization on language model with fake_input shape: {fake_input.shape}"
+                    )
+                    try:
+                        language_model(fake_input)
+                        print("✅ Language model optimization completed successfully")
+                    except Exception as e:
+                        print(f"Language model optimization failed: {e}")
+                        print("Continuing with export...")
+                else:
+                    # Fallback: try full model with VL inputs
+                    print("No separate language_model found - trying full VL model")
+                    try:
+                        model(**fake_kwargs)
+                        print("✅ Full VL model optimization completed successfully")
+                    except (ValueError, RuntimeError, AttributeError) as e:
+                        if (
+                            "Default process group has not been initialized" in str(e)
+                            or "must match the size of tensor" in str(e)
+                            or "'bool' object has no attribute 'sum'" in str(e)
+                        ):
+                            print(f"VL model forward pass failed: {e}")
+                            print("Skipping optimization for VL model - continuing with export")
+                        else:
+                            raise
             else:
                 model(fake_input)