Changes from all commits
23 commits
fc13f35
Save: commit all local changes
RanaZay Dec 24, 2025
8588f2f
Add TE-free attention path and training fixes
RanaZay Dec 25, 2025
4692f43
Track Stage1/alignment.sh in outer repo (de-nest Stage1)
RanaZay Dec 25, 2025
f46815e
Track apex in outer repo (de-nest apex submodule)
RanaZay Dec 25, 2025
a816816
Prune old stage_1_alignment logs and tensorboard events; keep latest …
RanaZay Dec 25, 2025
b1429d0
Add AMD/ROCm alignment launcher (alignment_rocm.sh)
RanaZay Dec 25, 2025
fbd311d
Make ROCm launcher path-relative with conda fallback and quick-start …
RanaZay Dec 25, 2025
c9f0be1
Configure for single GPU training (TP=1, PP=1) and fix compilation is…
RanaZay Jan 4, 2026
8b2f8ce
Add debugging and exploration changes to training pipeline
RanaZay Jan 4, 2026
1aba241
Integrate FastViT vision encoder replacing RICE-ViT
RanaZay Jan 8, 2026
2eb9bce
Add debugging prints and update configs for FastViT integration
RanaZay Jan 9, 2026
46cb9ad
Update configuration and metadata files
RanaZay Jan 12, 2026
2da41ab
Fix ROCm compatibility: add PYTHONPATH and local_files_only for token…
RanaZay Jan 13, 2026
6b8eb12
Mobile LLM integration: Add MobileLLM model support and memory optimi…
RanaZay Jan 15, 2026
4a9753d
Add MobileLLM checkpoints using Git LFS
RanaZay Jan 15, 2026
055ae65
Add MobileLLM-140M integration with LLaVA-OneVision
Jan 16, 2026
aaa1f84
Merge branch 'mobile-llm-integration' of https://github.com/RanaZay/L…
RanaZay Jan 16, 2026
a53826d
feat: Add MobileLLM-R1-140M integration with FastViT
RanaZay Feb 14, 2026
5fbe2d4
Add MobileLLM-140M + FastViT checkpoint conversion and merge scripts
RanaZay Feb 14, 2026
b01f2d9
Fix vision config loading and tokenizer local path handling
RanaZay Feb 16, 2026
5e16565
Improve local path detection for tokenizer loading
RanaZay Feb 16, 2026
fcadc4c
Skip fused CUDA kernels compilation on ROCm/AMD platforms
RanaZay Feb 16, 2026
b16cd6c
Skip fused CUDA kernels compilation on ROCm/AMD platforms and add mod…
RanaZay Feb 16, 2026
Binary file added .alignment.sh.swp
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.bin filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
checkpoints/LLaVA-OneVision-1.5-4B-stage0/** filter=lfs diff=lfs merge=lfs -text
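These attribute rules route large model artifacts (*.bin, *.safetensors, *.pt, and the stage0 checkpoint directory) through Git LFS. As a quick sanity check before committing weights, the filter assignment can be confirmed locally; a minimal sketch, with the checked file names used only as placeholders:

git check-attr filter -- model.safetensors checkpoints/LLaVA-OneVision-1.5-4B-stage0/pytorch_model.bin
git lfs ls-files   # lists files already stored through LFS after committing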
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,6 @@ coverage.xml
**/.ipynb_checkpoints/
.vscode/
checkpoints/
stage_1_alignment_llava_ov_4b/
tmp/
Stage1/training_output.log
File renamed without changes.
69 changes: 69 additions & 0 deletions Stage1/alignment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
#SBATCH --job-name=llava_stage1_4b
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH -p long
#SBATCH -q gpu-12
#SBATCH --gres=gpu:4 # 4 A100 GPUs on a single node
#SBATCH --mem=230G # node RAM, not GPU RAM
#SBATCH --ntasks-per-node=4 # one task per GPU
#SBATCH --cpus-per-task=16 # adjust if you want fewer CPU cores
#SBATCH --output=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/Stage1/logs/%x-%j.out # logs/llava_stage1_4b-<jobid>.out

# ---- ENV SETUP ----
source ~/miniconda3/etc/profile.d/conda.sh
conda activate llava-ov-4b-clean
# conda activate apex_cuda120
export PATH=/usr/local/cuda-12.1/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
export APEX_CUDA_EXT=1

# Go to repo root on CIAI cluster
# cd /l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5
# # Go to repo root on 156 machine
cd /share/data/drive_3/mobile_vlm/LLaVA-OneVision-1.5

# ============================================================
# Required environment variables:
# AIAK_TRAINING_PATH Root directory of the AIAK-Training-LLM project
# DATA_PATH Directory with WebDataset shards (.tar) for pretraining
# TOKENIZER_PATH Hugging Face tokenizer directory
# CHECKPOINT_PATH Megatron-formatted checkpoint directory (e.g., mcore TP1/PP1)
# SAVE_CKPT_PATH Output directory for saving training checkpoints
# export CUDA_HOME=/apps/local/nvidia/cuda-12.0
# export PATH="$CUDA_HOME/bin:$PATH"
# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# Use the CUDA 12.1 libraries but the system nvcc (CUDA 10.1), since the CUDA 12.1 install does not ship nvcc
export CUDA_HOME=/usr
export PATH="/usr/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-12.1/lib64:/usr/local/cuda-12.1/targets/x86_64-linux/lib:$LD_LIBRARY_PATH"


# AIAK_TRAINING_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5 \
# DATA_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/data/LLaVA-558K-Webdataset \
# TOKENIZER_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/checkpoints/LLaVA-OneVision-1.5-4B-stage0 \
# CHECKPOINT_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/checkpoints/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1 \

# echo "AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH}"
# echo "DATA_PATH=${DATA_PATH}"
# echo "TOKENIZER_PATH=${TOKENIZER_PATH}"
# echo "CHECKPOINT_PATH=${CHECKPOINT_PATH}"
# echo "SLURM_NODELIST=${SLURM_NODELIST}"

# Weights & Biases configuration
export WANDB_API_KEY="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"
export WANDB_PROJECT="llava-ov-1_5"
export WANDB_NAME="mobilellm_integration"

export CUDA_VISIBLE_DEVICES=0,1,2,3
export GPUS_PER_NODE=4
export MASTER_PORT=26000

# Choose which backbone to use for training:
# ============================================================
# Option 1: MobileLLM-R1-140M (140M params, efficient for mobile/edge)
bash examples/llava_ov_1_5/quick_start/stage_1_alignment_mobilellm_140m.sh

# Option 2: Original Qwen2.5-4B backbone (4B params)
# bash examples/llava_ov_1_5/quick_start/stage_1_alignment_llava_ov_4b.sh
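For reference, a typical submit-and-monitor flow for this launcher; a sketch assuming the SLURM setup declared in the #SBATCH header above (the job id is a placeholder, and the log path follows the #SBATCH --output pattern):

sbatch Stage1/alignment.sh                          # queue the Stage-1 alignment job
squeue -u $USER                                     # check job state
tail -f Stage1/logs/llava_stage1_4b-<jobid>.out     # follow the training log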
81 changes: 81 additions & 0 deletions Stage1/alignment_rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash

#SBATCH --job-name=llava_stage1_4b_amd
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=128
#SBATCH --gres=gpu:2
#SBATCH --time=72:00:00
#SBATCH --mem=230G
#SBATCH --qos=skqos
#SBATCH --partition=faculty
#SBATCH --output=/vast/users/salman.khan/mobile_vlm/llava_ov1.5/LLaVA-OneVision-1.5/Stage1/logs/%x-%j.out

# ---- ENV SETUP (AMD) ----
source ~/.bashrc
conda activate mobile_vlm

export MIOPEN_DISABLE_CACHE=1
export PYTORCH_TUNABLEOP_ENABLED=0

export ROCM_HOME=${ROCM_HOME:-/opt/rocm}
export PATH="${ROCM_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"

export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0}

# Force HuggingFace offline mode to use local files only
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1


# RCCL/NCCL runtime hints (tune as needed)
export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
export NCCL_COLLNET_ENABLE=${NCCL_COLLNET_ENABLE:-0}
export NCCL_P2P_ENABLE=${NCCL_P2P_ENABLE:-1}
# export NCCL_SOCKET_IFNAME=eno1 # uncomment and set to your NIC if needed

# Resolve repo root (SCRIPT_DIR is derived from this script; REPO_ROOT is currently pinned to this cluster's checkout)
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=/vast/users/salman.khan/mobile_vlm/llava_ov1.5/LLaVA-OneVision-1.5

# Go to repo root
cd "$REPO_ROOT" || { echo "[Error] Repo root not found: $REPO_ROOT"; exit 1; }

echo "=== ENV CHECK ==="
which conda
which python
python -V
echo "CONDA_DEFAULT_ENV=$CONDA_DEFAULT_ENV"
echo "CONDA_PREFIX=$CONDA_PREFIX"
python -c "import sys; print('sys.executable=', sys.executable)"
python -c "import torch; print('torch=', torch.__version__)" || echo "TORCH NOT FOUND"
pip -V
pip list | grep -E "torch|pytorch" || true
echo "=== END ENV CHECK ==="

# Required environment variables
export AIAK_TRAINING_PATH="${AIAK_TRAINING_PATH:-$REPO_ROOT}"
export AIAK_MAGATRON_PATH="${AIAK_MAGATRON_PATH:-$REPO_ROOT/aiak_megatron}"
export DATA_PATH="${DATA_PATH:-$REPO_ROOT/data/LLaVA-558K-Webdataset}"
export TOKENIZER_PATH="${TOKENIZER_PATH:-$REPO_ROOT/checkpoints/LLaVA-OneVision-1.5-4B-stage0}"
export CHECKPOINT_PATH="${CHECKPOINT_PATH:-$REPO_ROOT/checkpoints/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1}"
# Add megatron to PYTHONPATH so imports work
export PYTHONPATH="${AIAK_MAGATRON_PATH}:${AIAK_TRAINING_PATH}:${PYTHONPATH}"

echo "AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH}"
echo "AIAK_MAGATRON_PATH=${AIAK_MAGATRON_PATH}"
echo "DATA_PATH=${DATA_PATH}"
echo "TOKENIZER_PATH=${TOKENIZER_PATH}"
echo "CHECKPOINT_PATH=${CHECKPOINT_PATH}"
echo "SLURM_NODELIST=${SLURM_NODELIST}"
echo "PYTHONPATH=${PYTHONPATH}"

# Weights & Biases configuration
export WANDB_API_KEY="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"
export WANDB_PROJECT="llava-ov-1_5"
export WANDB_NAME="fastvit_integration"
bash examples/llava_ov_1_5/quick_start/stage_1_alignment_llava_ov_4b.sh
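Because every required path in this launcher is assigned with the ${VAR:-default} pattern, it can be pointed at a different dataset or checkpoint at submission time without editing the file. A usage sketch (the override paths are placeholders; sbatch propagates the caller's environment to the job by default):

DATA_PATH=/path/to/LLaVA-558K-Webdataset \
CHECKPOINT_PATH=/path/to/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1 \
sbatch Stage1/alignment_rocm.sh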
114 changes: 114 additions & 0 deletions Stage1/inference_fastvlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""
FastVLM Inference Script
Example run:
python inference_fastvlm.py --checkpoint_path ./stage_1_alignment_llava_ov_4b/iter_0000020 \
    --image_path ./test_image.jpg \
    --prompt "What is in this image?"
"""

import os
import sys
import torch
from PIL import Image
from argparse import ArgumentParser

# Add repo root to path
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, REPO_ROOT)
sys.path.insert(0, os.path.join(REPO_ROOT, 'aiak_megatron'))

from transformers import AutoProcessor
from aiak_training_llm.models.fastvit.fastvit_preprocessor import FastViTImageProcessor
from aiak_training_llm.models.fastvit.mm_utils import expand2square

# Argument parser
parser = ArgumentParser(description="FastVLM Inference")
parser.add_argument('--checkpoint_path', type=str,
                    default='/share/data/drive_3/mobile_vlm/LLaVA-OneVision-1.5/stage_1_alignment_llava_ov_4b/iter_0000020',
                    help='Path to trained checkpoint directory')
parser.add_argument('--tokenizer_path', type=str,
                    default='/share/data/drive_3/mobile_vlm/LLaVA-OneVision-1.5/checkpoints/LLaVA-OneVision-1.5-4B-stage0',
                    help='Path to tokenizer')
parser.add_argument('--image_path', type=str, default='test_image.jpg',
                    help='Path to input image')
parser.add_argument('--prompt', type=str, default='What is in this image?',
                    help='Text prompt for the model')
parser.add_argument('--image_size', type=int, default=1024,
                    help='FastViT image size (384 or 1024)')
parser.add_argument('--use_gpu', action='store_true', default=True,
                    help='Use GPU for inference')
args = parser.parse_args()

print("=" * 80)
print("FastVLM Inference")
print("=" * 80)
print(f"Checkpoint: {args.checkpoint_path}")
print(f"Tokenizer: {args.tokenizer_path}")
print(f"Image: {args.image_path}")
print(f"Prompt: {args.prompt}")
print(f"Image Size: {args.image_size}")
print("=" * 80)

# Device setup
device = torch.device("cuda:0" if torch.cuda.is_available() and args.use_gpu else "cpu")
print(f"Using device: {device}")

# Load tokenizer/processor
print("\nLoading tokenizer and processor...")
processor = AutoProcessor.from_pretrained(args.tokenizer_path, trust_remote_code=True)
tokenizer = processor.tokenizer

# Initialize FastViT image processor
fastvit_processor = FastViTImageProcessor(image_size=args.image_size)
print(f"FastViT processor initialized with image_size={args.image_size}")

# Load and preprocess image
print(f"\nLoading image from: {args.image_path}")
if not os.path.exists(args.image_path):
    print(f"ERROR: Image file not found: {args.image_path}")
    print("Please provide a valid image path using --image_path")
    sys.exit(1)

image = Image.open(args.image_path).convert('RGB')
print(f"Original image size: {image.size}")

# Preprocess with FastViT (pad to square)
mean_color = tuple(int(x * 255) for x in fastvit_processor.image_mean)
image_padded = expand2square(image, mean_color)
print(f"Padded to square: {image_padded.size}")

pixel_values = fastvit_processor(image_padded)
print(f"Preprocessed image shape: {pixel_values.shape}")

# Create prompt with vision tokens
IMAGE_TOKEN = "<|image_pad|>"
VISION_START = "<|vision_start|>"
VISION_END = "<|vision_end|>"

# Format: <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|>...<|vision_end|>\nPROMPT<|im_end|>\n<|im_start|>assistant\n
conversation = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{VISION_START}{IMAGE_TOKEN}{VISION_END}\n{args.prompt}<|im_end|>\n<|im_start|>assistant\n"

print("\nTokenizing prompt...")
input_ids = tokenizer(conversation, return_tensors="pt")["input_ids"]
print(f"Input IDs shape: {input_ids.shape}")
print(f"Prompt tokens: {input_ids.shape[1]}")

# TODO: Load your trained model checkpoint here
# This requires implementing model loading from Megatron checkpoint
print("\n" + "=" * 80)
print("NOTE: Model loading from Megatron checkpoint not yet implemented.")
print("This script currently only demonstrates preprocessing.")
print("\nTo complete inference, you need to:")
print("1. Load the model from checkpoint using Megatron utilities")
print("2. Convert distributed checkpoint to single GPU format")
print("3. Call model.forward() with preprocessed inputs")
print("=" * 80)

# Placeholder for model inference
print("\nPreprocessed inputs ready:")
print(f" - pixel_values: {pixel_values.shape} ({pixel_values.dtype})")
print(f" - input_ids: {input_ids.shape}")
print(f" - Device: {device}")

print("\nInference complete (preprocessing only).")

Binary file added Stage1/test_image.jpg