Binary file added .alignment.sh.swp
Binary file not shown.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -60,3 +60,6 @@ coverage.xml
**/.ipynb_checkpoints/
.vscode/
checkpoints/
stage_1_alignment_llava_ov_4b/
tmp/
Stage1/training_output.log
File renamed without changes.
62 changes: 62 additions & 0 deletions Stage1/alignment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
#SBATCH --job-name=llava_stage1_4b
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH -p long
#SBATCH -q gpu-12
#SBATCH --gres=gpu:4 # 4 A100 GPUs on a single node
#SBATCH --mem=230G # node RAM, not GPU RAM
#SBATCH --ntasks-per-node=4 # one task per GPU
#SBATCH --cpus-per-task=16 # adjust if you want fewer CPU cores
#SBATCH --output=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/Stage1/logs/%x-%j.out # logs/llava_stage1_4b-<jobid>.out

# ---- ENV SETUP ----
source ~/miniconda3/etc/profile.d/conda.sh
conda activate llava-ov-4b-clean
# conda activate apex_cuda120
export PATH=/usr/local/cuda-12.1/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
export APEX_CUDA_EXT=1

# Go to repo root on CIAI cluster
# cd /l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5
# # Go to repo root on 156 machine
cd /share/data/drive_3/mobile_vlm/LLaVA-OneVision-1.5

# ============================================================
# Required environment variables:
# AIAK_TRAINING_PATH Root directory of the AIAK-Training-LLM project
# DATA_PATH Directory with WebDataset shards (.tar) for pretraining
# TOKENIZER_PATH Hugging Face tokenizer directory
# CHECKPOINT_PATH Megatron-formatted checkpoint directory (e.g., mcore TP1/PP1)
# SAVE_CKPT_PATH Output directory for saving training checkpoints
# export CUDA_HOME=/apps/local/nvidia/cuda-12.0
# export PATH="$CUDA_HOME/bin:$PATH"
# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# Use CUDA 12.1 libraries but the system nvcc (CUDA 10.1), since the CUDA 12.1 install does not ship nvcc
export CUDA_HOME=/usr
export PATH="/usr/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-12.1/lib64:/usr/local/cuda-12.1/targets/x86_64-linux/lib:$LD_LIBRARY_PATH"


# AIAK_TRAINING_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5 \
# DATA_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/data/LLaVA-558K-Webdataset \
# TOKENIZER_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/checkpoints/LLaVA-OneVision-1.5-4B-stage0 \
# CHECKPOINT_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/checkpoints/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1 \

# echo "AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH}"
# echo "DATA_PATH=${DATA_PATH}"
# echo "TOKENIZER_PATH=${TOKENIZER_PATH}"
# echo "CHECKPOINT_PATH=${CHECKPOINT_PATH}"
# echo "SLURM_NODELIST=${SLURM_NODELIST}"

# Weights & Biases configuration
export WANDB_API_KEY="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"
export WANDB_PROJECT="llava-ov-1_5"
export WANDB_NAME="fastvit_integration"

# Use only 2 of the allocated GPUs for this run
export CUDA_VISIBLE_DEVICES=0,1
export GPUS_PER_NODE=2

bash examples/llava_ov_1_5/quick_start/stage_1_alignment_llava_ov_4b.sh 2 1
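The commented-out block above lists the variables the quick-start script expects (AIAK_TRAINING_PATH, DATA_PATH, TOKENIZER_PATH, CHECKPOINT_PATH, SAVE_CKPT_PATH). As a minimal sketch, they could be exported before the launch line using the CIAI-cluster paths from those comments; the SAVE_CKPT_PATH value below is an assumption, not taken from this diff:

export AIAK_TRAINING_PATH=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5
export DATA_PATH=$AIAK_TRAINING_PATH/data/LLaVA-558K-Webdataset
export TOKENIZER_PATH=$AIAK_TRAINING_PATH/checkpoints/LLaVA-OneVision-1.5-4B-stage0
export CHECKPOINT_PATH=$AIAK_TRAINING_PATH/checkpoints/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1
export SAVE_CKPT_PATH=$AIAK_TRAINING_PATH/checkpoints/stage_1_alignment_llava_ov_4b   # assumed output directory
bash examples/llava_ov_1_5/quick_start/stage_1_alignment_llava_ov_4b.sh 2 1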
68 changes: 68 additions & 0 deletions Stage1/alignment_rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# AMD/ROCm alignment launcher (separate from alignment.sh)
# Adjust SBATCH partition/queue to your cluster settings.

#SBATCH --job-name=llava_stage1_4b_rocm
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH -p amd-gpu # TODO: set your AMD GPU partition/queue
#SBATCH --gres=gpu:2 # match quick_start GPUS_PER_NODE=2
#SBATCH --mem=230G
#SBATCH --ntasks-per-node=2 # one task per GPU
#SBATCH --cpus-per-task=16
#SBATCH --output=/l/users/rana.zayed/new_fastvlm/LLaVA-OneVision-1.5/Stage1/logs/%x-%j.out

# ---- ENV SETUP (ROCm) ----
source ~/.bashrc
# Optional: override with `CONDA_ENV=myenv` before running
CONDA_ENV=${CONDA_ENV:-llava-ov-4b-clean}
if command -v conda >/dev/null 2>&1; then
if conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then
conda activate "$CONDA_ENV"
else
echo "[Warn] Conda env '$CONDA_ENV' not found; continuing without activation"
fi
fi

export ROCM_HOME=${ROCM_HOME:-/opt/rocm}
export PATH="${ROCM_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1}

# CUDA-only Apex extensions are disabled on ROCm
export APEX_CUDA_EXT=0

# RCCL/NCCL runtime hints (tune as needed)
export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
export NCCL_COLLNET_ENABLE=${NCCL_COLLNET_ENABLE:-0}
export NCCL_P2P_ENABLE=${NCCL_P2P_ENABLE:-1}
# export NCCL_SOCKET_IFNAME=eno1 # uncomment and set to your NIC if needed

# Resolve repo root relative to this script (Stage1/..)
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)

# Go to repo root
cd "$REPO_ROOT" || { echo "[Error] Repo root not found: $REPO_ROOT"; exit 1; }

# Required environment variables (exported so the quick-start script inherits them)
export AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-$REPO_ROOT}
export DATA_PATH=${DATA_PATH:-$REPO_ROOT/data/LLaVA-558K-Webdataset}
export TOKENIZER_PATH=${TOKENIZER_PATH:-$REPO_ROOT/checkpoints/LLaVA-OneVision-1.5-4B-stage0}
export CHECKPOINT_PATH=${CHECKPOINT_PATH:-$REPO_ROOT/checkpoints/LLaVA-OneVision-1.5-4B-stage0_mcore_tp1_pp1}

echo "AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH}"
echo "DATA_PATH=${DATA_PATH}"
echo "TOKENIZER_PATH=${TOKENIZER_PATH}"
echo "CHECKPOINT_PATH=${CHECKPOINT_PATH}"
echo "SLURM_NODELIST=${SLURM_NODELIST}"

# Launch the quick-start script (uses torchrun with the nccl backend, which maps to RCCL on ROCm)
QS="$REPO_ROOT/examples/llava_ov_1_5/quick_start/stage_1_alignment_llava_ov_4b.sh"
if [[ ! -f "$QS" ]]; then
echo "[Error] Quick-start script not found: $QS"
exit 1
fi
bash "$QS"
10 changes: 9 additions & 1 deletion aiak_megatron/megatron/core/datasets/Makefile
Original file line number Diff line number Diff line change
@@ -7,7 +7,15 @@ LIBEXT = $(shell python3-config --extension-suffix)
OUT = $(LIBNAME)$(LIBEXT)
SRC = helpers.cpp

default: $(OUT)
# Check if any compiled library exists
EXISTING_SO = $(wildcard helpers_cpp*.so)

default:
ifneq ($(EXISTING_SO),)
@echo "Using existing compiled library: $(EXISTING_SO)"
else
@$(MAKE) $(OUT)
endif

$(OUT): $(SRC)
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
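One consequence of the new guard, for anyone iterating on helpers.cpp: once any helpers_cpp*.so exists, the default target only reports it and never rebuilds, so a rebuild has to be forced by deleting the stale library first. A short sketch, run from the datasets directory:

rm -f helpers_cpp*.so
make    # now falls through to the compile rule for $(OUT)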
17 changes: 15 additions & 2 deletions aiak_megatron/megatron/core/datasets/utils.py
Original file line number Diff line number Diff line change
@@ -22,8 +22,21 @@ def compile_helpers():
"""Compile C++ helper functions at runtime. Make sure this is invoked on a single process."""
import os
import subprocess

command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))]
import glob

# Check if helpers_cpp is already compiled
helpers_dir = os.path.abspath(os.path.dirname(__file__))
so_files = glob.glob(os.path.join(helpers_dir, "helpers_cpp*.so"))
if so_files:
# Try to import to verify it works
try:
import helpers_cpp
log_single_rank(logger, logging.INFO, f"Using pre-compiled helpers: {so_files[0]}")
return
except ImportError:
pass

command = ["make", "-C", helpers_dir]
if subprocess.run(command).returncode != 0:
import sys

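Since compile_helpers() now returns early when a prebuilt helpers_cpp*.so imports cleanly, one way to avoid any rank falling through to make at startup is to build the helpers once before launching training. A sketch, using the datasets path shown in this diff:

( cd aiak_megatron/megatron/core/datasets && make )   # builds helpers_cpp*.so or reports the existing one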
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class StrategyAction(Enum):

default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)


async_calls = AsyncCallsQueue()

