 import argparse
 import json
 import os
+import subprocess
 from pathlib import Path
 from typing import Dict, Tuple
 
-from datasets import concatenate_datasets, load_dataset
 from tqdm import tqdm
 
+from datasets import concatenate_datasets, config, load_dataset
+
 """
 This script will convert the ultrachat/sharegpt dataset to the following schema in jsonl format:
 {
@@ -88,7 +90,53 @@ def parse_args():
     return parser.parse_args()
 
 
-def process_ultrachat_row(row: Dict) -> Tuple[Dict, int]:
+def get_cache_dir(dataset_name):
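+    """Return the local cache directory for a downloadable VLM dataset."""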
+    cache_dir = None
+    if dataset_name == "sharegpt4v":
+        raise ValueError("Downloading 'sharegpt4v' is not supported.")
+    elif dataset_name == "allava4v":
+        cache_dir = os.path.join(
+            config.HF_DATASETS_CACHE, "FreedomIntelligence", "ALLaVA"
+        )
+    else:
+        raise ValueError(
+            f"Dataset '{dataset_name}' is not a supported VLM dataset for download."
+        )
+    return cache_dir
+
+
+def download_vlm_dataset(dataset_name: str) -> None:
+    """Download a VLM dataset such as sharegpt4v or allava4v."""
+    if dataset_name == "sharegpt4v":
+        raise ValueError("Downloading 'sharegpt4v' is not supported.")
+    elif dataset_name == "allava4v":
+        cache_dir = get_cache_dir(dataset_name)
+        os.makedirs(cache_dir, exist_ok=True)
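+        # Locate datasets/download_laion.sh relative to this script (../datasets/download_laion.sh).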
+        script_path = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "datasets",
+            "download_laion.sh",
+        )
+        os.chmod(script_path, 0o755)
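+        # Skip the download when the first image chunk is already present in the cache.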
+        if not os.path.exists(
+            os.path.join(cache_dir, "allava_laion", "image_chunks", "images_0.zip")
+        ):
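+            # Run the script from inside the cache directory so the downloaded files land there.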
+            result = subprocess.run(
+                ["bash", script_path],
+                cwd=cache_dir,
+                capture_output=True,
+                text=True,
+            )
+            if result.returncode != 0:
+                raise RuntimeError(f"Image dataset download failed: {result.stderr}")
+            print("##### allava4v dataset download complete #####")
+        else:
+            print("##### allava4v dataset already exists #####")
+    else:
+        raise ValueError(f"Dataset '{dataset_name}' is not a supported VLM dataset for download.")
+
+
+def process_ultrachat_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]:
     """Process a row from the ultrachat dataset.
 
     The function expects a row with the following schema:
@@ -110,7 +158,7 @@ def process_ultrachat_row(row: Dict) -> Tuple[Dict, int]:
     return row, 0
 
 
-def process_sharegpt_row(row: Dict) -> Tuple[Dict, int]:
+def process_sharegpt_row(row: Dict, dataset_name: str = None) -> Tuple[Dict, int]:
     """
     sharegpt dataset schema:
     {
@@ -138,7 +186,7 @@ def process_sharegpt_row(row: Dict) -> Tuple[Dict, int]:
     return row, skipped_count
 
 
-def process_sharegpt4v_row(row) -> Dict:
+def process_sharegpt4v_row(row, dataset_name: str = None) -> Tuple[Dict, int]:
     """
     sharegpt4v dataset schema:
     {
@@ -153,8 +201,9 @@ def process_sharegpt4v_row(row) -> Dict:
         ]
     }
     """
+    cache_dir = get_cache_dir(dataset_name)
     conversations = row["conversations"]
-    image = f'FreedomIntelligence/ALLaVA-4V/{row["image"]}'
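+    # Build the image path inside the local download cache rather than a hard-coded repo path.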
+    image = os.path.join(cache_dir, row["image"])
     if not os.path.exists(image):
         print(f"Image path {image} does not exist, skipping this sample.")
         return None, None
@@ -194,7 +243,7 @@ def process_and_save_ds(train_ds, test_ds, output_path, proc_fn, dataset_name):
     with open(train_output_jsonl_path, "w") as f:
         for item in tqdm(train_ds, desc=f"Processing {dataset_name} dataset"):
             if proc_fn is not None:
-                row, skipped_count = proc_fn(item)
+                row, skipped_count = proc_fn(item, dataset_name)
                 if row is None:
                     continue
                 total_skipped_count += skipped_count
@@ -207,7 +256,7 @@ def process_and_save_ds(train_ds, test_ds, output_path, proc_fn, dataset_name):
     with open(test_output_jsonl_path, "w") as f:
         for item in tqdm(test_ds, desc=f"Processing {dataset_name} test dataset"):
             if proc_fn is not None:
-                row, skipped_count = proc_fn(item)
+                row, skipped_count = proc_fn(item, dataset_name)
                 if row is None:
                     continue
                 total_skipped_count += skipped_count
@@ -292,11 +341,14 @@ def main():
         proc_fn = process_sharegpt_row
     elif args.dataset == "sharegpt4v":
+        raise NotImplementedError("sharegpt4v is not supported yet.")
         ds = load_dataset("Lin-Chen/ShareGPT4V", "ShareGPT4V")["train"]
+        download_vlm_dataset(args.dataset)
         proc_fn = process_sharegpt4v_row
     elif args.dataset == "allava4v":
         ds = load_dataset("FreedomIntelligence/ALLaVA-4V", name="allava_laion")[
             "instruct"
         ]
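+        # Download the ALLaVA-LAION image chunks that process_sharegpt4v_row expects on disk.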
+        download_vlm_dataset(args.dataset)
         proc_fn = process_sharegpt4v_row
     elif args.dataset == "opc":
         if args.opc_subset == "all":
@@ -318,7 +370,6 @@ def main():
         raise ValueError(
             f"This script only supports the ultrachat, sharegpt, sharegpt4v, allava4v, opc, and perfect-blend-gptoss-20B datasets for demo purposes; to use other datasets, please modify this script."
         )
-
     # filter and split dataset
     if args.sample_size is not None and args.sample_size < len(ds):
         ds = ds.select(range(args.sample_size))