Merge branch 'main' of https://github.com/FoundationVision/Infinity

hanjian.thu123 · hanjian.thu123 · commit 73b36e86b554 · 2025-01-21T16:15:27.000+08:00
diff --git a/DockerFile b/DockerFile
@@ -0,0 +1,36 @@
+# Inference environment for Infinity on top of the PyTorch 2.5.1 / CUDA 11.8 devel image.
+FROM pytorch/pytorch:2.5.1-cuda11.8-cudnn9-devel
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda
+
+# PATH is set in its own ENV instruction: within a single ENV instruction,
+# "$CUDA_HOME" is replaced by the value it had before that instruction, so the
+# CUDA_HOME assigned above would not be visible if PATH shared its instruction.
+ENV PATH="$CUDA_HOME/bin:$PATH"
+
+# System packages; the apt package lists are removed in the same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    curl \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace/
+
+# Only requirements.txt is copied into the image; no other project file is added here.
+COPY requirements.txt /workspace/requirements.txt
+
+# Python dependencies: flash-attn is built with MAX_JOBS=1 and without build
+# isolation, then opencv-fixer's AutoFix() is run once at image build time.
+RUN pip install --upgrade pip \
+    && pip install ninja \
+    && MAX_JOBS=1 pip install flash-attn --no-build-isolation \
+    && pip install -r requirements.txt \
+    && pip install opencv-fixer==0.2.5 \
+    && python -c "from opencv_fixer import AutoFix; AutoFix()"
+
+CMD ["/bin/bash"]
diff --git a/README.md b/README.md
@@ -22,6 +22,7 @@
 ## 🔥 Updates!!
 * Dec 24, 2024: 🔥 Training and Testing Codes && Checkpoints && Demo released!
 * Dec 12, 2024: 💻 Add Project Page
+* Dec 10, 2024: 🏆 Visual AutoRegressive Modeling received NeurIPS 2024 Best Paper Award.
 * Dec 5, 2024: 🤗 Paper release
 
 ## 🕹️ Try and Play with Infinity!
@@ -166,7 +167,28 @@ Fine-tuning Infinity is quite simple where you only need to append ```--rush_res
 
 After fine-tuning, you will get a checkpoint like [model_dir]/ar-ckpt-giter(xxx)K-ep(xxx)-iter(xxx)-last.pth. Note that this checkpoint contains training states besides model weights. Inference with this model should enable ```--enable_model_cache=1``` in [eval.sh](scripts/eval.sh) or [interactive_infer.ipynb](tools/interactive_infer.ipynb).
 
+## Use Docker
 
+If you are interested in reproducing the paper model locally (inference only), you can use our Docker container. This one-stop approach is especially suitable for people with no background knowledge.
+
+### 1. Download weights
+
+Download the `flan-t5-xl` folder and the `infinity_2b_reg.pth` and `infinity_vae_d32reg.pth` files into the `weights` folder of the repository (`Infinity/weights`).
+
+### 2. Build Docker container
+
+```
+ docker build -f DockerFile -t my-flash-attn-env .
+ docker run --gpus all -it --name my-container -v {your-local-path}:/workspace my-flash-attn-env
+```
+
+### 3. Run
+
+```
+python Infinity/tools/reproduce.py
+```
+
+Note: To use your own prompts, edit the `prompts` dictionary in `reproduce.py`.
 
 ## One More Thing: Infinity-20B is coming soon 📆
 Infinity shows strong scaling capabilities as illustrated before. Thus we are encouraged to continue to scale up the model size to 20B. Here we present the side-by-side comparison results between Infinity-2B and Infinity-20B.
@@ -186,7 +208,7 @@ Currently, Infinity-20B is still on the training phrase. We will release Infinit
 If our work assists your research, feel free to give us a star ⭐ or cite us using:
 
 ```
-@misc{han2024infinityscalingbitwiseautoregressive,
+@misc{Infinity,
     title={Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution Image Synthesis}, 
     author={Jian Han and Jinlai Liu and Yi Jiang and Bin Yan and Yuqi Zhang and Zehuan Yuan and Bingyue Peng and Xiaobing Liu},
     year={2024},
@@ -197,5 +219,17 @@ If our work assists your research, feel free to give us a star ⭐ or cite us us
 }
 ```
 
+```
+@misc{VAR,
+      title={Visual Autoregressive Modeling: Scalable Image Generation via Next-Scale Prediction}, 
+      author={Keyu Tian and Yi Jiang and Zehuan Yuan and Bingyue Peng and Liwei Wang},
+      year={2024},
+      eprint={2404.02905},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2404.02905}, 
+}
+```
+
 ## License
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
diff --git a/infinity/models/infinity.py b/infinity/models/infinity.py
@@ -16,7 +16,7 @@
 from torch.utils.checkpoint import checkpoint
 from PIL import Image
 import numpy as np
-from torch.nn.attention.flex_attention import flex_attention
+# from torch.nn.attention.flex_attention import flex_attention
 
 import infinity.utils.dist as dist
 from infinity.utils.dist import for_visualize
diff --git a/tools/reproduce.py b/tools/reproduce.py
@@ -0,0 +1,101 @@
+import random
+import torch
+import os
+import os.path as osp
+import cv2
+import numpy as np
+from run_infinity import *
+
+torch.cuda.set_device(0)
+model_path = '/workspace/Infinity/weights/infinity_2b_reg.pth'
+vae_path = '/workspace/Infinity/weights/infinity_vae_d32reg.pth'
+text_encoder_ckpt = '/workspace/Infinity/weights/flan-t5-xl'
+
+# SET
+args = argparse.Namespace(
+    pn='1M',
+    model_path=model_path,
+    cfg_insertion_layer=0,
+    vae_type=32,
+    vae_path=vae_path,
+    add_lvl_embeding_only_first_block=1,
+    use_bit_label=1,
+    model_type='infinity_2b',
+    rope2d_each_sa_layer=1,
+    rope2d_normalized_by_hw=2,
+    use_scale_schedule_embedding=0,
+    sampling_per_bits=1,
+    text_encoder_ckpt=text_encoder_ckpt,
+    text_channels=2048,
+    apply_spatial_patchify=0,
+    h_div_w_template=1.000,
+    use_flex_attn=0,
+    cache_dir='/dev/shm',
+    checkpoint_type='torch',
+    seed=0,
+    bf16=1,
+    save_file='tmp.jpg',
+    enable_model_cache=0
+)
+
+# LOAD
+text_tokenizer, text_encoder = load_tokenizer(t5_path=args.text_encoder_ckpt)
+vae = load_visual_tokenizer(args)
+infinity = load_transformer(vae, args)
+
+# PROMPT
+prompts = {
+    "vintage_insect": "Insect made from vintage 1960s electronic components, capacitors, resistors, transistors, wires, diodes, solder, circuitboard.",
+    "macro_closeup": "Denis Villeneuve's extreme macro cinematographic close-up in water.",
+    "3d_school": "A creative 3D image to be placed at the bottom of a mobile application's homepage, depicting a miniature school and children carrying backpacks.",
+    "explore_more": "Create an image with 'Explore More' in an adventurous font over a picturesque hiking trail.",
+    "toy_car": "Close-up shot of a diecast toy car, diorama, night, lights from windows, bokeh, snow.",
+    "fairy_house": "House: white; pink tinted windows; surrounded by flowers; cute; scenic; garden; fairy-like; epic; photography; photorealistic; insanely detailed and intricate; textures; grain; ultra-realistic.",
+    "cat_fashion": "Hyperrealistic black and white photography of cats fashion show in style of Helmut Newton.",
+    "spacefrog_astroduck": "Two superheroes called Spacefrog (a dashing green cartoon-like frog with a red cape) and Astroduck (a yellow fuzzy duck, part-robot, with blue/grey armor), near a garden pond, next to their spaceship, a classic flying saucer, called the Tadpole 3000. Photorealistic.",
+    "miniature_village": "An enchanted miniature village bustling with activity, featuring tiny houses, markets, and residents.",
+    "corgi_dog": "A close-up photograph of a Corgi dog. The dog is wearing a black hat and round, dark sunglasses. The Corgi has a joyful expression, with its mouth open and tongue sticking out, giving an impression of happiness or excitement.",
+    "robot_eggplant": "a robot holding a huge eggplant, sunny nature background",
+    "perfume_product": "Product photography, a perfume placed on a white marble table with pineapple, coconut, lime next to it as decoration, white curtains, full of intricate details, realistic, minimalist, layered gestures in a bright and concise atmosphere, minimalist style.",
+    "mountain_landscape": "The image presents a picturesque mountainous landscape under a cloudy sky. The mountains, blanketed in lush greenery, rise majestically, their slopes dotted with clusters of trees and shrubs. The sky above is a canvas of blue, adorned with fluffy white clouds that add a sense of tranquility to the scene. In the foreground, a valley unfolds, nestled between the towering mountains. It appears to be a rural area, with a few buildings and structures visible, suggesting the presence of a small settlement. The buildings are scattered, blending harmoniously with the natural surroundings. The image is captured from a high vantage point, providing a sweeping view of the valley and the mountains."
+}
+
+# OUTPUT
+output_dir = "outputs"
+os.makedirs(output_dir, exist_ok=True)
+
+# GEN IMG
+for category, prompt in prompts.items():
+    cfg = 3
+    tau = 0.5
+    h_div_w = 1/1 # Aspect Ratio
+    seed = random.randint(0, 10000)
+    enable_positive_prompt = 0
+
+    h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates-h_div_w))]
+    scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
+    scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
+
+    # GEN
+    generated_image = gen_one_img(
+        infinity,
+        vae,
+        text_tokenizer,
+        text_encoder,
+        prompt,
+        g_seed=seed,
+        gt_leak=0,
+        gt_ls_Bl=None,
+        cfg_list=cfg,
+        tau_list=tau,
+        scale_schedule=scale_schedule,
+        cfg_insertion_layer=[args.cfg_insertion_layer],
+        vae_type=args.vae_type,
+        sampling_per_bits=args.sampling_per_bits,
+        enable_positive_prompt=enable_positive_prompt,
+    )
+
+    # SAVE
+    save_path = osp.join(output_dir, f"re_{category}_test.jpg")
+    cv2.imwrite(save_path, generated_image.cpu().numpy())
+    print(f"{category} image saved to {save_path}")