
Commit fabd803

Author: Ye Shaokai
Commit message: merge conflict solved
Merge commit with 2 parents: 1197464 + a602128

17 files changed: +701 / -33 lines

.vscode/launch.json

Lines changed: 5 additions & 6 deletions
@@ -90,13 +90,12 @@
 //         "request": "launch",
 //         "program": "docs/LLaVA_OneVision_Tutorials.py",
 //         "console": "integratedTerminal",
-//         "env":{"CUDA_VISIBLE_DEVICES":"0",
-//                "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+//         "env":{
+//             "CUDA_VISIBLE_DEVICES":"0",
+//             // "HF_HOME": "/mnt/SV_storage/VFM/huggingface",
+//             // "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"
+//         },
 //         "justMyCode": false,
-//         // "args": [
-//         //     "--run_dir_name", "test",
-//         //     // "--use_big_decoder"
-//         // ]
 //     }
 // ]
 // }

README.md

Lines changed: 17 additions & 2 deletions
@@ -3,21 +3,36 @@
 </p>

 # LLaVA-NeXT: Open Large Multimodal Models
+[![Static Badge](https://img.shields.io/badge/llava_video-paper-green)](http://arxiv.org/abs/2410.02713)
 [![Static Badge](https://img.shields.io/badge/llava_onevision-paper-green)](https://arxiv.org/abs/2408.03326)
 [![llava_next-blog](https://img.shields.io/badge/llava_next-blog-green)](https://llava-vl.github.io/blog/)

 [![llava_onevision-demo](https://img.shields.io/badge/llava_onevision-demo-red)](https://llava-onevision.lmms-lab.com/)
+[![llava_next-video_demo](https://img.shields.io/badge/llava_video-demo-red)](https://huggingface.co/spaces/WildVision/vision-arena)
 [![llava_next-interleave_demo](https://img.shields.io/badge/llava_next-interleave_demo-red)](https://huggingface.co/spaces/lmms-lab/LLaVA-NeXT-Interleave-Demo)
-[![llava_next-video_demo](https://img.shields.io/badge/llava_next-video_demo-red)](https://huggingface.co/spaces/WildVision/vision-arena)
 [![Openbayes Demo](https://img.shields.io/static/v1?label=Demo&message=OpenBayes%E8%B4%9D%E5%BC%8F%E8%AE%A1%E7%AE%97&color=green)](https://openbayes.com/console/public/tutorials/gW0ng9jKXfO)

+[![llava_video-checkpoints](https://img.shields.io/badge/llava_video-checkpoints-blue)](https://huggingface.co/collections/lmms-lab/llava-next-video-661e86f5e8dabc3ff793c944)
 [![llava_onevision-checkpoints](https://img.shields.io/badge/llava_onevision-checkpoints-blue)](https://huggingface.co/collections/lmms-lab/llava-onevision-66a259c3526e15166d6bba37)
 [![llava_next-interleave_checkpoints](https://img.shields.io/badge/llava_next-interleave_checkpoints-blue)](https://huggingface.co/collections/lmms-lab/llava-next-interleave-66763c55c411b340b35873d1)
-[![llava_next-video_checkpoints](https://img.shields.io/badge/llava_next-video_checkpoints-blue)](https://huggingface.co/collections/lmms-lab/llava-next-video-661e86f5e8dabc3ff793c944)
 [![llava_next-image_checkpoints](https://img.shields.io/badge/llava_next-image_checkpoints-blue)](https://huggingface.co/lmms-lab)

 ## Release Notes

+- **[2024/10/04] 🔥 LLaVA-Video** (formerly LLaVA-NeXT-Video) has undergone a major upgrade! We are excited to release **LLaVA-Video-178K**, a high-quality synthetic dataset for video instruction tuning. This dataset includes:
+
+  - 178,510 caption entries
+  - 960,792 open-ended Q&A pairs
+  - 196,198 multiple-choice Q&A items
+
+  Along with this, we’re also releasing the **LLaVA-Video 7B/72B models**, which deliver competitive performance on the latest video benchmarks, including [Video-MME](https://video-mme.github.io/home_page.html#leaderboard), [LongVideoBench](https://longvideobench.github.io/), and [Dream-1K](https://tarsier-vlm.github.io/).
+
+  📄 **Explore more**:
+  - [LLaVA-Video-178K Dataset](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K): Download the dataset.
+  - [LLaVA-Video Models](https://huggingface.co/collections/lmms-lab/llava-video-661e86f5e8dabc3ff793c944): Access model checkpoints.
+  - [Paper](http://arxiv.org/abs/2410.02713): Detailed information about LLaVA-Video.
+  - [LLaVA-Video Documentation](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/docs/LLaVA_Video_1003.md): Guidance on training, inference and evaluation.
+
 - [2024/09/13] 🔥 **🚀 [LLaVA-OneVision-Chat](docs/LLaVA_OneVision_Chat.md)**. The new LLaVA-OV-Chat (7B/72B) significantly improves the chat experience of LLaVA-OV. 📄

 ![](docs/ov_chat_images/chat_results.png)
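
The 2024/10/04 note above points to the LLaVA-Video-178K dataset on the Hugging Face Hub. As an illustrative aside (not part of this commit), the sketch below shows one way to pull it with the `datasets` library; the configuration and split names are not specified in the diff and should be checked on the dataset card.

```python
# Illustrative sketch only (not from this commit); configuration and split names
# must be confirmed on the LLaVA-Video-178K dataset card.
from datasets import get_dataset_config_names, load_dataset

repo_id = "lmms-lab/LLaVA-Video-178K"

# List whichever configurations the dataset declares on the Hub.
configs = get_dataset_config_names(repo_id)
print(configs)

# Load one configuration; this returns a DatasetDict keyed by split.
ds = load_dataset(repo_id, configs[0])
print(ds)
```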

action/generate_description.py

Lines changed: 0 additions & 3 deletions
@@ -79,9 +79,6 @@ def generate_random_mc_conversation(options:list[str], gt_answer_letter, gt_answ
         {"from": "gpt", "value": f"{gt_answer_letter}. {gt_answer_name}"}
     ]

-def generate_avion_mc_conversation():
-    pass
-

 def get_args():
     parser = argparse.ArgumentParser(description="For generating VQA for EPIC-KITCHEN")

docs/LLaVA_OneVision_Chat.md

Lines changed: 15 additions & 5 deletions
@@ -4,8 +4,8 @@

 ### Key Observations:

-- **Impact of Alignment Learning**: By incorporating alignment learning—whether through human feedback or AI-generated feedback—we've observed a notable improvement in LLaVA-OneVision's chat experience. This progress is reflected in the significant performance gains recorded on both the LLaVA-W and WildVision benchmarks.
-- **Success of Self-Generated Feedback**: In LLaVA-OneVision's case, leveraging self-generated feedback data has proven to be a highly effective strategy for enhancing its visual chat capabilities. This approach allows the model to refine its responses autonomously, leading to more natural and coherent conversations.
+- **Impact of Preference Learning**: By incorporating alignment learning—whether through human feedback or AI-generated feedback—we've observed a notable improvement in LLaVA-OneVision's chat experience. This progress is reflected in the significant performance gains recorded on both the LLaVA-W and WildVision benchmarks.
+- **Success of Self-Generated Feedback**: In LLaVA-OneVision's case, leveraging self-generated feedback data has proven to be a highly effective strategy for enhancing its visual chat capabilities. Specifically, [LLaVA-Critic](https://llava-vl.github.io/blog/2024-10-03-llava-critic/) is utilized as a generalist evaluator to generate the scoring feedback for preference learning. This approach allows the model to refine its responses autonomously, leading to more natural and coherent conversations.

 ----

@@ -57,7 +57,7 @@ To optimize LLaVA-OneVision’s in-the-wild conversational abilities, we've empl

 1. **Human Feedback from [LLaVA-RLHF](https://llava-rlhf.github.io/)**: Real-world human input plays a crucial role in guiding the model toward more intuitive and user-friendly responses.

-2. **AI Feedback from LLaVA-OV’s Self-Generated Responses**: Additionally, the AI's own self-generated feedback allows it to continuously improve and adapt, making this a valuable source for iterative learning.
+2. **AI Feedback from LLaVA-OV’s Self-Generated Responses**: Additionally, the AI's own self-generated feedback allows it to continuously improve and adapt, making this a valuable source for iterative learning. [LLaVA-Critic](https://llava-vl.github.io/blog/2024-10-03-llava-critic/) is utilized as a generalist evaluator to generate the scoring feedback for preference learning.

 By experimenting with either of these two forms of feedback, we've been able to significantly enhance LLaVA-OneVision's conversation capabilities, bringing it closer to achieving seamless visual chat interactions in dynamic, real-world environments.

@@ -76,7 +76,7 @@ For each langauge-image prompt in the dataset, we randomly generate `k = 5` cand

 ##### Step 2: Scoring and Acquiring Feedback Data

-Once the candidate responses are generated, we utilize a feedback source (e.g., the Reward Model from LLaVA-RLHF) to score each of them. The reward model is responsible for evaluating the quality of the responses based on relevance, coherence, and appropriateness in relation to the given image-question pair. From the scored responses, we then select:
+Once the candidate responses are generated, we utilize a feedback source (e.g., reward signals from LLaVA-RLHF or from LLaVA-Critic) to score each of them. The reward model is responsible for evaluating the quality of the responses based on relevance, coherence, and appropriateness in relation to the given image-question pair. From the scored responses, we then select:

 - The **best** response (highest score)
 - The **worst** response (lowest score)
@@ -111,7 +111,7 @@ This iterative process is repeated for `N=3` rounds in total, with each round re

 ------

-Stay tuned on how we develop AI feedback for self-improvement LMMs!
+Check out how we develop AI feedback for self-improving LMMs, using [LLaVA-Critic](https://llava-vl.github.io/blog/2024-10-03-llava-critic/) as a generalist evaluator to generate the scoring feedback for preference learning!

 *Contributors to LLaVA-OneVision-Chat: [Tianyi Xiong](https://tyxiong23.github.io/), [Bo Li](https://brianboli.com/), [Dong Guo](https://www.linkedin.com/in/dongguoset/), [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/), [Chunyuan Li](https://scholar.google.com/citations?user=Zd7WmXUAAAAJ)*

@@ -129,6 +129,16 @@ If you find it useful for your research and applications, please cite related pa
   year={2024}
 }

+@article{xiong2024llavacritic,
+  title={LLaVA-Critic: Learning to Evaluate Multimodal Models},
+  author={Xiong, Tianyi and Wang, Xiyao and Guo, Dong and Ye, Qinghao and Fan, Haoqi and Gu, Quanquan and Huang, Heng and Li, Chunyuan},
+  year={2024},
+  eprint={2410.02712},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV},
+  url={https://arxiv.org/abs/2410.02712},
+}
+
 @article{li2024llavaov,
   title={Llava-onevision: Easy visual task transfer},
   author={Li, Bo and Zhang, Yuanhan and Guo, Dong and Zhang, Renrui and Li, Feng and Zhang, Hao and Zhang, Kaichen and Li, Yanwei and Liu, Ziwei and Li, Chunyuan},
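
The Step 2 hunk above describes scoring `k = 5` candidate responses and keeping the highest- and lowest-scoring ones. As an illustrative aside (not code from this commit), the sketch below shows how such scored candidates can be turned into a (chosen, rejected) preference pair for DPO-style training; the `score_fn` callable and record layout are assumptions standing in for whichever reward source (the LLaVA-RLHF reward model or LLaVA-Critic) is used.

```python
# Minimal sketch (not from this commit): turn k scored candidate responses into a
# (chosen, rejected) preference pair, as described in Step 2 above.
from typing import Callable, Dict, List


def build_preference_pair(
    prompt: str,
    candidates: List[str],
    score_fn: Callable[[str, str], float],  # assumed reward source, e.g. a critic/reward-model wrapper
) -> Dict[str, str]:
    scores = [score_fn(prompt, response) for response in candidates]
    best = candidates[scores.index(max(scores))]    # highest-scoring response -> "chosen"
    worst = candidates[scores.index(min(scores))]   # lowest-scoring response -> "rejected"
    return {"prompt": prompt, "chosen": best, "rejected": worst}


# Usage with a dummy scorer (response length as a stand-in for a real reward model):
pair = build_preference_pair(
    "Describe the image.",
    ["A cat.", "A tabby cat sleeping on a windowsill in the sun.", "cat cat cat"],
    score_fn=lambda prompt, response: float(len(response)),
)
print(pair["chosen"], "|", pair["rejected"])
```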

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 7 additions & 3 deletions
@@ -73,11 +73,15 @@

 warnings.filterwarnings("ignore")
 # Load the OneVision model
-pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov"
-model_name = "llava_qwen"
+# pretrained = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
+# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_quick_config"
+# model_base = None
+pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_lora_quick_check"
+model_base = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
+model_name = "lora_llava_qwen"
 device = "cuda"
 device_map = "auto"
-tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation="sdpa")
+tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name, device_map=device_map, attn_implementation="sdpa")

 model.eval()
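For readers without access to the private `/mnt/SV_storage` paths in the hunk above, here is a hedged sketch of the same LoRA-loading pattern with placeholder locations; the `lora_llava_qwen` model name and the call signature come from the diff, while the paths themselves are hypothetical.

```python
# Illustrative sketch of the LoRA-loading pattern the tutorial now follows; paths are placeholders.
from llava.model.builder import load_pretrained_model

lora_checkpoint = "experiments/your_lora_finetune"          # hypothetical LoRA finetune output directory
base_checkpoint = "lmms-lab/llava-onevision-qwen2-0.5b-ov"  # base checkpoint the LoRA adapter was trained from
model_name = "lora_llava_qwen"                              # "lora" + "qwen" routes to the LoRA/Qwen branch in builder.py

tokenizer, model, image_processor, max_length = load_pretrained_model(
    lora_checkpoint, base_checkpoint, model_name,
    device_map="auto", attn_implementation="sdpa",
)
model.eval()
```
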
docs/LLaVA_Video_1003.md

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+# LLaVA Video
+
+## Table of Contents
+
+1. [Model Summary](#model-summary)
+2. [Inference](#inference)
+3. [Training](#training)
+4. [Evaluation](#evaluation-guidance)
+5. [Citation](#citation)
+
+## Model Summary
+
+The LLaVA-Video models are 7/72B-parameter models trained on [LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K) and the [LLaVA-OneVision Dataset](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data), based on the Qwen2 language model with a context window of 32K tokens.
+
+
+## Inference
+
+We provide a simple generation example for using our model. For more details, refer to [GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT).
+
+```python
+# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
+from llava.conversation import conv_templates, SeparatorStyle
+from PIL import Image
+import requests
+import copy
+import torch
+import sys
+import warnings
+from decord import VideoReader, cpu
+import numpy as np
+warnings.filterwarnings("ignore")
+def load_video(video_path, max_frames_num, fps=1, force_sample=False):
+    if max_frames_num == 0:
+        return np.zeros((1, 336, 336, 3))
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    total_frame_num = len(vr)
+    video_time = total_frame_num / vr.get_avg_fps()
+    fps = round(vr.get_avg_fps() / fps)
+    frame_idx = [i for i in range(0, len(vr), fps)]
+    frame_time = [i / fps for i in frame_idx]
+    if len(frame_idx) > max_frames_num or force_sample:
+        sample_fps = max_frames_num
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
+        frame_idx = uniform_sampled_frames.tolist()
+        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
+    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
+    spare_frames = vr.get_batch(frame_idx).asnumpy()
+    # sampled frames, their timestamps, and the total video duration
+    return spare_frames, frame_time, video_time
+pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
+model_name = "llava_qwen"
+device = "cuda"
+device_map = "auto"
+tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add any other thing you want to pass in llava_model_args
+model.eval()
+video_path = "XXXX"
+max_frames_num = 64
+video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
+video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
+video = [video]
+conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
+time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
+question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\nPlease describe this video in detail."
+conv = copy.deepcopy(conv_templates[conv_template])
+conv.append_message(conv.roles[0], question)
+conv.append_message(conv.roles[1], None)
+prompt_question = conv.get_prompt()
+input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+cont = model.generate(
+    input_ids,
+    images=video,
+    modalities=["video"],
+    do_sample=False,
+    temperature=0,
+    max_new_tokens=4096,
+)
+text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
+print(text_outputs)
+```
+
+
+## Training
+
+[[Scripts]](scripts/video/train): Start training models on your single-image/multi-image/video data.
+
+
+## Evaluation Guidance
+
+We use the [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) toolkit to evaluate our models. Ensure you have installed the LLaVA-NeXT model files as per the instructions in the main README.md.
+
+Install lmms-eval:
+
+> pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+
+### Reproducing Evaluation Results
+
+Our models' evaluation results can be fully reproduced using the lmms-eval toolkit. After installing lmms-eval and llava, you can run the evaluation using the commands below.
+
+Note: These commands require flash-attn. If you prefer not to install it, disable flash-attn by adding `attn_implementation=None` to the `--model_args` parameter.
+
+Important: Different torch versions may cause slight variations in results. By default, `lmms-eval` requires the latest torch version, while the `llava` repo pins torch to `2.1.2`; torch `2.1.2` is stable for both `llava` and `lmms-eval`.
+
+### Evaluating LLaVA-Video on multiple datasets
+
+We recommend that developers and researchers evaluate the models on additional datasets to get a comprehensive understanding of their performance in different scenarios, so we provide a comprehensive list of evaluation datasets and welcome contributions that incorporate more evaluation tasks. Please refer to [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) for more details.
+
+```bash
+# video tasks
+accelerate launch --num_processes=8 \
+    -m lmms_eval \
+    --model llava_vid \
+    --model_args pretrained=lmms-lab/LLaVA-Video-7B-Qwen2,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
+    --tasks activitynetqa,videochatgpt,nextqa_mc_test,egoschema,video_dc499,videomme,videomme_w_subtitle,perceptiontest_val_mc \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix llava_vid \
+    --output_path ./logs/
+```
+

llava/model/builder.py

Lines changed: 20 additions & 2 deletions
@@ -24,16 +24,20 @@
 from llava.utils import rank0_print


-def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs):
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", torch_dtype="float16", attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs):
     kwargs["device_map"] = device_map

     if load_8bit:
         kwargs["load_in_8bit"] = True
     elif load_4bit:
         kwargs["load_in_4bit"] = True
         kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
-    else:
+    elif torch_dtype == "float16":
         kwargs["torch_dtype"] = torch.float16
+    elif torch_dtype == "bfloat16":
+        kwargs["torch_dtype"] = torch.bfloat16
+    else:
+        import pdb;pdb.set_trace()

     if customized_config is not None:
         kwargs["config"] = customized_config
@@ -67,6 +71,20 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
             lora_cfg_pretrained = LlavaMistralConfig.from_pretrained(model_path)
             tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
             model = LlavaMistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, attn_implementation=attn_implementation, **kwargs)
+
+        elif "qwen" in model_name.lower() or "quyen" in model_name.lower():
+
+            tokenizer = AutoTokenizer.from_pretrained(model_base)
+            if "moe" in model_name.lower() or "A14B" in model_name.lower():
+                from llava.model.language_model.llava_qwen_moe import LlavaQwenMoeConfig
+                lora_cfg_pretrained = LlavaQwenMoeConfig.from_pretrained(model_path)
+                model = LlavaQwenMoeForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=lora_cfg_pretrained, **kwargs)
+            else:
+                from llava.model.language_model.llava_qwen import LlavaQwenConfig
+                lora_cfg_pretrained = LlavaQwenConfig.from_pretrained(model_path)
+                model = LlavaQwenForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=lora_cfg_pretrained, **kwargs)
+
+
         elif "gemma" in model_name.lower():
             from llava.model.language_model.llava_gemma import LlavaGemmaConfig

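The first hunk above adds a `torch_dtype` argument (defaulting to `"float16"`, with `"bfloat16"` also accepted), and the second adds LoRA loading for Qwen-based checkpoints. Below is a brief usage sketch for the new dtype argument, mirroring the call that docs/LLaVA_Video_1003.md in this same commit already uses.

```python
# Load a merged checkpoint in bfloat16 via the new torch_dtype argument
# (mirrors the call in docs/LLaVA_Video_1003.md; an unrecognized torch_dtype
# value falls through to a pdb breakpoint in this version of builder.py).
from llava.model.builder import load_pretrained_model

tokenizer, model, image_processor, max_length = load_pretrained_model(
    "lmms-lab/LLaVA-Video-7B-Qwen2",  # model_path
    None,                             # model_base: None for a fully merged (non-LoRA) checkpoint
    "llava_qwen",                     # model_name
    torch_dtype="bfloat16",
    device_map="auto",
)
model.eval()
```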