
Commit 819a612

add stream inference code

1 parent 245e3f4 · commit 819a612

6 files changed: +403 -4 lines changed

llava/model/builder.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -24,16 +24,20 @@
 from llava.utils import rank0_print
 
 
-def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs):
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", torch_dtype="float16", attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs):
     kwargs["device_map"] = device_map
 
     if load_8bit:
         kwargs["load_in_8bit"] = True
     elif load_4bit:
         kwargs["load_in_4bit"] = True
         kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
-    else:
+    elif torch_dtype == "float16":
         kwargs["torch_dtype"] = torch.float16
+    elif torch_dtype == "bfloat16":
+        kwargs["torch_dtype"] = torch.bfloat16
+    else:
+        import pdb; pdb.set_trace()
 
     if customized_config is not None:
         kwargs["config"] = customized_config
```

llava/model/llava_arch.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -93,6 +93,7 @@ def initialize_vision_modules(self, model_args, fsdp=None):
         self.config.mm_vision_select_feature = mm_vision_select_feature
         self.config.mm_patch_merge_type = mm_patch_merge_type
 
+
         if not hasattr(self.config, 'add_faster_video'):
             if model_args.add_faster_video:
                 embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
@@ -227,7 +228,7 @@ def add_token_per_grid(self, image_feature):
         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
         image_feature = image_feature.flatten(1, 2).flatten(2, 3)
         image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
-        if self.config.add_faster_video:
+        if getattr(self.config, "add_faster_video", False):
             # import pdb; pdb.set_trace()
             # (3584, 832, 14) -> (3584, 64, 13, 14)
             image_feature = image_feature.view(feature_dim, num_frames, resize_h, -1)
@@ -311,7 +312,7 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio
                 if mm_newline_position == "grid":
                     # Grid-wise
                     image_feature = self.add_token_per_grid(image_feature)
-                    if self.config.add_faster_video:
+                    if getattr(self.config, "add_faster_video", False):
                         faster_video_feature = self.add_token_per_grid(all_faster_video_features[image_idx])
                         # Add a token for each frame
                         concat_slow_fater_token = []
```
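Both call sites now read `add_faster_video` through `getattr` with a `False` default, so configs serialized before the flag existed no longer raise `AttributeError` at load time. The pattern in isolation, with `SimpleNamespace` standing in for the real config object:

```python
from types import SimpleNamespace

old_config = SimpleNamespace()  # config saved before add_faster_video existed

# old_config.add_faster_video          # would raise AttributeError
flag = getattr(old_config, "add_faster_video", False)  # degrades to False
assert flag is False
```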
Lines changed: 166 additions & 0 deletions (new file)

```python
import base64
import select
import sys
import warnings

import cv2
import numpy as np
import openai

warnings.filterwarnings("ignore")

# Global variables for storing video frames and their respective times
video_frames = []
frame_times = []
history_time = 0


client = openai.Client(api_key="EMPTY", base_url="xxx")


def encode_image(frames):
    """Encode a list of frames as base64 JPEG strings."""
    base64_frames = []
    for frame in frames:
        # frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # Convert BGR to RGB
        _, buffer = cv2.imencode(".jpg", frame)
        base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    return base64_frames


# Function to send frames to the server and get a response
def request_server(question, base64_frames):
    messages = [{"role": "user", "content": []}]
    for base64_frame in base64_frames:
        frame_format = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_frame}"},
            "modalities": "video",
        }
        messages[0]["content"].append(frame_format)

    prompt = {"type": "text", "text": question}
    messages[0]["content"].append(prompt)

    video_request = client.chat.completions.create(
        model="llava-onevision-72b-ov",
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    return video_request.choices[0].message.content


class Args:
    """Class to store configuration arguments."""

    def __init__(self, frame_limit=30, force_sample=False):
        self.frame_limit = frame_limit  # Max number of frames to retrieve
        self.force_sample = force_sample  # Whether to force uniform sampling


# Function to capture frames from the camera until the user presses Enter
def load_camera_frames_until_enter(args):
    global history_time  # Maintained across multiple captures

    cap = cv2.VideoCapture(0)  # 0 is the ID for the default camera
    if not cap.isOpened():
        print("Error: Could not access the camera.")
        return None, None, None

    fps = cap.get(cv2.CAP_PROP_FPS) or 30  # Default to 30 FPS if FPS is unavailable
    frame_count = 0

    print("Video capturing started. Press 'Enter' in the console to stop capturing.")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame from camera.")
            break

        frame_count += 1
        cur_frame_time = frame_count / fps

        video_frames.append(frame)
        frame_times.append(cur_frame_time + history_time)

        # Display the frame
        cv2.imshow('Camera Feed', frame)

        # cv2.waitKey keeps the window responsive; 'q' in the window also stops capture
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # Check whether the user pressed 'Enter' in the console
        if sys.stdin in select.select([sys.stdin], [], [], 0)[0]:
            input()  # Consume the "Enter" key press
            print("Video capture stopped.")
            break

    cap.release()
    cv2.destroyAllWindows()  # Close the camera feed window

    history_time = frame_times[-1] if frame_times else history_time

    # Uniformly sample at most args.frame_limit frames
    total_frames = len(video_frames)
    print(f"Total Frames Captured: {total_frames}")

    if total_frames > args.frame_limit:
        sample_indices = np.linspace(0, total_frames - 1, args.frame_limit, dtype=int)
        sampled_frames = [video_frames[i] for i in sample_indices]
        sampled_times = [frame_times[i] for i in sample_indices]
    else:
        sampled_frames = video_frames
        sampled_times = frame_times

    frame_times_str = ",".join([f"{t:.2f}s" for t in sampled_times])
    return np.array(sampled_frames), frame_times_str, history_time


# Function to stream video, process it, and answer a user question
def stream_camera_and_ask_question(args):
    video_frames, frame_times, video_time = load_camera_frames_until_enter(args)

    if video_frames is None:
        print("Error capturing video frames.")
        return

    question = input("Enter your query for the current video: ").strip().lower()

    print("question: ", question)
    image_base64 = encode_image(video_frames)
    response = request_server(question, image_base64)

    print(f"Model's Answer: {response}")
    print(f"Video Duration: 0 to {video_time:.2f} seconds")
    print(f"Frame Times: {frame_times}")

    return response


# Main loop to keep the system running and waiting for user input
def main_loop():
    args = Args(frame_limit=64, force_sample=True)

    while True:
        answer = stream_camera_and_ask_question(args)
        if answer is None:
            print("Exiting the loop.")
            break

        user_input = input("Press 'Enter' to capture again, or 'q' to quit: ").strip().lower()
        if user_input == "q":
            print("Quitting the demo.")
            break

    # Close all OpenCV windows after the user quits
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main_loop()
```
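Assuming an OpenAI-compatible endpoint serving llava-onevision-72b-ov is reachable at the `base_url` configured above, the request path can be smoke-tested without a camera by substituting synthetic frames; this sketch reuses the script's own helpers:

```python
import numpy as np

# Hypothetical smoke test: two gray 480x640 BGR frames stand in for camera input.
fake_frames = [np.full((480, 640, 3), 128, dtype=np.uint8) for _ in range(2)]
frames_b64 = encode_image(fake_frames)
print(request_server("How many frames do you see?", frames_b64))
```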

playground/demo/video_demo.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -153,6 +153,8 @@ def run_inference(args):
     else:
         args.force_sample = False
 
+    # import pdb;pdb.set_trace()
+
     if getattr(model.config, "add_time_instruction", None) is not None:
         args.add_time_instruction = model.config.add_time_instruction
     else:
```
Lines changed: 113 additions & 0 deletions (new file)

```bash
#!/bin/bash

# You should complete the paths for the following attributes:
PROJECT_ROOT="XXXX"
## This can be a yaml file for multiple files or a json file for a single file
DATA_PATH="XXXX"
IMAGE_FOLDER="XXXX"
VIDEO_FOLDER="XXXX"

export PYTHONWARNINGS="ignore"

############### Prepare Envs #################
cd $PROJECT_ROOT
python3 -m pip install --upgrade pip
python3 -m pip install -e ".[train]"

python3 -m pip install ninja
python3 -m pip install flash-attn --no-build-isolation
alias python=python3
############### Show Envs ####################

nvidia-smi
# Take the first port of worker 0
ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' '))
port=${ports[0]}
port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2222}" | awk -F',' '{print $1}')"

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"
echo "master port in cmd: ${port_in_cmd}"

export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
# export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=WARN

PORT=26000
GPUS="0,1,2,3,4,5,6,7"

################ Arnold Jobs ################

LLM_VERSION="Qwen/Qwen2-72B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"

# Stage for video
PROMPT_VERSION="qwen_1_5"
MID_RUN_NAME="llava_next_video-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video"
PREV_STAGE_CHECKPOINT=""
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${MID_RUN_NAME}"

# NUM_GPUS, NNODES, RANK, and ADDR are expected to be provided by the launcher environment.
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
    llava/train/train_mem.py \
    --deepspeed scripts/zero3.json \
    --model_name_or_path $PREV_STAGE_CHECKPOINT \
    --version $PROMPT_VERSION \
    --data_path ${DATA_PATH} \
    --image_folder ${IMAGE_FOLDER} \
    --video_folder ${VIDEO_FOLDER} \
    --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
    --mm_vision_tower_lr=2e-6 \
    --vision_tower ${VISION_MODEL_VERSION} \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --group_by_modality_length True \
    --image_aspect_ratio anyres_max_9 \
    --image_grid_pinpoints "(1x1),...,(6x6)" \
    --mm_patch_merge_type spatial_unpad \
    --bf16 True \
    --run_name $MID_RUN_NAME \
    --output_dir ./work_dirs/$MID_RUN_NAME \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 500 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 12768 \
    --gradient_checkpointing True \
    --dataloader_num_workers 2 \
    --lazy_preprocess True \
    --report_to wandb \
    --torch_compile True \
    --torch_compile_backend "inductor" \
    --dataloader_drop_last True \
    --frames_upbound 32 \
    --mm_newline_position grid \
    --add_time_instruction True \
    --force_sample True \
    --mm_spatial_pool_stride 2
exit 0;
```
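The torchrun line assumes NUM_GPUS, NNODES, RANK, and ADDR (plus the METIS_* and ARNOLD_* variables) are injected by the cluster scheduler; none are defined in the script itself. A hypothetical single-node dry run might export them by hand before invoking the script (the script filename below is a placeholder):

```bash
# Hypothetical single-node defaults; in production these come from the scheduler.
export NUM_GPUS=8 NNODES=1 RANK=0 ADDR=127.0.0.1
export METIS_WORKER_0_PORT=2222 METIS_WORKER_0_HOST=127.0.0.1
export ARNOLD_WORKER_NUM=1 ARNOLD_ID=0 ARNOLD_WORKER_GPU=8
bash train_ov_to_video.sh  # placeholder name for this script
```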
