
Commit 789531b

Merge pull request #152 from LLaVA-VL/fix/onevision_tut
Provide the correct video processing logic with decord
2 parents 3fbf54b + 742235b commit 789531b

docs/LLaVA_OneVision_Tutorials.ipynb

Lines changed: 18 additions & 19 deletions
@@ -237,6 +237,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"from operator import attrgetter\n",
 "from llava.model.builder import load_pretrained_model\n",
 "from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token\n",
 "from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX\n",
@@ -249,41 +250,39 @@
 "import requests\n",
 "import copy\n",
 "import warnings\n",
+"from decord import VideoReader, cpu\n",
 "\n",
 "warnings.filterwarnings(\"ignore\")\n",
 "# Load the OneVision model\n",
 "pretrained = \"lmms-lab/llava-onevision-qwen2-0.5b-ov\"\n",
 "model_name = \"llava_qwen\"\n",
 "device = \"cuda\"\n",
 "device_map = \"auto\"\n",
-"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)\n",
+"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation=\"sdpa\")\n",
 "\n",
 "model.eval()\n",
 "\n",
 "\n",
 "# Function to extract frames from video\n",
-"def extract_frames(video_path, num_frames=8):\n",
-"    cap = cv2.VideoCapture(video_path)\n",
-"    frames = []\n",
-"    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
-"    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)\n",
-"\n",
-"    for i in indices:\n",
-"        cap.set(cv2.CAP_PROP_POS_FRAMES, i)\n",
-"        ret, frame = cap.read()\n",
-"        if ret:\n",
-"            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
-"            frames.append(Image.fromarray(frame))\n",
-"\n",
-"    cap.release()\n",
-"    return frames\n",
+"def load_video(video_path, max_frames_num):\n",
+"    if type(video_path) == str:\n",
+"        vr = VideoReader(video_path, ctx=cpu(0))\n",
+"    else:\n",
+"        vr = VideoReader(video_path[0], ctx=cpu(0))\n",
+"    total_frame_num = len(vr)\n",
+"    uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)\n",
+"    frame_idx = uniform_sampled_frames.tolist()\n",
+"    spare_frames = vr.get_batch(frame_idx).asnumpy()\n",
+"    return spare_frames  # (frames, height, width, channels)\n",
 "\n",
 "\n",
 "# Load and process video\n",
 "video_path = \"jobs.mp4\"\n",
-"video_frames = extract_frames(video_path)\n",
-"image_tensors = process_images(video_frames, image_processor, model.config)\n",
-"image_tensors = [_image.to(dtype=torch.float16, device=device) for _image in image_tensors]\n",
+"video_frames = load_video(video_path, 16)\n",
+"print(video_frames.shape)  # (16, 1024, 576, 3)\n",
+"image_tensors = []\n",
+"frames = image_processor.preprocess(video_frames, return_tensors=\"pt\")[\"pixel_values\"].half().cuda()\n",
+"image_tensors.append(frames)\n",
 "\n",
 "# Prepare conversation input\n",
 "conv_template = \"qwen_1_5\"\n",
