Commit 6ada72e

Merge pull request #180 from LLaVA-VL/patch-add_doc
Update LLaVA OneVision model to lmms-lab/llava-onevision-qwen2-7b-ov
2 parents e988491 + d2c86b4

File tree

1 file changed: 54 additions, 4 deletions
docs/LLaVA_OneVision_Tutorials.ipynb

@@ -233,9 +233,59 @@
    },
    {
     "cell_type": "code",
-    "execution_count": null,
+    "execution_count": 1,
     "metadata": {},
-    "outputs": [],
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/home/tiger/miniconda3/envs/public_llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n",
+       "/home/tiger/miniconda3/envs/public_llava/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+       "  warnings.warn(\n",
+       "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Loaded LLaVA model: lmms-lab/llava-onevision-qwen2-7b-ov\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+       "You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Loading vision tower: google/siglip-so400m-patch14-384\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.07s/it]\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Model Class: LlavaQwenForCausalLM\n",
+       "(16, 1024, 576, 3)\n",
+       "The video features a person standing on a stage, dressed in a black shirt and dark pants. A large hand appears from the background, reaching towards the person's pocket. The text 'Source: Joshua AG' is displayed at the top left corner of the frames, and 'EVAN CARMICHAEL' is shown in the top right corner. The text 'Anyone know what this pocket is for?' appears as the hand continues to reach into the pocket. The person then looks down at their pocket, and the text 'I've always wondered that' appears. The hand finally pulls out a small white device labeled 'iPod Nano'. The person holds up the iPod Nano, and the text 'is the new iPod Nano' appears. The video concludes with a close-up of the person holding the iPod Nano, showing it from different angles.\n"
+      ]
+     }
+    ],
     "source": [
      "from operator import attrgetter\n",
      "from llava.model.builder import load_pretrained_model\n",
@@ -254,7 +304,7 @@
      "\n",
      "warnings.filterwarnings(\"ignore\")\n",
      "# Load the OneVision model\n",
-     "pretrained = \"lmms-lab/llava-onevision-qwen2-0.5b-ov\"\n",
+     "pretrained = \"lmms-lab/llava-onevision-qwen2-7b-ov\"\n",
      "model_name = \"llava_qwen\"\n",
      "device = \"cuda\"\n",
      "device_map = \"auto\"\n",
@@ -327,7 +377,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.2"
+   "version": "3.10.14"
   },
   "vscode": {
    "interpreter": {
