feat(demo): allow multiple prompts

borisdayma · borisdayma · commit e58bdf2b1404 · 2022-06-06T23:59:52.000-05:00
diff --git a/tools/inference/inference_pipeline.ipynb b/tools/inference/inference_pipeline.ipynb
@@ -47,9 +47,8 @@
       "outputs": [],
       "source": [
         "# Install required libraries\n",
-        "!pip install -q git+https://github.com/huggingface/transformers.git\n",
-        "!pip install -q git+https://github.com/patil-suraj/vqgan-jax.git\n",
-        "!pip install -q git+https://github.com/borisdayma/dalle-mini.git"
+        "!pip install -q dalle-mini\n",
+        "!pip install -q git+https://github.com/patil-suraj/vqgan-jax.git"
       ]
     },
     {
@@ -250,7 +249,7 @@
         "id": "BQ7fymSPyvF_"
       },
       "source": [
-        "Let's define a text prompt."
+        "Let's define some text prompts."
       ]
     },
     {
@@ -261,9 +260,18 @@
       },
       "outputs": [],
       "source": [
-        "prompt = \"sunset over a lake in the mountains\""
+        "prompts = [\"sunset over a lake in the mountains\", \"the Eiffel tower landing on the moon\"]"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Note: we could also repeat the same prompt several times in the list to generate more images of it per step, which makes inference faster."
+      ],
+      "metadata": {
+        "id": "XlZUG3SCLnGE"
+      }
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -272,7 +280,7 @@
       },
       "outputs": [],
       "source": [
-        "tokenized_prompt = processor([prompt])"
+        "tokenized_prompts = processor(prompts)"
       ]
     },
     {
@@ -281,7 +289,7 @@
         "id": "-CEJBnuJOe5z"
       },
       "source": [
-        "Finally we replicate it onto each device."
+        "Finally we replicate the prompts onto each device."
       ]
     },
     {
@@ -292,7 +300,7 @@
       },
       "outputs": [],
       "source": [
-        "tokenized_prompt = replicate(tokenized_prompt)"
+        "tokenized_prompt = replicate(tokenized_prompts)"
       ]
     },
     {
@@ -314,10 +322,10 @@
       },
       "outputs": [],
       "source": [
-        "# number of predictions\n",
+        "# number of predictions per prompt\n",
         "n_predictions = 8\n",
         "\n",
-        "# We can customize generation parameters\n",
+        "# We can customize generation parameters (see https://huggingface.co/blog/how-to-generate)\n",
         "gen_top_k = None\n",
         "gen_top_p = None\n",
         "temperature = None\n",
@@ -337,7 +345,7 @@
         "from PIL import Image\n",
         "from tqdm.notebook import trange\n",
         "\n",
-        "print(f\"Prompt: {prompt}\\n\")\n",
+        "print(f\"Prompts: {prompts}\\n\")\n",
         "# generate images\n",
         "images = []\n",
         "for i in trange(max(n_predictions // jax.device_count(), 1)):\n",
@@ -361,7 +369,8 @@
         "    for decoded_img in decoded_images:\n",
         "        img = Image.fromarray(np.asarray(decoded_img * 255, dtype=np.uint8))\n",
         "        images.append(img)\n",
-        "        display(img)"
+        "        display(img)\n",
+        "        print()"
       ]
     },
     {
@@ -415,17 +424,32 @@
         "\n",
         "# get clip scores\n",
         "clip_inputs = clip_processor(\n",
-        "    text=[prompt] * jax.device_count(),\n",
+        "    text=prompts * jax.device_count(),\n",
         "    images=images,\n",
         "    return_tensors=\"np\",\n",
         "    padding=\"max_length\",\n",
         "    max_length=77,\n",
         "    truncation=True,\n",
         ").data\n",
         "logits = p_clip(shard(clip_inputs), clip_params)\n",
-        "logits = logits.squeeze().flatten()"
+        "\n",
+        "# organize scores per prompt: row i holds the scores of prompt i and\n",
+        "# column k is the score of images[k * p + i]; reshape (not squeeze) keeps 2 dims when p == 1\n",
+        "p = len(prompts)\n",
+        "logits = np.asarray([logits[:, i::p, i] for i in range(p)]).reshape(p, -1)"
       ]
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "logits.shape"
+      ],
+      "metadata": {
+        "id": "ia0302WtRcEO"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -443,10 +467,12 @@
       },
       "outputs": [],
       "source": [
-        "print(f\"Prompt: {prompt}\\n\")\n",
-        "for idx in logits.argsort()[::-1]:\n",
-        "    display(images[idx])\n",
-        "    print(f\"Score: {logits[idx]:.2f}\\n\")"
+        "for prompt_idx, prompt in enumerate(prompts):\n",
+        "    print(f\"Prompt: {prompt}\\n\")\n",
+        "    for rank in np.argsort(logits[prompt_idx])[::-1]:\n",
+        "        display(images[rank * p + prompt_idx])\n",
+        "        print(f\"Score: {float(logits[prompt_idx][rank]):.2f}\\n\")\n",
+        "    print()"
       ]
     }
   ],
@@ -479,4 +505,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
+}