Update embedding and reranker tutorials (#373)

oyilmaz-nvidia · web-flow · commit 8e8c2633754c · 2025-09-09T14:03:20.000-04:00
Signed-off-by: Onur Yilmaz &lt;oyilmaz@nvidia.com&gt;
diff --git a/tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb b/tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb
@@ -21,12 +21,20 @@
    "source": [
     "#### Launch the NeMo Framework container as follows:\n",
     "\n",
-    "Depending on the number of gpus, `--gpus` might need to adjust accordingly:\n",
+    "1. If you are using container version 25.07 or above, run the following commands in a terminal inside the NeMo Framework container (launched with the `docker run` command in step 2) before starting the Jupyter notebook:\n",
+    "\n",
+    "```\n",
+    "cd /opt/Export-Deploy\n",
+    "uv sync --link-mode symlink --locked --extra trt-onnx $(cat /opt/uv_args.txt)\n",
+    "```\n",
+    "\n",
+    "2. Depending on the number of GPUs, `--gpus` might need to be adjusted accordingly:\n",
+    "\n",
     "```\n",
     "docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus device=0 --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.07\n",
     "```\n",
     "\n",
-    "#### Launch Jupyter Notebook as follows:\n",
+    "3. Launch Jupyter Notebook as follows:\n",
     "```\n",
     "jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''\n",
     "\n",
@@ -50,7 +58,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from nemo.collections.llm.gpt.model import get_llama_bidirectional_hf_model"
+    "from nemo_export.model_adapters.embedding import get_llama_bidirectional_hf_model"
    ]
   },
   {
@@ -86,11 +94,7 @@
     "]  # Model specific layers to override the precision to fp32.\n",
     "override_layernorm_precision_to_fp32 = True  # Model specific operation wheter to override layernorm precision or not.\n",
     "profiling_verbosity = \"layer_names_only\"\n",
-    "export_to_trt = True  # Export ONNX model to TensorRT or not.\n",
-    "# Generate version compatible TensorRT engine or not. This option might provide slower inference time.\n",
-    "# If you know the TensorRT engine versions match (where the engine was generated versus where it's used), set this to False.\n",
-    "# Please check here https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/advanced.html#version-compatibility for more information.\n",
-    "trt_version_compatible = True"
+    "export_to_trt = True  # Export ONNX model to TensorRT or not."
    ]
   },
   {
@@ -190,6 +194,19 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6671db19-74f7-4e60-9086-88e47da3622d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = [\"hello\", \"world\"]\n",
+    "dimensions = [2, 4] if use_dimension_arg else None\n",
+    "\n",
+    "onnx_exporter.forward(prompt, dimensions)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -198,6 +215,8 @@
    "outputs": [],
    "source": [
     "if export_to_trt:\n",
+    "    import tensorrt as trt\n",
+    "\n",
     "    if use_dimension_arg:\n",
     "        input_profiles = [\n",
     "            {\n",
@@ -214,19 +233,13 @@
     "            }\n",
     "        ]\n",
     "\n",
-    "    trt_builder_flags = None\n",
-    "    if trt_version_compatible:\n",
-    "        import tensorrt as trt\n",
-    "\n",
-    "        trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]\n",
-    "\n",
     "    onnx_exporter.export_onnx_to_trt(\n",
     "        trt_model_dir=trt_model_path,\n",
     "        profiles=input_profiles,\n",
     "        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,\n",
     "        override_layers_to_fp32=override_layers_to_fp32,\n",
     "        profiling_verbosity=profiling_verbosity,\n",
-    "        trt_builder_flags=trt_builder_flags,\n",
+    "        trt_builder_flags=[trt.BuilderFlag.VERSION_COMPATIBLE],\n",
     "    )"
    ]
   },
@@ -236,12 +249,7 @@
    "id": "051200b7-6eba-44db-b223-059f1dfb60bd",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "prompt = [\"hello\", \"world\"]\n",
-    "dimensions = [2, 4] if use_dimension_arg else None\n",
-    "\n",
-    "onnx_exporter.forward(prompt, dimensions)"
-   ]
+   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorials/onnx_tensorrt/reranker/llama_reranker.ipynb b/tutorials/onnx_tensorrt/reranker/llama_reranker.ipynb
@@ -21,12 +21,20 @@
    "source": [
     "#### Launch the NeMo Framework container as follows: \n",
     "\n",
-    "Please set the $TAG to the latest NeMo FW container. Depending on the number of gpus, `--gpus` might need to adjust accordingly:\n",
+    "1. If you are using container version 25.07 or above, run the following commands in a terminal inside the NeMo Framework container (launched with the `docker run` command in step 2) before starting the Jupyter notebook:\n",
+    "\n",
+    "```\n",
+    "cd /opt/Export-Deploy\n",
+    "uv sync --link-mode symlink --locked --extra trt-onnx $(cat /opt/uv_args.txt)\n",
+    "```\n",
+    "\n",
+    "2. Depending on the number of GPUs, `--gpus` might need to be adjusted accordingly:\n",
+    "\n",
     "```\n",
-    "docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus '\"device=0,1\"' --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:$TAG\n",
+    "docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus device=0 --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.07\n",
     "```\n",
     "\n",
-    "#### Launch Jupyter Notebook as follows: \n",
+    "3. Launch Jupyter Notebook as follows:\n",
     "```\n",
     "jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''\n",
     "\n",
@@ -78,11 +86,7 @@
     "]  # Model specific layers to override the precision to fp32.\n",
     "override_layernorm_precision_to_fp32 = True  # Model specific operation wheter to override layernorm precision or not.\n",
     "profiling_verbosity = \"layer_names_only\"\n",
-    "export_to_trt = True  # Export ONNX model to TensorRT or not.\n",
-    "# Generate version compatible TensorRT engine or not. This option might provide slower inference time.\n",
-    "# If you know the TensorRT engine versions match (where the engine was generated versus where it's used), set this to False.\n",
-    "# Please check here https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/advanced.html#version-compatibility for more information.\n",
-    "trt_version_compatible = True"
+    "export_to_trt = True  # Export ONNX model to TensorRT or not."
    ]
   },
   {
@@ -106,29 +110,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from nemo.export.onnx_llm_exporter import OnnxLLMExporter\n",
-    "\n",
-    "use_token_type_ids_arg = False\n",
-    "if \"token_type_ids\" in tokenizer.model_input_names:\n",
-    "    use_token_type_ids_arg = True\n",
-    "\n",
-    "if use_token_type_ids_arg:\n",
-    "    input_names = [\n",
-    "        \"input_ids\",\n",
-    "        \"attention_mask\",\n",
-    "        \"token_type_ids\",\n",
-    "    ]  # ONNX specific arguments, input names in this case.\n",
-    "    dynamic_axes_input = {\n",
-    "        \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
-    "        \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
-    "        \"token_type_ids\": {0: \"batch_size\"},\n",
-    "    }\n",
-    "else:\n",
-    "    input_names = [\"input_ids\", \"attention_mask\"]\n",
-    "    dynamic_axes_input = {\n",
-    "        \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
-    "        \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
-    "    }\n",
+    "from nemo_export.onnx_llm_exporter import OnnxLLMExporter\n",
+    "\n",
+    "input_names = [\"input_ids\", \"attention_mask\"]\n",
+    "dynamic_axes_input = {\n",
+    "    \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
+    "    \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
+    "}\n",
     "\n",
     "output_names = [\"logits\"]  # ONNX specific arguments, output names in this case.\n",
     "dynamic_axes_output = {\"logits\": {0: \"batch_size\"}}\n",
@@ -149,6 +137,17 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a20cd28d-505b-4c93-a53c-93568539dcbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = [\"hello\", \"world\"]\n",
+    "onnx_exporter.forward(prompt)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -157,47 +156,27 @@
    "outputs": [],
    "source": [
     "if export_to_trt:\n",
-    "    if use_token_type_ids_arg:\n",
-    "        input_profiles = [\n",
-    "            {\n",
-    "                \"input_ids\": [[1, 3], [16, 128], [64, 256]],\n",
-    "                \"attention_mask\": [[1, 3], [16, 128], [64, 256]],\n",
-    "                \"token_type_ids\": [[1], [16], [64]],\n",
-    "            }\n",
-    "        ]\n",
-    "    else:\n",
-    "        input_profiles = [\n",
-    "            {\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}\n",
-    "        ]\n",
-    "\n",
-    "    trt_builder_flags = None\n",
-    "    if trt_version_compatible:\n",
-    "        import tensorrt as trt\n",
-    "\n",
-    "        trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]\n",
+    "    import tensorrt as trt\n",
+    "\n",
+    "    input_profiles = [{\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}]\n",
     "\n",
     "    onnx_exporter.export_onnx_to_trt(\n",
     "        trt_model_dir=trt_model_path,\n",
     "        profiles=input_profiles,\n",
     "        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,\n",
     "        override_layers_to_fp32=override_layers_to_fp32,\n",
     "        profiling_verbosity=profiling_verbosity,\n",
-    "        trt_builder_flags=trt_builder_flags,\n",
+    "        trt_builder_flags=[trt.BuilderFlag.VERSION_COMPATIBLE],\n",
     "    )"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "051200b7-6eba-44db-b223-059f1dfb60bd",
+   "id": "78b10377-fe20-47db-9fee-0ef1f12f13a8",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "prompt = [\"hello\", \"world\"]\n",
-    "token_type_ids = [2, 4] if use_token_type_ids_arg else None\n",
-    "\n",
-    "onnx_exporter.forward(prompt, token_type_ids)"
-   ]
+   "source": []
   }
  ],
  "metadata": {