Skip to content

Commit 8e8c263

Browse files
Update embedding and reranker tutorials (#373)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent cdffc59 commit 8e8c263

File tree

2 files changed

+65
-78
lines changed

2 files changed

+65
-78
lines changed

tutorials/onnx_tensorrt/embedding/llama_embedding.ipynb

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,20 @@
2121
"source": [
2222
"#### Launch the NeMo Framework container as follows:\n",
2323
"\n",
24-
"Depending on the number of gpus, `--gpus` might need to adjust accordingly:\n",
24+
"1. After launching the NeMo Framework container (see the `docker run` command in step 2), run the following command in a terminal inside the container before starting the Jupyter notebook, if you are using container version 25.07 or above.\n",
25+
"\n",
26+
"```\n",
27+
"cd /opt/Export-Deploy\n",
28+
"uv sync --link-mode symlink --locked --extra trt-onnx $(cat /opt/uv_args.txt)\n",
29+
"```\n",
30+
"\n",
31+
"2. Depending on the number of GPUs, `--gpus` might need to be adjusted accordingly:\n",
32+
"\n",
2533
"```\n",
2634
"docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus device=0 --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.07\n",
2735
"```\n",
2836
"\n",
29-
"#### Launch Jupyter Notebook as follows:\n",
37+
"3. Launch Jupyter Notebook as follows:\n",
3038
"```\n",
3139
"jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''\n",
3240
"\n",
@@ -50,7 +58,7 @@
5058
"metadata": {},
5159
"outputs": [],
5260
"source": [
53-
"from nemo.collections.llm.gpt.model import get_llama_bidirectional_hf_model"
61+
"from nemo_export.model_adapters.embedding import get_llama_bidirectional_hf_model"
5462
]
5563
},
5664
{
@@ -86,11 +94,7 @@
8694
"] # Model specific layers to override the precision to fp32.\n",
8795
"override_layernorm_precision_to_fp32 = True # Model specific operation whether to override layernorm precision or not.\n",
8896
"profiling_verbosity = \"layer_names_only\"\n",
89-
"export_to_trt = True # Export ONNX model to TensorRT or not.\n",
90-
"# Generate version compatible TensorRT engine or not. This option might provide slower inference time.\n",
91-
"# If you know the TensorRT engine versions match (where the engine was generated versus where it's used), set this to False.\n",
92-
"# Please check here https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/advanced.html#version-compatibility for more information.\n",
93-
"trt_version_compatible = True"
97+
"export_to_trt = True # Export ONNX model to TensorRT or not."
9498
]
9599
},
96100
{
@@ -190,6 +194,19 @@
190194
")"
191195
]
192196
},
197+
{
198+
"cell_type": "code",
199+
"execution_count": null,
200+
"id": "6671db19-74f7-4e60-9086-88e47da3622d",
201+
"metadata": {},
202+
"outputs": [],
203+
"source": [
204+
"prompt = [\"hello\", \"world\"]\n",
205+
"dimensions = [2, 4] if use_dimension_arg else None\n",
206+
"\n",
207+
"onnx_exporter.forward(prompt, dimensions)"
208+
]
209+
},
193210
{
194211
"cell_type": "code",
195212
"execution_count": null,
@@ -198,6 +215,8 @@
198215
"outputs": [],
199216
"source": [
200217
"if export_to_trt:\n",
218+
" import tensorrt as trt\n",
219+
"\n",
201220
" if use_dimension_arg:\n",
202221
" input_profiles = [\n",
203222
" {\n",
@@ -214,19 +233,13 @@
214233
" }\n",
215234
" ]\n",
216235
"\n",
217-
" trt_builder_flags = None\n",
218-
" if trt_version_compatible:\n",
219-
" import tensorrt as trt\n",
220-
"\n",
221-
" trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]\n",
222-
"\n",
223236
" onnx_exporter.export_onnx_to_trt(\n",
224237
" trt_model_dir=trt_model_path,\n",
225238
" profiles=input_profiles,\n",
226239
" override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,\n",
227240
" override_layers_to_fp32=override_layers_to_fp32,\n",
228241
" profiling_verbosity=profiling_verbosity,\n",
229-
" trt_builder_flags=trt_builder_flags,\n",
242+
" trt_builder_flags=[trt.BuilderFlag.VERSION_COMPATIBLE],\n",
230243
" )"
231244
]
232245
},
@@ -236,12 +249,7 @@
236249
"id": "051200b7-6eba-44db-b223-059f1dfb60bd",
237250
"metadata": {},
238251
"outputs": [],
239-
"source": [
240-
"prompt = [\"hello\", \"world\"]\n",
241-
"dimensions = [2, 4] if use_dimension_arg else None\n",
242-
"\n",
243-
"onnx_exporter.forward(prompt, dimensions)"
244-
]
252+
"source": []
245253
}
246254
],
247255
"metadata": {

tutorials/onnx_tensorrt/reranker/llama_reranker.ipynb

Lines changed: 36 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,20 @@
2121
"source": [
2222
"#### Launch the NeMo Framework container as follows: \n",
2323
"\n",
24-
"Please set the $TAG to the latest NeMo FW container. Depending on the number of gpus, `--gpus` might need to adjust accordingly:\n",
24+
"1. After launching the NeMo Framework container (see the `docker run` command in step 2), run the following command in a terminal inside the container before starting the Jupyter notebook, if you are using container version 25.07 or above.\n",
25+
"\n",
26+
"```\n",
27+
"cd /opt/Export-Deploy\n",
28+
"uv sync --link-mode symlink --locked --extra trt-onnx $(cat /opt/uv_args.txt)\n",
29+
"```\n",
30+
"\n",
31+
"2. Depending on the number of GPUs, `--gpus` might need to be adjusted accordingly:\n",
32+
"\n",
2533
"```\n",
26-
"docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus '\"device=0,1\"' --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:$TAG\n",
34+
"docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus device=0 --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.07\n",
2735
"```\n",
2836
"\n",
29-
"#### Launch Jupyter Notebook as follows: \n",
37+
"3. Launch Jupyter Notebook as follows:\n",
3038
"```\n",
3139
"jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''\n",
3240
"\n",
@@ -78,11 +86,7 @@
7886
"] # Model specific layers to override the precision to fp32.\n",
7987
"override_layernorm_precision_to_fp32 = True # Model specific operation whether to override layernorm precision or not.\n",
8088
"profiling_verbosity = \"layer_names_only\"\n",
81-
"export_to_trt = True # Export ONNX model to TensorRT or not.\n",
82-
"# Generate version compatible TensorRT engine or not. This option might provide slower inference time.\n",
83-
"# If you know the TensorRT engine versions match (where the engine was generated versus where it's used), set this to False.\n",
84-
"# Please check here https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/advanced.html#version-compatibility for more information.\n",
85-
"trt_version_compatible = True"
89+
"export_to_trt = True # Export ONNX model to TensorRT or not."
8690
]
8791
},
8892
{
@@ -106,29 +110,13 @@
106110
"metadata": {},
107111
"outputs": [],
108112
"source": [
109-
"from nemo.export.onnx_llm_exporter import OnnxLLMExporter\n",
110-
"\n",
111-
"use_token_type_ids_arg = False\n",
112-
"if \"token_type_ids\" in tokenizer.model_input_names:\n",
113-
" use_token_type_ids_arg = True\n",
114-
"\n",
115-
"if use_token_type_ids_arg:\n",
116-
" input_names = [\n",
117-
" \"input_ids\",\n",
118-
" \"attention_mask\",\n",
119-
" \"token_type_ids\",\n",
120-
" ] # ONNX specific arguments, input names in this case.\n",
121-
" dynamic_axes_input = {\n",
122-
" \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
123-
" \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
124-
" \"token_type_ids\": {0: \"batch_size\"},\n",
125-
" }\n",
126-
"else:\n",
127-
" input_names = [\"input_ids\", \"attention_mask\"]\n",
128-
" dynamic_axes_input = {\n",
129-
" \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
130-
" \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
131-
" }\n",
113+
"from nemo_export.onnx_llm_exporter import OnnxLLMExporter\n",
114+
"\n",
115+
"input_names = [\"input_ids\", \"attention_mask\"]\n",
116+
"dynamic_axes_input = {\n",
117+
" \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n",
118+
" \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n",
119+
"}\n",
132120
"\n",
133121
"output_names = [\"logits\"] # ONNX specific arguments, output names in this case.\n",
134122
"dynamic_axes_output = {\"logits\": {0: \"batch_size\"}}\n",
@@ -149,6 +137,17 @@
149137
")"
150138
]
151139
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"id": "a20cd28d-505b-4c93-a53c-93568539dcbf",
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"prompt = [\"hello\", \"world\"]\n",
148+
"onnx_exporter.forward(prompt)"
149+
]
150+
},
152151
{
153152
"cell_type": "code",
154153
"execution_count": null,
@@ -157,47 +156,27 @@
157156
"outputs": [],
158157
"source": [
159158
"if export_to_trt:\n",
160-
" if use_token_type_ids_arg:\n",
161-
" input_profiles = [\n",
162-
" {\n",
163-
" \"input_ids\": [[1, 3], [16, 128], [64, 256]],\n",
164-
" \"attention_mask\": [[1, 3], [16, 128], [64, 256]],\n",
165-
" \"token_type_ids\": [[1], [16], [64]],\n",
166-
" }\n",
167-
" ]\n",
168-
" else:\n",
169-
" input_profiles = [\n",
170-
" {\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}\n",
171-
" ]\n",
172-
"\n",
173-
" trt_builder_flags = None\n",
174-
" if trt_version_compatible:\n",
175-
" import tensorrt as trt\n",
176-
"\n",
177-
" trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]\n",
159+
" import tensorrt as trt\n",
160+
"\n",
161+
" input_profiles = [{\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}]\n",
178162
"\n",
179163
" onnx_exporter.export_onnx_to_trt(\n",
180164
" trt_model_dir=trt_model_path,\n",
181165
" profiles=input_profiles,\n",
182166
" override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,\n",
183167
" override_layers_to_fp32=override_layers_to_fp32,\n",
184168
" profiling_verbosity=profiling_verbosity,\n",
185-
" trt_builder_flags=trt_builder_flags,\n",
169+
" trt_builder_flags=[trt.BuilderFlag.VERSION_COMPATIBLE],\n",
186170
" )"
187171
]
188172
},
189173
{
190174
"cell_type": "code",
191175
"execution_count": null,
192-
"id": "051200b7-6eba-44db-b223-059f1dfb60bd",
176+
"id": "78b10377-fe20-47db-9fee-0ef1f12f13a8",
193177
"metadata": {},
194178
"outputs": [],
195-
"source": [
196-
"prompt = [\"hello\", \"world\"]\n",
197-
"token_type_ids = [2, 4] if use_token_type_ids_arg else None\n",
198-
"\n",
199-
"onnx_exporter.forward(prompt, token_type_ids)"
200-
]
179+
"source": []
201180
}
202181
],
203182
"metadata": {

0 commit comments

Comments
 (0)