|
21 | 21 | "source": [ |
22 | 22 | "#### Launch the NeMo Framework container as follows: \n", |
23 | 23 | "\n", |
24 | | - "Please set the $TAG to the latest NeMo FW container. Depending on the number of gpus, `--gpus` might need to adjust accordingly:\n", |
| 24 | + "1. Run the following command in the NeMo Framework container in a terminal before starting the jupyter notebook if you are using the container version 25.07 and above.\n", |
| 25 | + "\n", |
| 26 | + "```\n", |
| 27 | + "cd /opt/Export-Deploy\n", |
| 28 | + "uv sync --link-mode symlink --locked --extra trt-onnx $(cat /opt/uv_args.txt)\n", |
| 29 | + "```\n", |
| 30 | + "\n", |
| 31 | + "2. Depending on the number of GPUs, `--gpus` might need to be adjusted accordingly:\n", |
| 32 | + "\n", |
25 | 33 | "```\n", |
26 | | - "docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus '\"device=0,1\"' --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:$TAG\n", |
| 34 | + "docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus device=0 --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.07\n", |
27 | 35 | "```\n", |
28 | 36 | "\n", |
29 | | - "#### Launch Jupyter Notebook as follows: \n", |
| 37 | + "3. Launch Jupyter Notebook as follows:\n", |
30 | 38 | "```\n", |
31 | 39 | "jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''\n", |
32 | 40 | "\n", |
|
78 | 86 | "] # Model specific layers to override the precision to fp32.\n", |
79 | 87 | "override_layernorm_precision_to_fp32 = True # Model specific operation whether to override layernorm precision or not.\n", |
80 | 88 | "profiling_verbosity = \"layer_names_only\"\n", |
81 | | - "export_to_trt = True # Export ONNX model to TensorRT or not.\n", |
82 | | - "# Generate version compatible TensorRT engine or not. This option might provide slower inference time.\n", |
83 | | - "# If you know the TensorRT engine versions match (where the engine was generated versus where it's used), set this to False.\n", |
84 | | - "# Please check here https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/advanced.html#version-compatibility for more information.\n", |
85 | | - "trt_version_compatible = True" |
| 89 | + "export_to_trt = True # Export ONNX model to TensorRT or not." |
86 | 90 | ] |
87 | 91 | }, |
88 | 92 | { |
|
106 | 110 | "metadata": {}, |
107 | 111 | "outputs": [], |
108 | 112 | "source": [ |
109 | | - "from nemo.export.onnx_llm_exporter import OnnxLLMExporter\n", |
110 | | - "\n", |
111 | | - "use_token_type_ids_arg = False\n", |
112 | | - "if \"token_type_ids\" in tokenizer.model_input_names:\n", |
113 | | - " use_token_type_ids_arg = True\n", |
114 | | - "\n", |
115 | | - "if use_token_type_ids_arg:\n", |
116 | | - " input_names = [\n", |
117 | | - " \"input_ids\",\n", |
118 | | - " \"attention_mask\",\n", |
119 | | - " \"token_type_ids\",\n", |
120 | | - " ] # ONNX specific arguments, input names in this case.\n", |
121 | | - " dynamic_axes_input = {\n", |
122 | | - " \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
123 | | - " \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
124 | | - " \"token_type_ids\": {0: \"batch_size\"},\n", |
125 | | - " }\n", |
126 | | - "else:\n", |
127 | | - " input_names = [\"input_ids\", \"attention_mask\"]\n", |
128 | | - " dynamic_axes_input = {\n", |
129 | | - " \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
130 | | - " \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
131 | | - " }\n", |
| 113 | + "from nemo_export.onnx_llm_exporter import OnnxLLMExporter\n", |
| 114 | + "\n", |
| 115 | + "input_names = [\"input_ids\", \"attention_mask\"]\n", |
| 116 | + "dynamic_axes_input = {\n", |
| 117 | + " \"input_ids\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
| 118 | + " \"attention_mask\": {0: \"batch_size\", 1: \"seq_length\"},\n", |
| 119 | + "}\n", |
132 | 120 | "\n", |
133 | 121 | "output_names = [\"logits\"] # ONNX specific arguments, output names in this case.\n", |
134 | 122 | "dynamic_axes_output = {\"logits\": {0: \"batch_size\"}}\n", |
|
149 | 137 | ")" |
150 | 138 | ] |
151 | 139 | }, |
| 140 | + { |
| 141 | + "cell_type": "code", |
| 142 | + "execution_count": null, |
| 143 | + "id": "a20cd28d-505b-4c93-a53c-93568539dcbf", |
| 144 | + "metadata": {}, |
| 145 | + "outputs": [], |
| 146 | + "source": [ |
| 147 | + "prompt = [\"hello\", \"world\"]\n", |
| 148 | + "onnx_exporter.forward(prompt)" |
| 149 | + ] |
| 150 | + }, |
152 | 151 | { |
153 | 152 | "cell_type": "code", |
154 | 153 | "execution_count": null, |
|
157 | 156 | "outputs": [], |
158 | 157 | "source": [ |
159 | 158 | "if export_to_trt:\n", |
160 | | - " if use_token_type_ids_arg:\n", |
161 | | - " input_profiles = [\n", |
162 | | - " {\n", |
163 | | - " \"input_ids\": [[1, 3], [16, 128], [64, 256]],\n", |
164 | | - " \"attention_mask\": [[1, 3], [16, 128], [64, 256]],\n", |
165 | | - " \"token_type_ids\": [[1], [16], [64]],\n", |
166 | | - " }\n", |
167 | | - " ]\n", |
168 | | - " else:\n", |
169 | | - " input_profiles = [\n", |
170 | | - " {\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}\n", |
171 | | - " ]\n", |
172 | | - "\n", |
173 | | - " trt_builder_flags = None\n", |
174 | | - " if trt_version_compatible:\n", |
175 | | - " import tensorrt as trt\n", |
176 | | - "\n", |
177 | | - " trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]\n", |
| 159 | + " import tensorrt as trt\n", |
| 160 | + "\n", |
| 161 | + " input_profiles = [{\"input_ids\": [[1, 3], [16, 128], [64, 256]], \"attention_mask\": [[1, 3], [16, 128], [64, 256]]}]\n", |
178 | 162 | "\n", |
179 | 163 | " onnx_exporter.export_onnx_to_trt(\n", |
180 | 164 | " trt_model_dir=trt_model_path,\n", |
181 | 165 | " profiles=input_profiles,\n", |
182 | 166 | " override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,\n", |
183 | 167 | " override_layers_to_fp32=override_layers_to_fp32,\n", |
184 | 168 | " profiling_verbosity=profiling_verbosity,\n", |
185 | | - " trt_builder_flags=trt_builder_flags,\n", |
| 169 | + " trt_builder_flags=[trt.BuilderFlag.VERSION_COMPATIBLE],\n", |
186 | 170 | " )" |
187 | 171 | ] |
188 | 172 | }, |
189 | 173 | { |
190 | 174 | "cell_type": "code", |
191 | 175 | "execution_count": null, |
192 | | - "id": "051200b7-6eba-44db-b223-059f1dfb60bd", |
| 176 | + "id": "78b10377-fe20-47db-9fee-0ef1f12f13a8", |
193 | 177 | "metadata": {}, |
194 | 178 | "outputs": [], |
195 | | - "source": [ |
196 | | - "prompt = [\"hello\", \"world\"]\n", |
197 | | - "token_type_ids = [2, 4] if use_token_type_ids_arg else None\n", |
198 | | - "\n", |
199 | | - "onnx_exporter.forward(prompt, token_type_ids)" |
200 | | - ] |
| 179 | + "source": [] |
201 | 180 | } |
202 | 181 | ], |
203 | 182 | "metadata": { |
|
0 commit comments