.. _mixed_precision:

Compile Mixed Precision models with Torch-TensorRT
=====================================================

.. currentmodule:: torch_tensorrt.dynamo

.. automodule:: torch_tensorrt.dynamo
    :members:
    :undoc-members:
    :show-inheritance:

Consider the following PyTorch model, which explicitly casts an intermediate layer to run in FP16.

.. code-block:: python

    class MyModule(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = torch.nn.Linear(10, 10)
            self.linear2 = torch.nn.Linear(10, 30).half()
            self.linear3 = torch.nn.Linear(30, 40)

        def forward(self, x):
            x = self.linear1(x)
            x = x.to(torch.float16)
            x = self.linear2(x)
            x = x.to(torch.float32)
            x = self.linear3(x)
            return x


If we compile the above model with Torch-TensorRT, the layer profiling logs indicate that all of the layers
run in FP32. This is because TensorRT picks the kernels for each layer that deliver the best performance,
rather than honoring the precisions specified in the model.

.. code-block:: python

    inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()]
    mod = MyModule().eval().cuda()
    ep = torch.export.export(mod, tuple(inputs))
    with torch_tensorrt.logging.debug():
        trt_gm = torch_tensorrt.dynamo.compile(
            ep,
            inputs=inputs,
            debug=True,
        )

    # Debug log info
    # Layers:
    # Name: __myl_MulSum_myl0_0, LayerType: kgen, Inputs: [ { Name: __mye116_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }], TacticName: __myl_MulSum_0xfa6c1858aea1b13b03f90165d7149ec6, StreamId: 0, Metadata:
    # Name: __myl_AddResMulSum_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye131_dconst, Dimensions: [10,30], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }, { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_AddResMulSum_0xb3915d7ebfe48be45b6d49083479e12f, StreamId: 0, Metadata:
    # Name: __myl_AddResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye146_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_AddResMulSumAdd_0xcdd0085ad25f5f45ac5fafb72acbffd6, StreamId: 0, Metadata:


In order to respect the types specified by the user in the model (e.g., in this case, running the ``linear2`` layer in FP16), users can enable
the compilation setting ``use_explicit_typing=True``. Compiling with this option produces the following TensorRT logs.

.. note:: If you enable ``use_explicit_typing=True``, only ``torch.float32`` is supported in ``enabled_precisions``.

.. code-block:: python

    inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()]
    mod = MyModule().eval().cuda()
    ep = torch.export.export(mod, tuple(inputs))
    with torch_tensorrt.logging.debug():
        trt_gm = torch_tensorrt.dynamo.compile(
            ep,
            inputs=inputs,
            use_explicit_typing=True,
            debug=True,
        )

    # Debug log info
    # Layers:
    # Name: __myl_MulSumAddCas_myl0_0, LayerType: kgen, Inputs: [ { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }, { Name: __mye112_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], TacticName: __myl_MulSumAddCas_0xacf8f5dd9be2f3e7bb09cdddeac6c936, StreamId: 0, Metadata:
    # Name: __myl_ResMulSumAddCas_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye127_dconst, Dimensions: [10,30], Format/Datatype: Half }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantHalf, Dimensions: [1,30], Format/Datatype: Half }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_ResMulSumAddCas_0x5a3b318b5a1c97b7d5110c0291481337, StreamId: 0, Metadata:
    # Name: __myl_ResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye142_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_ResMulSumAdd_0x3fad91127c640fd6db771aa9cde67db0, StreamId: 0, Metadata:

Now the ``linear2`` layer runs in FP16 as shown in the above logs.
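
As a quick sanity check, you can run the compiled module and compare its output against the original PyTorch model. The snippet below is a minimal sketch, assuming the ``mod``, ``inputs``, and ``trt_gm`` objects from the examples above are still in scope; a looser-than-default tolerance is used because the FP16 ``linear2`` layer introduces small numerical differences relative to the all-FP32 eager model.

.. code-block:: python

    # Minimal sketch: compare the Torch-TensorRT compiled module against the
    # original PyTorch model (assumes `mod`, `inputs`, and `trt_gm` from above).
    import torch

    with torch.no_grad():
        pyt_out = mod(*inputs)     # eager PyTorch reference output
        trt_out = trt_gm(*inputs)  # output of the compiled module

    # The FP16 `linear2` layer introduces small numerical differences, so use a
    # looser tolerance than the torch.allclose defaults.
    print("Max abs diff:", (pyt_out - trt_out).abs().max().item())
    assert torch.allclose(pyt_out, trt_out, rtol=1e-2, atol=1e-2)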