:ref:`l-plot-parallelized-reduction`, reduction operations
are sensitive to parallelization.

- We consider a small model including a layer normalization
- followed by a matrix multiplication and we show that replacing
- a kernel by another one may significantly impact the output.
+ Methodology
+ +++++++++++
+
+ We consider a simple model with a LayerNormalization followed by a MatMul.
+ Each operator can be run with :epkg:`onnxruntime` or :epkg:`pytorch`.
+ We compare the four combinations.
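+
+ Schematically, the four combinations can be enumerated as follows
+ (an illustrative sketch, the names are not the ones used below)::
+
+     import itertools
+
+     for ln_impl, mm_impl in itertools.product(["torch", "ort"], repeat=2):
+         print(f"LayerNormalization={ln_impl}, MatMul={mm_impl}")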

The model
+++++++++
"""

import itertools
+ import numpy as np
import pandas
import onnx
import onnx.helper as oh
import onnxruntime
import torch
from onnx_array_api.plotting.graphviz_helper import plot_dot
+ from onnx_diagnostic.doc import rotate_align, save_fig, plot_histogram, title
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import max_diff, string_diff, string_type
from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name, onnx_dtype_to_np_dtype
@@ -79,6 +84,8 @@ def make_feeds(last_dim: int):


def cast_feeds(itype, provider, feeds):
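+     # casts the float32 torch feeds into the requested onnx dtype and returns
+     # the pair (torch feeds, numpy feeds for onnxruntime); presumably returns
+     # None feeds when provider is CUDA and no GPU is available (callers skip then)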
+     ttype = onnx_dtype_to_torch_dtype(itype)
+     np_dtype = onnx_dtype_to_np_dtype(itype)
    np_feeds = {k: v.detach().numpy() for k, v in feeds.items()}
    if provider == "CUDA":
        if not torch.cuda.is_available():
@@ -101,8 +108,6 @@ def cast_feeds(itype, provider, feeds):
baseline = {}
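# stores the outputs of each (itype, provider, engine) run, reused later to
# measure discrepancies against other kernel combinations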

for provider, itype in itertools.product(["CPU", "CUDA"], [TFLOAT, TFLOAT16]):
-     ttype = onnx_dtype_to_torch_dtype(itype)
-     np_dtype = onnx_dtype_to_np_dtype(itype)
    tch_feeds, ort_feeds = cast_feeds(itype, provider, feeds)
    if tch_feeds is None:
        continue
@@ -143,13 +148,34 @@ def cast_feeds(itype, provider, feeds):
# %%
# Visually.

- df["abs"].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+ save_fig(
+     rotate_align(
+         df[["abs"]].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+     ),
+     "plot_layer_norm_discrepancies_1.png",
+ )
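+ # rotate_align presumably rotates and aligns the x tick labels and save_fig
+ # writes the figure to disk, both come from onnx_diagnostic.doc imported above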

# %%
# The discrepancies are significant on CUDA, and higher for float16.
# Let's see which operator is responsible for them,
# *LayerNormalization* or *MatMul*.

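# %%
# A quick way to quantify the gap between two outputs relies on ``max_diff``
# and ``string_diff`` imported above (an illustrative sketch, the variable
# names are made up)::
#
#     diff = max_diff(expected_outputs, got_outputs)
#     print(string_diff(diff))
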
+ # %%
+ # Distribution of the results
+ # +++++++++++++++++++++++++++
+
+ tensor = baseline[TFLOAT16, "CPU", "ort"][0].ravel().astype(np.float32)
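+ # the values are cast to float32, presumably so that the summary statistics
+ # and the histogram are not computed in float16 precision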
+ print(pandas.DataFrame({"expected": tensor}).describe())
+
+ # %%
+ # Histogram.
+
+ save_fig(
+     title(plot_histogram(tensor), "Distribution of the computed results"),
+     "plot_layer_norm_discrepancies_hist.png",
+ )
+
+
# %%
# Where do the discrepancies come from?
# +++++++++++++++++++++++++++++++++++++
@@ -159,19 +185,18 @@ def cast_feeds(itype, provider, feeds):
data = []

for mod, provider, itype in itertools.product(
-     ["ORT-TORCH", "TORCH-ORT"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
+     ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
):
    ttype = onnx_dtype_to_torch_dtype(itype)
    np_dtype = onnx_dtype_to_np_dtype(itype)
    tch_feeds, _ = cast_feeds(itype, provider, feeds)
    if tch_feeds is None:
        continue

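+     # mod encodes the kernel choice: e.g. "ORT-TORCH" runs LayerNormalization
+     # with onnxruntime and MatMul with torch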
+     ker1, ker2 = mod.split("-")
    custom_kernels = (
-         {("", "LayerNormalization"): LayerNormalizationOrt}
-         if mod == "ORT-TORCH"
-         else {("", "MatMul"): MatMulOrt}
-     )
+         {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
+     ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})

    model = get_model(itype)
    print()
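+     # the dict union (|, Python 3.9+) merges the two optional overrides; a
+     # "TORCH" token contributes an empty dict, presumably leaving the default
+     # torch kernel in place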
@@ -200,13 +225,27 @@ def cast_feeds(itype, provider, feeds):
    )

# %%
- df = pandas.DataFrame(data).set_index(["model", "provider", "dtype"])
+ df = pandas.DataFrame(data).set_index(["dtype", "provider", "model"])
df = df.sort_index()
print(df)

# %%
# Visually.

- df[["diff_ort", "diff_torch"]].plot.bar(
-     title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+ save_fig(
+     rotate_align(
+         df[["diff_ort", "diff_torch"]].plot.bar(
+             title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B",
+             figsize=(10, 4),
+         )
+     ),
+     "plot_layer_norm_discrepancies_2.png",
)
+
+ # %%
+ # Conclusion
+ # ++++++++++
+ #
+ # :epkg:`torch` seems able to reproduce the same results when the same
+ # computation is run multiple times. :epkg:`onnxruntime` only achieves that
+ # on CUDA. With float16 on CUDA, LayerNormalization seems to introduce some
+ # discrepancies.
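+
+ # %%
+ # Run-to-run reproducibility can be checked by running the same session twice
+ # on the same feeds (a sketch, assuming ``sess`` is one of the
+ # ``onnxruntime.InferenceSession`` objects created above)::
+ #
+ #     r1 = sess.run(None, ort_feeds)[0]
+ #     r2 = sess.run(None, ort_feeds)[0]
+ #     print(string_diff(max_diff(r1, r2)))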