@torch.compile some tutorials

guilhermeleobas · guilhermeleobas · commit 753be5773ef6 · 2024-07-25T15:24:05.000-03:00
diff --git a/intermediate_source/ensembling.py b/intermediate_source/ensembling.py
@@ -25,6 +25,9 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch._dynamo import config
+config.inline_inbuilt_nn_modules = 1
+import profile_utils
 torch.manual_seed(0)
 
 # Here's a simple MLP
@@ -50,7 +53,7 @@ def forward(self, x):
 # minibatch of size 64. Furthermore, lets say we want to combine the predictions
 # from 10 different models.
 
-device = 'cuda'
+device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'
 num_models = 10
 
 data = torch.randn(100, 64, 1, 28, 28, device=device)
@@ -125,7 +128,12 @@ def fmodel(params, buffers, x):
 
 from torch import vmap
 
-predictions1_vmap = vmap(fmodel)(params, buffers, minibatches)
+@torch.compile  # compile the wrapper so the whole ``vmap`` call runs under torch.compile
+def compute_predictions1(params, buffers, minibatches):
+    return vmap(fmodel)(params, buffers, minibatches)  # default in_dims=0: all three inputs are mapped over dim 0
+
+predictions1_vmap = compute_predictions1(params, buffers, minibatches)
+profile_utils.compute_speedup(compute_predictions1, (params, buffers, minibatches), device)
 
 # verify the ``vmap`` predictions match the
 assert torch.allclose(predictions1_vmap, torch.stack(predictions_diff_minibatch_loop), atol=1e-3, rtol=1e-5)
@@ -137,7 +145,12 @@ def fmodel(params, buffers, x):
 # By using ``None``, we tell ``vmap`` we want the same minibatch to apply for all of
 # the 10 models.
 
-predictions2_vmap = vmap(fmodel, in_dims=(0, 0, None))(params, buffers, minibatch)
+@torch.compile  # compile the wrapper so the whole ``vmap`` call runs under torch.compile
+def compute_predictions2(params, buffers, minibatch):
+    return vmap(fmodel, in_dims=(0, 0, None))(params, buffers, minibatch)  # ``None``: the same minibatch is reused for every model
+
+predictions2_vmap = compute_predictions2(params, buffers, minibatch)
+profile_utils.compute_speedup(compute_predictions2, (params, buffers, minibatch), device)
 
 assert torch.allclose(predictions2_vmap, torch.stack(predictions2), atol=1e-3, rtol=1e-5)
 
diff --git a/intermediate_source/neural_tangent_kernels.py b/intermediate_source/neural_tangent_kernels.py
@@ -22,8 +22,11 @@
 """
 
 import torch
+import profile_utils
 import torch.nn as nn
 from torch.func import functional_call, vmap, vjp, jvp, jacrev
+from torch._dynamo import config
+config.inline_inbuilt_nn_modules = 1
 device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'
 
 class CNN(nn.Module):
@@ -95,6 +98,7 @@ def fnet_single(params, x):
 # The first method consists of doing just that - computing the two Jacobians,
 # and contracting them. Here's how to compute the NTK in the batched case:
 
+@torch.compile
 def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2):
     # Compute J(x1)
     jac1 = vmap(jacrev(fnet_single), (None, 0))(params, x1)
@@ -113,14 +117,16 @@ def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2):
 
 result = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test)
 print(result.shape)
+profile_utils.compute_speedup(empirical_ntk_jacobian_contraction, (fnet_single, params, x_train, x_test), device)
 
 ######################################################################
 # In some cases, you may only want the diagonal or the trace of this quantity,
 # especially if you know beforehand that the network architecture results in an
 # NTK where the non-diagonal elements can be approximated by zero. It's easy to
 # adjust the above function to do that:
 
-def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2, compute='full'):
+@torch.compile
+def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2, compute):
     # Compute J(x1)
     jac1 = vmap(jacrev(fnet_single), (None, 0))(params, x1)
     jac1 = jac1.values()
@@ -148,6 +154,7 @@ def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2, compute='ful
 
 result = empirical_ntk_jacobian_contraction(fnet_single, params, x_train, x_test, 'trace')
 print(result.shape)
+profile_utils.compute_speedup(empirical_ntk_jacobian_contraction, (fnet_single, params, x_train, x_test, 'trace'), device)
 
 ######################################################################
 # The asymptotic time complexity of this method is :math:`N O [FP]` (time to
@@ -189,7 +196,8 @@ def empirical_ntk_jacobian_contraction(fnet_single, params, x1, x2, compute='ful
 #
 # Let's code that up:
 
-def empirical_ntk_ntk_vps(func, params, x1, x2, compute='full'):
+@torch.compile
+def empirical_ntk_ntk_vps(func, params, x1, x2, compute):
     def get_ntk(x1, x2):
         def func_x1(params):
             return func(params, x1)
@@ -226,8 +234,9 @@ def get_ntk_slice(vec):
 
 # Disable TensorFloat-32 for convolutions on Ampere+ GPUs to sacrifice performance in favor of accuracy
 with torch.backends.cudnn.flags(allow_tf32=False):
-    result_from_jacobian_contraction = empirical_ntk_jacobian_contraction(fnet_single, params, x_test, x_train)
-    result_from_ntk_vps = empirical_ntk_ntk_vps(fnet_single, params, x_test, x_train)
+    result_from_jacobian_contraction = empirical_ntk_jacobian_contraction(fnet_single, params, x_test, x_train, 'full')
+    result_from_ntk_vps = empirical_ntk_ntk_vps(fnet_single, params, x_test, x_train, 'full')
+    profile_utils.compute_speedup(empirical_ntk_ntk_vps, (fnet_single, params, x_train, x_test, 'full'), device)
 
 assert torch.allclose(result_from_jacobian_contraction, result_from_ntk_vps, atol=1e-5)
 
diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py
@@ -16,9 +16,12 @@
 
 """
 
+import profile_utils
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch._dynamo import config
+config.inline_inbuilt_nn_modules = 1
 torch.manual_seed(0)
 
 # Here's a simple CNN and loss function:
@@ -52,7 +55,7 @@ def loss_fn(predictions, targets):
 # Let’s generate a batch of dummy data and pretend that we’re working with an MNIST dataset.
 # The dummy images are 28 by 28 and we use a minibatch of size 64.
 
-device = 'cuda'
+device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'
 
 num_models = 10
 batch_size = 64
@@ -159,10 +162,16 @@ def compute_loss(params, buffers, sample, target):
 
 ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))
 
+@torch.compile  # compiled counterpart of the eager ``ft_compute_sample_grad`` defined just above
+def vmap_ft_compute_grad(params, buffers, data, targets):
+    ft_compute_sample_grad_ = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))  # built inside the compiled function; only ``data`` and ``targets`` are mapped (dim 0)
+    return ft_compute_sample_grad_(params, buffers, data, targets)
+
 ######################################################################
 # Finally, let's used our transformed function to compute per-sample-gradients:
 
-ft_per_sample_grads = ft_compute_sample_grad(params, buffers, data, targets)
+ft_per_sample_grads = vmap_ft_compute_grad(params, buffers, data, targets)
+profile_utils.compute_speedup(vmap_ft_compute_grad, (params, buffers, data, targets), device)
 
 ######################################################################
 # we can double check that the results using ``grad`` and ``vmap`` match the
@@ -194,7 +203,7 @@ def get_perf(first, first_descriptor, second, second_descriptor):
     first_res = first.times[0]
 
     gain = (first_res-second_res)/first_res
-    if gain < 0: gain *=-1 
+    if gain < 0: gain *=-1
     final_gain = gain*100
 
     print(f"Performance delta: {final_gain:.4f} percent improvement with {first_descriptor} ")
diff --git a/intermediate_source/profile_utils.py b/intermediate_source/profile_utils.py
@@ -0,0 +1,72 @@
+"""Profiling and benchmarking helpers for the tutorials that import this module."""
+
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.utils.benchmark import Timer, Compare
+
+
+def profile(fn, inputs):
+    """Run ``fn(*inputs)`` once under ``torch.profiler`` and print the summary table.
+
+    CUDA activity is requested only when a CUDA device is available. On a
+    CPU-only machine the table is sorted by self CPU time instead, since no
+    CUDA timings are recorded there.
+
+    Args:
+        fn: Callable to profile.
+        inputs: Sequence of positional arguments, unpacked into ``fn``.
+    """
+    activities = [torch.profiler.ProfilerActivity.CPU]
+    sort_key = "self_cpu_time_total"
+    if torch.cuda.is_available():
+        activities.append(torch.profiler.ProfilerActivity.CUDA)
+        sort_key = "self_cuda_time_total"
+
+    with torch.profiler.profile(activities=activities, with_stack=True) as prof:
+        fn(*inputs)
+
+    print(prof.key_averages().table(sort_by=sort_key))
+
+
+def compute_speedup(fn, inputs, device, times=100, num_threads=(1, 2, 4, 8, 16)):
+    """Time ``fn`` under ``torch.compile``, ``make_fx`` and eager, then print a table.
+
+    Args:
+        fn: Callable to benchmark. If it exposes ``_torchdynamo_orig_callable``,
+            that underlying callable is benchmarked; otherwise ``fn`` is used
+            as is.
+        inputs: Sequence of positional arguments, unpacked into every variant.
+        device: Used only as the ``description`` column of the printed table.
+        times: Number of timed runs passed to ``Timer.timeit``.
+        num_threads: Thread counts to benchmark; one group of three
+            measurements is taken per entry.
+    """
+    # Plain callables have no ``_torchdynamo_orig_callable``; fall back to the
+    # callable itself instead of raising ``AttributeError``.
+    fn = getattr(fn, "_torchdynamo_orig_callable", fn)
+
+    # (sub_label, callable) pairs, timed in this order for every thread count.
+    variants = (
+        ("@torch.compile", torch.compile(fullgraph=True)(fn)),
+        # NOTE(review): ``make_fx(fn)`` returns a tracing wrapper, so this row
+        # appears to time a fresh trace per call rather than a pre-traced
+        # graph - confirm that this is the intended comparison.
+        ("make_fx", make_fx(fn)),
+        ("eager", fn),
+    )
+
+    measurements = []
+    for nt in num_threads:
+        for sub_label, run in variants:
+            timer = Timer(
+                setup="run(*inputs)",  # untimed call made before the measurement
+                stmt="run(*inputs)",
+                globals={"run": run, "inputs": inputs},
+                label=fn.__name__,
+                sub_label=sub_label,
+                description=device,
+                num_threads=nt,
+            )
+            measurements.append(timer.timeit(times))
+
+    Compare(measurements).print()