# .. contents::
#    :local:

######################################################################
# Prerequisites
# -------------
# * PyTorch 2.4 or later
# * Basic understanding of ``torch._export`` and AOT Inductor
# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models <https://pytorch.org/docs/stable/torch.compiler_aot_inductor.html#>`_ tutorial

######################################################################
# What you will learn
# -------------------
# * How to use AOT Inductor for the Python runtime.
# * How to use :func:`torch._export.aot_compile` to generate a shared library.
# * How to run a shared library in the Python runtime using :func:`torch._export.aot_load`.

######################################################################
# Model Compilation
# -----------------
#
# .. note::
#
#      This API also supports :func:`torch.compile` options like ``mode``.
#      For example, if used on a CUDA-enabled device, you can set ``"max_autotune": True``,
#      which leverages Triton-based matrix multiplications and convolutions, and enables CUDA graphs by default.
#
# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is
# explained in `The 0/1 Specialization Problem <https://docs.google.com/document/d/16VPOa3d-Liikf48teAOmxLc92rgvJdfosIy-yoT38Io/edit?fbclid=IwAR3HNwmmexcitV0pbZm_x1a4ykdXZ9th_eJWK-3hBtVgKnrkmemz6Pm5jRQ#heading=h.ez923tomjvyk>`__
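#
# A minimal sketch of the compilation step is shown below; the output file name
# ``resnet18_pt2.so`` and the upper bound ``max=32`` for the batch dimension are
# illustrative choices for this sketch rather than requirements.

import os

import torch
from torchvision.models import ResNet18_Weights, resnet18

model = resnet18(weights=ResNet18_Weights.DEFAULT)
model.eval()

with torch.inference_mode():
    # Write the compiled shared library into the current directory; enable
    # Triton autotuning only when a CUDA device is available.
    aot_compile_options = {
        "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"),
    }
    if torch.cuda.is_available():
        device = "cuda"
        aot_compile_options.update({"max_autotune": True})
    else:
        device = "cpu"

    model = model.to(device=device)
    example_inputs = (torch.randn(2, 3, 224, 224, device=device),)

    # ``x`` is the name of ResNet18's forward argument; ``min=2`` avoids the
    # 0/1 specialization problem discussed above.
    batch_dim = torch.export.Dim("batch", min=2, max=32)
    model_so_path = torch._export.aot_compile(
        model,
        example_inputs,
        dynamic_shapes={"x": {0: batch_dim}},
        options=aot_compile_options,
    )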

######################################################################
#
# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3,
# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime.
# The API follows a structure similar to the :func:`torch.jit.load` API. You need to specify the path
# of the shared library and the device where it should be loaded.
#
# .. note::
#
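# First, load the compiled shared library (``model_so_path``, produced by the
# compilation step above) onto the target device:

model = torch._export.aot_load(model_so_path, device)
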
example_inputs = (torch.randn(1, 3, 224, 224, device=device),)

with torch.inference_mode():
    output = model(example_inputs)
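
# Because the batch dimension was exported as dynamic, the same shared library can also
# serve other batch sizes within the range declared at compile time (up to ``max=32`` in
# the sketch above); for example, a batch of four:
with torch.inference_mode():
    output_batch_4 = model((torch.randn(4, 3, 224, 224, device=device),))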

######################################################################
# When to use AOT Inductor Python Runtime
# ---------------------------------------
#
# One of the requirements for using AOT Inductor is that the model shouldn't have any graph breaks.
# Once this requirement is met, the primary use case for the AOT Inductor Python runtime is
# model deployment using Python.
# There are mainly two reasons why you would use the AOT Inductor Python runtime:
#
# - ``torch._export.aot_compile`` generates a shared library. This is useful for model
#   versioning in deployments and for tracking model performance over time.
# - With :func:`torch.compile` being a JIT compiler, there is a warmup
#   cost associated with the first compilation. Your deployment needs to account for the
#   compilation time taken for the first inference. With AOT Inductor, the compilation is
#   done offline using ``torch._export.aot_compile``. The deployment only needs to load the
#   shared library using ``torch._export.aot_load`` and run inference.
#
# The section below shows the speedup achieved with AOT Inductor for the first inference.
#
# We define a utility function ``timed`` to measure the time taken for inference.
#

import time


def timed(fn):
    # Returns the result of running `fn()` and the time it took for `fn()` to run,
    # in milliseconds. We use CUDA events and synchronization for accurate
    # measurement on CUDA-enabled devices.
    if torch.cuda.is_available():
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        start = time.time()

    result = fn()
    if torch.cuda.is_available():
        end.record()
        torch.cuda.synchronize()
    else:
        end = time.time()

    # Measure the time taken to execute the function in milliseconds
    if torch.cuda.is_available():
        duration = start.elapsed_time(end)
    else:
        duration = (end - start) * 1000

    return result, duration


######################################################################
# Let's measure the time for first inference using AOT Inductor

torch._dynamo.reset()

model = torch._export.aot_load(model_so_path, device)
example_inputs = (torch.randn(1, 3, 224, 224, device=device),)

with torch.inference_mode():
    _, time_taken = timed(lambda: model(example_inputs))
    print(f"Time taken for first inference for AOT Inductor is {time_taken:.2f} ms")

######################################################################
# Let's measure the time for first inference using ``torch.compile``

torch._dynamo.reset()

model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
model.eval()

model = torch.compile(model)
example_inputs = torch.randn(1, 3, 224, 224, device=device)

with torch.inference_mode():
    _, time_taken = timed(lambda: model(example_inputs))
    print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms")

######################################################################
# We see that there is a drastic speedup in first inference time using AOT Inductor compared
# to ``torch.compile``.

######################################################################
# Conclusion
# ----------
#
# In this tutorial, we have learned how to effectively use AOTInductor for the Python runtime by
# compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile``
# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
# generating a shared library and running it within a Python environment, even with dynamic shape
# considerations and device-specific optimizations. We also looked at the advantage of using
# AOT Inductor in model deployments, with regard to the speedup in first inference time.