diff --git a/_templates/layout.html b/_templates/layout.html
index 17d95152d3b..b5c86f19096 100644
--- a/_templates/layout.html
+++ b/_templates/layout.html
@@ -211,14 +211,5 @@
-
     {% endblock %}
diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py
index c6c327f8511..82bfaa8f07c 100644
--- a/beginner_source/basics/optimization_tutorial.py
+++ b/beginner_source/basics/optimization_tutorial.py
@@ -76,7 +76,7 @@ def forward(self, x):
 # (`read more `__ about hyperparameter tuning)
 #
 # We define the following hyperparameters for training:
-# - **Number of Epochs** - the number times to iterate over the dataset
+# - **Number of Epochs** - the number of times to iterate over the dataset
 # - **Batch Size** - the number of data samples propagated through the network before the parameters are updated
 # - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.
 #
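For context on the hunk above: in the same tutorial these hyperparameters end up as plain Python variables that feed the data loader and optimizer. A minimal sketch, not part of the patch; the values and the commented-out ``DataLoader``/``SGD`` names are illustrative defaults, not taken from this diff:

.. code-block:: python

    # Hyperparameters controlling the optimization loop (illustrative values).
    learning_rate = 1e-3  # how much to update model parameters at each batch
    batch_size = 64       # samples propagated through the network per update
    epochs = 5            # full passes over the training dataset

    # They are then consumed elsewhere in the tutorial, e.g.:
    # train_dataloader = DataLoader(training_data, batch_size=batch_size)
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)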
diff --git a/en-wordlist.txt b/en-wordlist.txt
index 6a794e7786f..baf75d75ac0 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -698,3 +698,14 @@ TorchServe
 Inductor’s
 onwards
 recompilations
+BiasCorrection
+ELU
+GELU
+NNCF
+OpenVINO
+OpenVINOQuantizer
+PReLU
+Quantizer
+SmoothQuant
+quantizer
+quantizers
\ No newline at end of file
diff --git a/index.rst b/index.rst
index 155b63a006d..e4ab3c1d81e 100644
--- a/index.rst
+++ b/index.rst
@@ -3,13 +3,11 @@ Welcome to PyTorch Tutorials

 **What's new in PyTorch tutorials?**

-* `Dynamic Compilation Control with torch.compiler.set_stance `__
-* `Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile() `__
-* `Understanding the torch.export Flow and Solutions to Common Challenges `__
-* Updated `torch.export Tutorial `__ with automatic dynamic shapes ``Dim.AUTO``
-* Updated `torch.export AOTInductor Tutorial for Python runtime `__
-* Updated `Using User-Defined Triton Kernels with torch.compile `__ with new ``torch.library.triton_op``
-* Updated `Compile Time Caching in torch.compile `__ with new ``Mega-Cache``
+* `Utilizing Torch Function modes with torch.compile `__
+* `Context Parallel Tutorial `__
+* `PyTorch 2 Export Quantization with Intel GPU Backend through Inductor `__
+* `(beta) Explicit horizontal fusion with foreach_map and torch.compile `__
+* Updated `Inductor Windows CPU Tutorial `__

 .. raw:: html
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index a5c1b345e9c..de31af04dc1 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -101,8 +101,11 @@ def forward(self, x):
         return torch.nn.functional.relu(self.lin(x))

 mod = MyModule()
-opt_mod = torch.compile(mod)
-print(opt_mod(t))
+mod.compile()
+print(mod(t))
+# or, equivalently:
+# opt_mod = torch.compile(mod)
+# print(opt_mod(t))

 ######################################################################
 # torch.compile and Nested Calls
@@ -135,8 +138,8 @@ def forward(self, x):
         return torch.nn.functional.relu(self.outer_lin(x))

 outer_mod = OuterModule()
-opt_outer_mod = torch.compile(outer_mod)
-print(opt_outer_mod(t))
+outer_mod.compile()
+print(outer_mod(t))

 ######################################################################
 # We can also disable some functions from being compiled by using
@@ -197,6 +200,12 @@ def outer_function():
 # 4. **Compile Leaf Functions First:** In complex models with multiple nested
 #    functions and modules, start by compiling the leaf functions or modules first.
 #    For more information see `TorchDynamo APIs for fine-grained tracing `__.
+#
+# 5. **Prefer ``mod.compile()`` over ``torch.compile(mod)``:** Avoids the ``_orig_mod.`` prefix being added to ``state_dict`` keys.
+#
+# 6. **Use ``fullgraph=True`` to catch graph breaks:** Helps ensure end-to-end compilation, maximizing speedup
+#    and compatibility with ``torch.export``.
+

 ######################################################################
 # Demonstrating Speedups
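To make tips 5 and 6 in the hunk above concrete, here is a minimal sketch (not part of the patch; the module shapes are arbitrary and the printed key names reflect the default ``torch.compile`` wrapping behavior):

.. code-block:: python

    import torch
    import torch.nn as nn

    # Tip 5: in-place compilation keeps the original state_dict keys.
    mod = nn.Linear(4, 4)
    mod.compile()
    print(list(mod.state_dict().keys()))      # ['weight', 'bias']

    # Wrapping instead returns an OptimizedModule whose keys gain a prefix,
    # which can complicate loading checkpoints saved before compilation.
    wrapped = torch.compile(nn.Linear(4, 4))
    print(list(wrapped.state_dict().keys()))  # ['_orig_mod.weight', '_orig_mod.bias']

    # Tip 6: fullgraph=True raises on graph breaks instead of silently
    # splitting the program into multiple graphs.
    @torch.compile(fullgraph=True)
    def fused(x):
        return torch.relu(x) + 1.0

    print(fused(torch.randn(4)))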
diff --git a/prototype_source/inductor_windows.rst b/prototype_source/inductor_windows.rst
index ae1b454865e..871cc48a33e 100644
--- a/prototype_source/inductor_windows.rst
+++ b/prototype_source/inductor_windows.rst
@@ -22,10 +22,9 @@ Install a Compiler

 C++ compiler is required for TorchInductor optimization, let's take Microsoft Visual C++ (MSVC) as an example.

-1. Download and install `MSVC `_.
+#. Download and install `MSVC `_.

-1. During Installation, select **Workloads** and then **Desktop & Mobile**.
-1. Select a checkmark on **Desktop Development with C++** and install.
+#. During installation, select **Workloads** and then **Desktop & Mobile**. Check **Desktop Development with C++** and install.

 .. image:: ../_static/img/install_msvc.png

@@ -44,18 +43,21 @@ Next, let's configure our environment.
    "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvars64.bat"

 #. Create and activate a virtual environment: ::

+
 #. Install `PyTorch 2.5 `_ or later for CPU Usage. Install PyTorch 2.7 or later refer to `Getting Started on Intel GPU `_ for XPU usage.
+
 #. Here is an example of how to use TorchInductor on Windows:
-.. code-block:: python
-
-   import torch
-   device="cpu" # or "xpu" for XPU
-   def foo(x, y):
-       a = torch.sin(x)
-       b = torch.cos(x)
-       return a + b
-   opt_foo1 = torch.compile(foo)
-   print(opt_foo1(torch.randn(10, 10).to(device), torch.randn(10, 10).to(device)))
+
+   .. code-block:: python
+
+      import torch
+      device="cpu" # or "xpu" for XPU
+      def foo(x, y):
+          a = torch.sin(x)
+          b = torch.cos(x)
+          return a + b
+      opt_foo1 = torch.compile(foo)
+      print(opt_foo1(torch.randn(10, 10).to(device), torch.randn(10, 10).to(device)))

 #. Below is the output of the above example::
diff --git a/prototype_source/openvino_quantizer.rst b/prototype_source/openvino_quantizer.rst
new file mode 100644
index 00000000000..9412c772204
--- /dev/null
+++ b/prototype_source/openvino_quantizer.rst
@@ -0,0 +1,250 @@
+PyTorch 2 Export Quantization for OpenVINO torch.compile Backend
+===========================================================================
+
+**Authors**: `Daniil Lyakhov `_, `Aamir Nazir `_, `Alexander Suslov `_, `Yamini Nimmagadda `_, `Alexander Kozlov `_
+
+Prerequisites
+--------------
+- `PyTorch 2 Export Post Training Quantization `_
+- `How to Write a Quantizer for PyTorch 2 Export Quantization `_
+
+Introduction
+--------------
+
+.. note::
+
+   This is an experimental feature, and the quantization API is subject to change.
+
+This tutorial demonstrates how to use ``OpenVINOQuantizer`` from the `Neural Network Compression Framework (NNCF) `_ in the PyTorch 2 Export Quantization flow to generate a quantized model customized for the `OpenVINO torch.compile backend `_, and explains how to lower the quantized model into the `OpenVINO `_ representation.
+
+``OpenVINOQuantizer`` unlocks the full potential of low-precision OpenVINO kernels thanks to quantizer placement designed specifically for OpenVINO.
+
+The PyTorch 2 export quantization flow uses ``torch.export`` to capture the model into a graph and performs quantization transformations on top of the ATen graph.
+This approach is expected to have significantly higher model coverage, improved flexibility, and a simplified UX.
+The OpenVINO backend compiles the FX Graph generated by TorchDynamo into an optimized OpenVINO model.
+
+The quantization flow mainly includes four steps:
+
+- Step 1: Capture the FX Graph from the eager model based on the `torch export mechanism `_.
+- Step 2: Apply the PyTorch 2 Export Quantization flow with ``OpenVINOQuantizer`` based on the captured FX Graph.
+- Step 3: Lower the quantized model into the OpenVINO representation with the `torch.compile `_ API.
+- Optional step 4: Improve the quantized model metrics via the `quantize_pt2e `_ method.
+
+The high-level architecture of this flow looks like this:
+
+::
+
+    float_model(Python)                        Example Input
+        \                                            /
+         \                                          /
+    —--------------------------------------------------------
+    |                       export                          |
+    —--------------------------------------------------------
+                               |
+                       FX Graph in ATen
+                               |
+                               |      OpenVINOQuantizer
+                               |     /
+    —--------------------------------------------------------
+    |                     prepare_pt2e                       |
+    |                          |                             |
+    |                      Calibrate                         |
+    |                          |                             |
+    |                     convert_pt2e                       |
+    —--------------------------------------------------------
+                               |
+                        Quantized Model
+                               |
+    —--------------------------------------------------------
+    |                  Lower into Inductor                   |
+    —--------------------------------------------------------
+                               |
+                         OpenVINO model
+
+Post Training Quantization
+----------------------------
+
+Now, we will walk you through a step-by-step tutorial on how to use it with the `torchvision resnet18 model `_
+for post-training quantization.
+
+Prerequisite: OpenVINO and NNCF installation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+OpenVINO and NNCF can be easily installed via the `pip distribution `_:
+
+.. code-block:: bash
+
+    pip install -U pip
+    pip install openvino nncf
+
+
+1. Capture FX Graph
+^^^^^^^^^^^^^^^^^^^^^
+
+We will start by performing the necessary imports and capturing the FX Graph from the eager module.
+
+.. code-block:: python
+
+    import copy
+    import openvino.torch
+    import torch
+    import torchvision.models as models
+    from torch.ao.quantization.quantize_pt2e import convert_pt2e
+    from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+
+    import nncf.torch
+
+    # Create the Eager Model
+    model_name = "resnet18"
+    model = models.__dict__[model_name](pretrained=True)
+
+    # Set the model to eval mode
+    model = model.eval()
+
+    # Create the data, using dummy data here as an example
+    traced_bs = 50
+    x = torch.randn(traced_bs, 3, 224, 224)
+    example_inputs = (x,)
+
+    # Capture the FX Graph to be quantized
+    with torch.no_grad(), nncf.torch.disable_patching():
+        exported_model = torch.export.export(model, example_inputs).module()
+
+
+2. Apply Quantization
+^^^^^^^^^^^^^^^^^^^^^^^
+
+After we capture the FX module to be quantized, we will import the ``OpenVINOQuantizer``.
+
+.. code-block:: python
+
+    from nncf.experimental.torch.fx import OpenVINOQuantizer
+
+    quantizer = OpenVINOQuantizer()
+
+``OpenVINOQuantizer`` has several optional parameters that allow tuning the quantization process to get a more accurate model.
+Below is the list of essential parameters and their description:
+
+* ``preset`` - defines the quantization scheme for the model. Two types of presets are available:
+
+    * ``PERFORMANCE`` (default) - defines symmetric quantization of weights and activations.
+
+    * ``MIXED`` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation functions, e.g., ELU, PReLU, GELU, etc.
+
+    .. code-block:: python
+
+        OpenVINOQuantizer(preset=nncf.QuantizationPreset.MIXED)
+
+* ``model_type`` - used to specify the quantization scheme required for a specific type of model. ``Transformer`` is the only supported special quantization scheme; it preserves accuracy after quantization of Transformer models (BERT, Llama, etc.). The default is ``None``, i.e., no specific scheme is defined.
+
+    .. code-block:: python
+
+        OpenVINOQuantizer(model_type=nncf.ModelType.Transformer)
+
+* ``ignored_scope`` - this parameter can be used to exclude some layers from the quantization process to preserve the model accuracy, for example, when you want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter:
+
+    .. code-block:: python
+
+        # Exclude by layer name:
+        names = ['layer_1', 'layer_2', 'layer_3']
+        OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(names=names))
+
+        # Exclude by layer type:
+        types = ['Conv2d', 'Linear']
+        OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(types=types))
+
+        # Exclude by regular expression:
+        regex = '.*layer_.*'
+        OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(patterns=regex))
+
+        # Exclude by subgraphs:
+        # In this case, all nodes along all simple paths in the graph
+        # from input to output nodes will be excluded from the quantization process.
+        subgraph = nncf.Subgraph(inputs=['layer_1', 'layer_2'], outputs=['layer_3'])
+        OpenVINOQuantizer(ignored_scope=nncf.IgnoredScope(subgraphs=[subgraph]))
+
+* ``target_device`` - defines the target device whose characteristics will be taken into account during optimization. The following values are supported: ``ANY`` (default), ``CPU``, ``CPU_SPR``, ``GPU``, and ``NPU``.
+
+    .. code-block:: python
+
+        OpenVINOQuantizer(target_device=nncf.TargetDevice.CPU)
+
+For further details on ``OpenVINOQuantizer``, please see the `documentation `_.
+
+After we import the backend-specific quantizer, we will prepare the model for post-training quantization.
+``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators and inserts observers in appropriate places in the model.
+
+.. code-block:: python
+
+    prepared_model = prepare_pt2e(exported_model, quantizer)
+
+Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model.
+
+.. code-block:: python
+
+    # We use dummy data as an example here
+    prepared_model(*example_inputs)
+
+Finally, we will convert the calibrated model to a quantized model. ``convert_pt2e`` takes a calibrated model and produces a quantized model.
+
+.. code-block:: python
+
+    quantized_model = convert_pt2e(prepared_model, fold_quantize=False)
+
+After these steps, the quantization flow is finished and the quantized model is ready.
+
+
+3. Lower into OpenVINO representation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+After that, the FX Graph can utilize OpenVINO optimizations using the `torch.compile(..., backend="openvino") `_ functionality.
+
+.. code-block:: python
+
+    with torch.no_grad(), nncf.torch.disable_patching():
+        optimized_model = torch.compile(quantized_model, backend="openvino")
+
+        # Running some benchmark
+        optimized_model(*example_inputs)
+
+
+The optimized model uses low-level kernels designed specifically for Intel CPUs.
+This should significantly speed up inference time in comparison with the eager model.
+
+4. Optional: Improve quantized model metrics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+NNCF implements advanced quantization algorithms like `SmoothQuant `_ and `BiasCorrection `_, which help
+to improve the quantized model metrics while minimizing the output discrepancies between the original and compressed models.
+These advanced NNCF algorithms can be accessed via the NNCF ``quantize_pt2e`` API:
+
+.. code-block:: python
+
+    from nncf.experimental.torch.fx import quantize_pt2e
+
+    calibration_loader = torch.utils.data.DataLoader(...)
+
+
+    def transform_fn(data_item):
+        images, _ = data_item
+        return images
+
+
+    calibration_dataset = nncf.Dataset(calibration_loader, transform_fn)
+    quantized_model = quantize_pt2e(
+        exported_model, quantizer, calibration_dataset, smooth_quant=True, fast_bias_correction=False
+    )
+
+
+For further details, please see the `documentation `_
+and a complete `example on Resnet18 quantization `_.
+
+Conclusion
+------------
+
+This tutorial introduced how to use ``torch.compile`` with the OpenVINO backend and the OpenVINO quantizer.
+For more details on NNCF and the NNCF Quantization Flow for PyTorch models, refer to the `NNCF Quantization Guide `_.
+For additional information, check out the `OpenVINO Deployment via torch.compile Documentation `_.
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index a0f7706c61d..5d6a1b5ea9f 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -96,6 +96,13 @@ Prototype features are not available as part of binary distributions like PyPI o
    :link: ../prototype/pt2e_quant_x86_inductor.html
    :tags: Quantization

+.. customcarditem::
+   :header: PyTorch 2 Export Quantization for OpenVINO torch.compile Backend
+   :card_description: Learn how to use PT2 Export Quantization with OpenVINO torch.compile Backend.
+   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: ../prototype/openvino_quantizer.html
+   :tags: Quantization
+
 .. customcarditem::
    :header: PyTorch 2 Export Quantization with Intel GPU Backend through Inductor
    :card_description: Learn how to use PT2 Export Quantization with Intel GPU Backend through Inductor.
diff --git a/recipes_source/foreach_map.py b/recipes_source/foreach_map.py
index b8bf0aa2836..0225a77e279 100644
--- a/recipes_source/foreach_map.py
+++ b/recipes_source/foreach_map.py
@@ -1,6 +1,6 @@
 """
-(beta) Explicit horizontal fusion with foreach_map and torch.compile
-============================================================
+Explicit horizontal fusion with foreach_map and torch.compile
+===============================================================

 **Author:** `Michael Lazos `_
 """
@@ -13,11 +13,17 @@
 # allows conversion of any pointwise op in ``torch`` to a horiztonally fused foreach
 # variant. In this tutorial, we will demonstrate how to implement the Adam optimizer
 # with ``foreach_map`` to generate a fully fused kernel.
-#
 #
 # .. note::
 #
-#    This tutorial requires PyTorch 2.7.0 or later.
+#    This recipe describes a prototype feature. Prototype features are typically
+#    at an early stage for feedback and testing and are subject to change.
+#
+# Prerequisites
+# -------------
+#
+# * PyTorch v2.7.0 or later
+#

 #####################################################################
 # Model Setup
diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py
index 0914edbf558..a4f80354961 100644
--- a/recipes_source/recipes/zeroing_out_gradients.py
+++ b/recipes_source/recipes/zeroing_out_gradients.py
@@ -182,7 +182,7 @@ def forward(self, x):
 # ``optimizer.zero_grad()`` as long as all your model parameters are in
 # that optimizer. Use your best judgment to decide which one to use.
 #
-# Congratulations! You have successfully zeroed out gradients PyTorch.
+# Congratulations! You have successfully zeroed out gradients in PyTorch.
 #
 # Learn More
 # ----------
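As a concrete reference for the gradient-zeroing recipe touched in the last hunk, here is a minimal training-step sketch using only standard PyTorch APIs (the model, data, and hyperparameters are illustrative placeholders, not taken from this patch):

.. code-block:: python

    import torch
    import torch.nn as nn

    model = nn.Linear(10, 2)                                   # illustrative model
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    inputs = torch.randn(16, 10)                               # dummy batch
    targets = torch.randint(0, 2, (16,))

    optimizer.zero_grad()                  # clear gradients left over from the previous step
    loss = criterion(model(inputs), targets)
    loss.backward()                        # accumulate fresh gradients
    optimizer.step()                       # update parameters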