diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index dae2606eaf7..a25c4494b64 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -2,7 +2,7 @@ # Refer to ./jenkins/build.sh for tutorial build instructions sphinx==5.3.0 -sphinx-gallery==0.11.1 +sphinx-gallery==0.17.1 sphinx-reredirects==0.1.4 sphinx-design==0.4.0 docutils==0.16 @@ -31,7 +31,8 @@ pytorch-lightning torchx torchrl==0.7.2 tensordict==0.7.2 -ax-platform>=0.4.0 +# For ax_multiobjective_nas_tutorial.py +ax-platform>=0.4.0,<0.5.0 nbformat>=5.9.2 datasets transformers @@ -68,5 +69,5 @@ pycocotools semilearn==0.3.2 torchao==0.10.0 segment_anything==1.0 -torchrec==1.1.0; platform_system == "Linux" +torchrec==1.2.0; platform_system == "Linux" fbgemm-gpu==1.2.0; platform_system == "Linux" diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index e02ea6028f7..9d2ec2d8159 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -51,7 +51,6 @@ "intermediate_source/text_to_speech_with_torchaudio", "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed. - "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixe ] def tutorial_source_dirs() -> List[Path]: diff --git a/Makefile b/Makefile index 6b61e36ec5d..7fcf1de6636 100644 --- a/Makefile +++ b/Makefile @@ -61,23 +61,6 @@ download: wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus_v2.zip -P $(DATADIR) unzip $(ZIPOPTS) $(DATADIR)/cornell_movie_dialogs_corpus_v2.zip -d beginner_source/data/ - # Download model for advanced_source/dynamic_quantization_tutorial.py - wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth -P $(DATADIR) - cp $(DATADIR)/word_language_model_quantize.pth advanced_source/data/word_language_model_quantize.pth - - # Download data for advanced_source/dynamic_quantization_tutorial.py - wget -nv -N https://s3.amazonaws.com/pytorch-tutorial-assets/wikitext-2.zip -P $(DATADIR) - unzip $(ZIPOPTS) $(DATADIR)/wikitext-2.zip -d advanced_source/data/ - - # Download model for advanced_source/static_quantization_tutorial.py - wget -nv -N https://download.pytorch.org/models/mobilenet_v2-b0353104.pth -P $(DATADIR) - cp $(DATADIR)/mobilenet_v2-b0353104.pth advanced_source/data/mobilenet_pretrained_float.pth - - - # Download model for prototype_source/graph_mode_static_quantization_tutorial.py - wget -nv -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR) - cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth - # Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR) unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/ diff --git a/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png b/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png deleted file mode 100644 index 34bbf8c7bdf..00000000000 Binary files a/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png and /dev/null differ diff --git a/conf.py b/conf.py index 7401bdcea2a..1011fca60c0 100644 --- a/conf.py +++ b/conf.py @@ -85,7 +85,10 @@ def wrapper(*args, **kwargs): raise RuntimeError(f"Error in subprocess: {result}") return wrapper -sphinx_gallery.gen_rst.generate_file_rst = 
call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst) +# Windows does not support multiprocessing with fork and mac has issues with +# fork so we do not monkey patch sphinx gallery to run in subprocesses. +if os.getenv("TUTORIALS_ISOLATE_BUILD", "1") == "1" and not sys.platform.startswith("win") and not sys.platform == "darwin": + sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst) try: import torchvision diff --git a/custom_directives.py b/custom_directives.py index 388aa262e6e..d989723f190 100644 --- a/custom_directives.py +++ b/custom_directives.py @@ -88,7 +88,8 @@ def run(self): if 'intro' in self.options: intro = self.options['intro'][:195] + '...' else: - _, blocks = sphinx_gallery.gen_rst.split_code_and_text_blocks(abs_fname) + block_parser = sphinx_gallery.gen_rst.BlockParser(abs_fname, {"filetype_parsers": {}}) + _, blocks, _ = block_parser.split_code_and_text_blocks(abs_fname) intro, _ = sphinx_gallery.gen_rst.extract_intro_and_title(abs_fname, blocks[0][1]) thumbnail_rst = '' diff --git a/intermediate_source/torchrec_intro_tutorial.py b/intermediate_source/torchrec_intro_tutorial.py index 5f9464decd1..81b7663c110 100644 --- a/intermediate_source/torchrec_intro_tutorial.py +++ b/intermediate_source/torchrec_intro_tutorial.py @@ -2,7 +2,7 @@ Introduction to TorchRec ================================== -**TorchRec** is a PyTorch library tailored for building scalable and efficient recommendation systems using embeddings. +**TorchRec** is a PyTorch library tailored for building scalable and efficient recommendation systems using embeddings. This tutorial guides you through the installation process, introduces the concept of embeddings, and highlights their importance in recommendation systems. It offers practical demonstrations on implementing embeddings with PyTorch and TorchRec, focusing on handling large embedding tables through distributed training and advanced optimizations. @@ -11,7 +11,7 @@ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn :class-card: card-prerequisites - + * Fundamentals of embeddings and their role in recommendation systems * How to set up TorchRec to manage and implement embeddings in PyTorch environments * Explore advanced techniques for distributing large embedding tables across multiple GPUs @@ -30,7 +30,7 @@ # Install Dependencies # ^^^^^^^^^^^^^^^^^^^^ # -# Before running this tutorial in Google Colab or other environment, install the +# Before running this tutorial in Google Colab, make sure to install the # following dependencies: # # .. code-block:: sh @@ -40,54 +40,53 @@ # !pip3 install torchmetrics==1.0.3 # !pip3 install torchrec --index-url https://download.pytorch.org/whl/cu121 # -# .. note:: +# .. note:: # If you are running this in Google Colab, make sure to switch to a GPU runtime type. # For more information, # see `Enabling CUDA `__ # - ###################################################################### # Embeddings # ~~~~~~~~~~ -# +# # When building recommendation systems, categorical features typically # have massive cardinality, posts, users, ads, and so on. -# +# # In order to represent these entities and model these relationships, # **embeddings** are used. In machine learning, **embeddings are a vectors # of real numbers in a high-dimensional space used to represent meaning in # complex data like words, images, or users**. 
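#
# For a concrete (purely illustrative) sketch of that idea: an entity ID can be
# mapped to a small vector of real numbers by indexing into a lookup table. The
# table size and dimension below are made up for demonstration only.

import torch

toy_table = torch.randn(10, 4)        # 10 entities, 4-dimensional embedding each
user_id = 3
user_embedding = toy_table[user_id]   # the 4-dimensional vector representing entity 3
print(user_embedding)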
-# +# # Embeddings in RecSys # ~~~~~~~~~~~~~~~~~~~~ -# +# # Now you might wonder, how are these embeddings generated in the first # place? Well, embeddings are represented as individual rows in an # **Embedding Table**, also referred to as embedding weights. The reason # for this is that embeddings or embedding table weights are trained just # like all of the other weights of the model via gradient descent! -# +# # Embedding tables are simply a large matrix for storing embeddings, with # two dimensions (B, N), where: -# +# # * B is the number of embeddings stored by the table # * N is the number of dimensions per embedding (N-dimensional embedding). -# +# # The inputs to embedding tables represent embedding lookups to retrieve # the embedding for a specific index or row. In recommendation systems, such # as those used in many large systems, unique IDs are not only used for # specific users, but also across entities like posts and ads to serve as # lookup indices to respective embedding tables! -# +# # Embeddings are trained in RecSys through the following process: # # * **Input/lookup indices are fed into the model, as unique IDs**. IDs are # hashed to the total size of the embedding table to prevent issues when # the ID > number of rows -# -# * Embeddings are then retrieved and **pooled, such as taking the sum or +# +# * Embeddings are then retrieved and **pooled, such as taking the sum or # mean of the embeddings**. This is required as there can be a variable number of # embeddings per example while the model expects consistent shapes. # @@ -100,17 +99,17 @@ # for an example, and **all weights of the model are updated through # gradient descent and backpropagation, including the embedding weights** # that were associated with the example. -# +# # These embeddings are crucial for representing categorical features, such # as users, posts, and ads, in order to capture relationships and make # good recommendations. The `Deep learning recommendation # model `__ (DLRM) paper talks more # about the technical details of using embedding tables in RecSys. -# +# # This tutorial introduces the concept of embeddings, showcase # TorchRec specific modules and data types, and depict how distributed training # works with TorchRec. -# +# import torch @@ -118,19 +117,19 @@ ###################################################################### # Embeddings in PyTorch # --------------------- -# -# In PyTorch, we have the following types of embeddings: +# +# In PyTorch, we have the following types of embeddings: # # * :class:`torch.nn.Embedding`: An embedding table where forward pass returns the # embeddings themselves as is. -# +# # * :class:`torch.nn.EmbeddingBag`: Embedding table where forward pass returns # embeddings that are then pooled, for example, sum or mean, otherwise known # as **Pooled Embeddings**. -# +# # In this section, we will go over a very brief introduction to performing -# embedding lookups by passing in indices into the table. -# +# embedding lookups by passing in indices into the table. +# num_embeddings, embedding_dim = 10, 4 @@ -181,13 +180,13 @@ # systems! These tables represent entities and their relationships. For # example, the relationship between a given user and the pages and posts # they have liked. -# +# ###################################################################### # TorchRec Features Overview # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # In the section above we've learned how to use embedding tables, one of the foundations of # modern recommendation systems! 
These tables represent entities and # relationships, such as users, pages, posts, etc. Given that these @@ -197,26 +196,25 @@ # collisions, these tables can become quite massive (think about the number of ads # for example). In fact, these tables can become so massive that they # won't be able to fit on 1 GPU, even with 80G of memory. -# +# # In order to train models with massive embedding tables, sharding these # tables across GPUs is required, which then introduces a whole new set of # problems and opportunities in parallelism and optimization. Luckily, we have -# the TorchRec library that has encountered, consolidated, and addressed +# the TorchRec library `__ that has encountered, consolidated, and addressed # many of these concerns. TorchRec serves as a **library that provides # primitives for large scale distributed embeddings**. -# +# # Next, we will explore the major features of the TorchRec # library. We will start with ``torch.nn.Embedding`` and will extend that to # custom TorchRec modules, explore distributed training environment with # generating a sharding plan for embeddings, look at inherent TorchRec # optimizations, and extend the model to be ready for inference in C++. # Below is a quick outline of what this section consists of: -# +# # * TorchRec Modules and Data Types # * Distributed Training, Sharding, and Optimizations -# * Inference -# -# Let's begin with importing TorchRec: +# +# Let's begin with importing TorchRec: import torchrec @@ -224,26 +222,26 @@ ###################################################################### # TorchRec Modules and Data Types # ---------------------------------- -# +# # This section goes over TorchRec Modules and data types including such # entities as ``EmbeddingCollection`` and ``EmbeddingBagCollection``, # ``JaggedTensor``, ``KeyedJaggedTensor``, ``KeyedTensor`` and more. # # From ``EmbeddingBag`` to ``EmbeddingBagCollection`` # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # We have already explored :class:`torch.nn.Embedding` and :class:`torch.nn.EmbeddingBag`. # TorchRec extends these modules by creating collections of embeddings, in # other words modules that can have multiple embedding tables, with # ``EmbeddingCollection`` and ``EmbeddingBagCollection`` # We will use ``EmbeddingBagCollection`` to represent a group of # embedding bags. -# +# # In the example code below, we create an ``EmbeddingBagCollection`` (EBC) -# with two embedding bags, 1 representing **products** and 1 representing **users**. +# with two embedding bags, 1 representing **products** and 1 representing **users**. # Each table, ``product_table`` and ``user_table``, is represented by a 64 dimension # embedding of size 4096. -# +# ebc = torchrec.EmbeddingBagCollection( device="cpu", @@ -261,8 +259,8 @@ num_embeddings=4096, feature_names=["user"], pooling=torchrec.PoolingType.SUM, - ) - ] + ), + ], ) print(ebc.embedding_bags) @@ -270,7 +268,7 @@ ###################################################################### # Let’s inspect the forward method for ``EmbeddingBagCollection`` and the # module’s inputs and outputs: -# +# import inspect @@ -282,13 +280,13 @@ ###################################################################### # TorchRec Input/Output Data Types # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # TorchRec has distinct data types for input and output of its modules: # ``JaggedTensor``, ``KeyedJaggedTensor``, and ``KeyedTensor``. Now you # might ask, why create new data types to represent sparse features? 
To # answer that question, we must understand how sparse features are # represented in code. -# +# # Sparse features are otherwise known as ``id_list_feature`` and # ``id_score_list_feature``, and are the **IDs** that will be used as # indices to an embedding table to retrieve the embedding for that ID. To @@ -299,11 +297,11 @@ # these features in code is that in each input example, **the number of # IDs is variable**. One day a user might have interacted with only one ad # while the next day they interact with three. -# +# # A simple representation is shown below, where we have a ``lengths`` # tensor denoting how many indices are in an example for a batch and a # ``values`` tensor containing the indices themselves. -# +# # Batch Size 2 # 1 ID in example 1, 2 IDs in example 2 @@ -315,7 +313,7 @@ ###################################################################### # Next, let's look at the offsets as well as what is contained in each batch -# +# # Lengths can be converted to offsets for easy indexing of values id_list_feature_offsets = torch.cumsum(id_list_feature_lengths, dim=0) @@ -394,16 +392,16 @@ # Congrats! You now understand TorchRec modules and data types. # Give yourself a pat on the back for making it this far. Next, we will # learn about distributed training and sharding. -# +# ###################################################################### # Distributed Training and Sharding # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Now that we have a grasp on TorchRec modules and data types, it's time # to take it to the next level. -# +# # Remember, the main purpose of TorchRec is to provide primitives for # distributed embeddings. So far, we've only worked with embedding tables # on a single device. This has been possible given how small the embedding tables @@ -411,18 +409,18 @@ # Embedding tables often get massive, where one table can't fit on a single # GPU, creating the requirement for multiple devices and a distributed # environment. -# +# # In this section, we will explore setting up a distributed environment, # exactly how actual production training is done, and explore sharding # embedding tables, all with TorchRec. -# +# # **This section will also only use 1 GPU, though it will be treated in a # distributed fashion. This is only a limitation for training, as training # has a process per GPU. Inference does not run into this requirement** -# +# # In the example code below, we set up our PyTorch distributed environment. # -# .. warning:: +# .. warning:: # If you are running this in Google Colab, you can only call this cell once, # calling it again will cause an error as you can only initialize the process # group once. @@ -450,46 +448,46 @@ ###################################################################### # Distributed Embeddings # ~~~~~~~~~~~~~~~~~~~~~~ -# +# # We have already worked with the main TorchRec module: # ``EmbeddingBagCollection``. We have examined how it works along with how # data is represented in TorchRec. However, we have not yet explored one # of the main parts of TorchRec, which is **distributed embeddings**. -# +# # GPUs are the most popular choice for ML workloads by far today, as they # are able to do magnitudes more floating point operations/s # (`FLOPs `__) than CPU. However, # GPUs come with the limitation of scarce fast memory (HBM which is # analogous to RAM for CPU), typically, ~10s of GBs. 
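#
# A quick back-of-the-envelope calculation shows why that budget is easy to
# exceed. The row count and dimension below are hypothetical, chosen only to
# illustrate the scale:

num_rows = 100_000_000        # e.g. 100 million users or items
embedding_dim = 128
bytes_per_fp32 = 4
table_size_gb = num_rows * embedding_dim * bytes_per_fp32 / 1e9
print(f"One FP32 table: ~{table_size_gb:.1f} GB")   # ~51.2 GB, more than a single GPU's HBM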
-# +# # A RecSys model can contain embedding tables that far exceed the memory # limit for 1 GPU, hence the need for distribution of the embedding tables # across multiple GPUs, otherwise known as **model parallel**. On the # other hand, **data parallel** is where the entire model is replicated on # each GPU, which each GPU taking in a distinct batch of data for # training, syncing gradients on the backwards pass. -# +# # Parts of the model that **require less compute but more memory # (embeddings) are distributed with model parallel** while parts that # **require more compute and less memory (dense layers, MLP, etc.) are # distributed with data parallel**. -# +# # Sharding # ~~~~~~~~ -# +# # In order to distribute an embedding table, we split up the embedding # table into parts and place those parts onto different devices, also # known as “sharding”. -# +# # There are many ways to shard embedding tables. The most common ways are: # # * Table-Wise: the table is placed entirely onto one device # * Column-Wise: columns of embedding tables are sharded # * Row-Wise: rows of embedding tables are sharded -# +# # Sharded Modules # ~~~~~~~~~~~~~~~ -# +# # While all of this seems like a lot to deal with and implement, you're in # luck. **TorchRec provides all the primitives for easy distributed # training and inference**! In fact, TorchRec modules have two corresponding @@ -498,23 +496,23 @@ # # * **The module sharder**: This class exposes a ``shard`` API # that handles sharding a TorchRec Module, producing a sharded module. -# * For ``EmbeddingBagCollection``, the sharder is `EmbeddingBagCollectionSharder `__ +# * For ``EmbeddingBagCollection``, the sharder is `EmbeddingBagCollectionSharder ` # * **Sharded module**: This class is a sharded variant of a TorchRec module. # It has the same input/output as a the regular TorchRec module, but much # more optimized and works in a distributed environment. -# * For ``EmbeddingBagCollection``, the sharded variant is `ShardedEmbeddingBagCollection `__ -# +# * For ``EmbeddingBagCollection``, the sharded variant is `ShardedEmbeddingBagCollection` +# # Every TorchRec module has an unsharded and sharded variant. -# +# # * The unsharded version is meant to be prototyped and experimented with. # * The sharded version is meant to be used in a distributed environment for # distributed training and inference. -# +# # The sharded versions of TorchRec modules, for example # ``EmbeddingBagCollection``, will handle everything that is needed for Model # Parallelism, such as communication between GPUs for distributing # embeddings to the correct GPUs. -# +# # Refresher of our ``EmbeddingBagCollection`` module ebc @@ -535,10 +533,10 @@ ###################################################################### # Planner # ~~~~~~~ -# +# # Before we can show how sharding works, we must know about the # **planner**, which helps us determine the best sharding configuration. -# +# # Given a number of embedding tables and a number of ranks, there are many # different sharding configurations that are possible. For example, given # 2 embedding tables and 2 GPUs, you can: @@ -546,34 +544,34 @@ # * Place 1 table on each GPU # * Place both tables on a single GPU and no tables on the other # * Place certain rows and columns on each GPU -# +# # Given all of these possibilities, we typically want a sharding # configuration that is optimal for performance. -# +# # That is where the planner comes in. 
The planner is able to determine # given the number of embedding tables and the number of GPUs, what is the optimal # configuration. Turns out, this is incredibly difficult to do manually, # with tons of factors that engineers have to consider to ensure an # optimal sharding plan. Luckily, TorchRec provides an auto planner when -# the planner is used. -# +# the planner is used. +# # The TorchRec planner: -# +# # * Assesses memory constraints of hardware # * Estimates compute based on memory fetches as embedding lookups # * Addresses data specific factors # * Considers other hardware specifics like bandwidth to generate an optimal sharding plan -# +# # In order to take into consideration all these variables, The TorchRec # planner can take in `various amounts of data for embedding tables, # constraints, hardware information, and # topology `__ # to aid in generating the optimal sharding plan for a model, which is # routinely provided across stacks. -# +# # To learn more about sharding, see our `sharding # tutorial `__. -# +# # In our case, 1 GPU and compute on CUDA device planner = EmbeddingShardingPlanner( @@ -592,16 +590,16 @@ ###################################################################### # Planner Result # ~~~~~~~~~~~~~~ -# +# # As you can see above, when running the planner there is quite a bit of output. # We can see a lot of stats being calculated along with where our # tables end up being placed. -# +# # The result of running the planner is a static plan, which can be reused # for sharding! This allows sharding to be static for production models # instead of determining a new sharding plan everytime. Below, we use the # sharding plan to finally generate our ``ShardedEmbeddingBagCollection``. -# +# # The static plan that was generated plan @@ -617,15 +615,15 @@ ###################################################################### # GPU Training with ``LazyAwaitable`` # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Remember that TorchRec is a highly optimized library for distributed # embeddings. A concept that TorchRec introduces to enable higher # performance for training on GPU is a -# `LazyAwaitable `__. +# `LazyAwaitable `. # You will see ``LazyAwaitable`` types as outputs of various sharded # TorchRec modules. All a ``LazyAwaitable`` type does is delay calculating some # result as long as possible, and it does it by acting like an async type. -# +# from typing import List @@ -669,19 +667,19 @@ def _wait_impl(self) -> torch.Tensor: ###################################################################### # Anatomy of Sharded TorchRec modules # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # We have now successfully sharded an ``EmbeddingBagCollection`` given a # sharding plan that we generated! The sharded module has common APIs from # TorchRec which abstract away distributed communication/compute amongst # multiple GPUs. In fact, these APIs are highly optimized for performance # in training and inference. **Below are the three common APIs for # distributed training/inference** that are provided by TorchRec: -# +# # * ``input_dist``: Handles distributing inputs from GPU to GPU. # * ``lookups``: Does the actual embedding lookup in an optimized, # batched manner using FBGEMM TBE (more on this later). # * ``output_dist``: Handles distributing outputs from GPU to GPU. 
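#
# Putting those three stages together, a forward pass through the sharded module
# returns a ``LazyAwaitable`` that is resolved only when the result is needed.
# This is a minimal sketch, assuming the ``sharded_ebc`` and ``kjt`` objects
# created earlier in this tutorial and a CUDA device:

kjt = kjt.to("cuda")
awaitable = sharded_ebc(kjt)            # input_dist -> lookups -> output_dist, returns a LazyAwaitable
pooled_embeddings = awaitable.wait()    # materialize the KeyedTensor result
print(pooled_embeddings.keys())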
-# +# # The distribution of inputs and outputs is done through `NCCL # Collectives `__, # namely @@ -690,13 +688,13 @@ def _wait_impl(self) -> torch.Tensor: # TorchRec interfaces with PyTorch distributed for collectives and # provides clean abstractions to the end users, removing the concern for # the lower level details. -# +# # The backwards pass does all of these collectives but in the reverse # order for distribution of gradients. ``input_dist``, ``lookup``, and # ``output_dist`` all depend on the sharding scheme. Since we sharded in a # table-wise fashion, these APIs are modules that are constructed by -# `TwPooledEmbeddingSharding `__. -# +# `TwPooledEmbeddingSharding`. +# sharded_ebc @@ -710,27 +708,27 @@ def _wait_impl(self) -> torch.Tensor: ###################################################################### # Optimizing Embedding Lookups # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # In performing lookups for a collection of embedding tables, a trivial # solution would be to iterate through all the ``nn.EmbeddingBags`` and do # a lookup per table. This is exactly what the standard, unsharded # ``EmbeddingBagCollection`` does. However, while this solution # is simple, it is extremely slow. -# +# # `FBGEMM `__ is a # library that provides GPU operators (otherwise known as kernels) that # are very optimized. One of these operators is known as **Table Batched # Embedding** (TBE), provides two major optimizations: -# +# # - Table batching, which allows you to look up multiple embeddings with # one kernel call. # - Optimizer Fusion, which allows the module to update itself given the # canonical pytorch optimizers and arguments. -# +# # The ``ShardedEmbeddingBagCollection`` uses the FBGEMM TBE as the lookup # instead of traditional ``nn.EmbeddingBags`` for optimized embedding # lookups. -# +# sharded_ebc._lookups @@ -738,28 +736,28 @@ def _wait_impl(self) -> torch.Tensor: ###################################################################### # ``DistributedModelParallel`` # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # We have now explored sharding a single ``EmbeddingBagCollection``! We were # able to take the ``EmbeddingBagCollectionSharder`` and use the unsharded # ``EmbeddingBagCollection`` to generate a # ``ShardedEmbeddingBagCollection`` module. This workflow is fine, but # typically when implementing model parallel, -# `DistributedModelParallel `__ +# `DistributedModelParallel` # (DMP) is used as the standard interface. When wrapping your model (in # our case ``ebc``), with DMP, the following will occur: -# +# # 1. Decide how to shard the model. DMP will collect the available # sharders and come up with a plan of the optimal way to shard the # embedding table(s) (for example, ``EmbeddingBagCollection``) # 2. Actually shard the model. This includes allocating memory for each # embedding table on the appropriate device(s). -# +# # DMP takes in everything that we've just experimented with, like a static # sharding plan, a list of sharders, etc. However, it also has some nice # defaults to seamlessly shard a TorchRec model. In this toy example, # since we have two embedding tables and one GPU, TorchRec will place both # on the single GPU. 
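#
# A minimal sketch of that DMP wrapping, assuming the unsharded ``ebc`` and the
# process group initialized earlier (the tutorial's own call may pass extra
# arguments such as an explicit plan or list of sharders):

from torchrec.distributed.model_parallel import DistributedModelParallel

model = DistributedModelParallel(
    module=ebc,
    device=torch.device("cuda"),
)
print(model)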
-# +# ebc @@ -771,10 +769,12 @@ def _wait_impl(self) -> torch.Tensor: model +from fbgemm_gpu.split_embedding_configs import EmbOptimType + ###################################################################### # Sharding Best Practices # ~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Currently, our configuration is only sharding on 1 GPU (or rank), which # is trivial: just place all the tables on 1 GPUs memory. However, in real # production use cases, embedding tables are **typically sharded on @@ -783,26 +783,26 @@ def _wait_impl(self) -> torch.Tensor: # proper sharding configuration (to prevent out of memory issues) while # keeping it balanced not only in terms of memory but also compute for # optimal performance. -# +# ###################################################################### # Adding in the Optimizer # ~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Remember that TorchRec modules are hyperoptimized for large scale # distributed training. An important optimization is in regards to the -# optimizer. +# optimizer. # # TorchRec modules provide a seamless API to fuse the # backwards pass and optimize step in training, providing a significant # optimization in performance and decreasing the memory used, alongside # granularity in assigning distinct optimizers to distinct model # parameters. -# +# # Optimizer Classes # ^^^^^^^^^^^^^^^^^ -# +# # TorchRec uses ``CombinedOptimizer``, which contains a collection of # ``KeyedOptimizers``. A ``CombinedOptimizer`` effectively makes it easy # to handle multiple optimizers for various sub groups in the model. A @@ -810,47 +810,46 @@ def _wait_impl(self) -> torch.Tensor: # initialized through a dictionary of parameters exposes the parameters. # Each ``TBE`` module in a ``EmbeddingBagCollection`` will have it's own # ``KeyedOptimizer`` which combines into one ``CombinedOptimizer``. -# +# # Fused optimizer in TorchRec # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Using ``DistributedModelParallel``, the **optimizer is fused, which # means that the optimizer update is done in the backward**. This is an # optimization in TorchRec and FBGEMM, where the optimizer embedding # gradients are not materialized and applied directly to the parameters. # This brings significant memory savings as embedding gradients are # typically size of the parameters themselves. -# +# # You can, however, choose to make the optimizer ``dense`` which does not # apply this optimization and let's you inspect the embedding gradients or # apply computations to it as you wish. A dense optimizer in this case # would be your `canonical PyTorch model training loop with # optimizer. `__ -# +# # Once the optimizer is created through ``DistributedModelParallel``, you # still need to manage an optimizer for the other parameters not # associated with TorchRec embedding modules. To find the other # parameters, -# use ``in_backward_optimizer_filter(model.named_parameters())``. +# use ``in_backward_optimizer_filter(model.named_parameters())``. # Apply an optimizer to those parameters as you would a normal Torch # optimizer and combine this and the ``model.fused_optimizer`` into one # ``CombinedOptimizer`` that you can use in your training loop to # ``zero_grad`` and ``step`` through. -# +# # Adding an Optimizer to ``EmbeddingBagCollection`` # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # We will do this in two ways, which are equivalent, but give you options # depending on your preferences: # # 1. Passing optimizer kwargs through ``fused_params`` in sharder. # 2. 
Through ``apply_optimizer_in_backward``, which converts the optimizer # parameters to ``fused_params`` to pass to the ``TBE`` in the ``EmbeddingBagCollection`` or ``EmbeddingCollection``. -# +# # Option 1: Passing optimizer kwargs through fused parameters from torchrec.optim.optimizers import in_backward_optimizer_filter -from fbgemm_gpu.split_embedding_configs import EmbOptimType # We initialize the sharder with @@ -864,17 +863,25 @@ def _wait_impl(self) -> torch.Tensor: sharder_with_fused_params = EmbeddingBagCollectionSharder(fused_params=fused_params) # We'll use same plan and unsharded EBC as before but this time with our new sharder -sharded_ebc_fused_params = sharder_with_fused_params.shard(ebc, plan.plan[""], env, torch.device("cuda")) +sharded_ebc_fused_params = sharder_with_fused_params.shard( + ebc, plan.plan[""], env, torch.device("cuda") +) # Looking at the optimizer of each, we can see that the learning rate changed, which indicates our optimizer has been applied correctly. # If seen, we can also look at the TBE logs of the cell to see that our new optimizer is indeed being applied print(f"Original Sharded EBC fused optimizer: {sharded_ebc.fused_optimizer}") -print(f"Sharded EBC with fused parameters fused optimizer: {sharded_ebc_fused_params.fused_optimizer}") +print( + f"Sharded EBC with fused parameters fused optimizer: {sharded_ebc_fused_params.fused_optimizer}" +) print(f"Type of optimizer: {type(sharded_ebc_fused_params.fused_optimizer)}") -from torch.distributed.optim import _apply_optimizer_in_backward as apply_optimizer_in_backward import copy + +from torch.distributed.optim import ( + _apply_optimizer_in_backward as apply_optimizer_in_backward, +) + # Option 2: Applying optimizer through apply_optimizer_in_backward # Note: we need to call apply_optimizer_in_backward on unsharded model first and then shard it @@ -886,7 +893,9 @@ def _wait_impl(self) -> torch.Tensor: print(f"{name=}") apply_optimizer_in_backward(torch.optim.SGD, [param], optimizer_kwargs) -sharded_ebc_apply_opt = sharder.shard(ebc_apply_opt, plan.plan[""], env, torch.device("cuda")) +sharded_ebc_apply_opt = sharder.shard( + ebc_apply_opt, plan.plan[""], env, torch.device("cuda") +) # Now when we print the optimizer, we will see our new learning rate, you can verify momentum through the TBE logs as well if outputted print(sharded_ebc_apply_opt.fused_optimizer) @@ -896,7 +905,11 @@ def _wait_impl(self) -> torch.Tensor: # Practically, just non TorchRec module parameters. Since our module is just a TorchRec EBC # there are no other parameters that aren't associated with TorchRec print("Non Fused Model Parameters:") -print(dict(in_backward_optimizer_filter(sharded_ebc_fused_params.named_parameters())).keys()) +print( + dict( + in_backward_optimizer_filter(sharded_ebc_fused_params.named_parameters()) + ).keys() +) # Here we do a dummy backwards call and see that parameter updates for fused # optimizers happen as a result of the backward pass @@ -915,202 +928,23 @@ def _wait_impl(self) -> torch.Tensor: print(f"Second Iteration Loss: {loss}") -###################################################################### -# Inference -# ~~~~~~~~~ -# -# Now that we are able to train distributed embeddings, how can we take -# the trained model and optimize it for inference? Inference is typically -# very sensitive to **performance and size of the model**. Running just -# the trained model in a Python environment is incredibly inefficient. 
-# There are two key differences between inference and training -# environments: -# -# * **Quantization**: Inference models are typically -# quantized, where model parameters lose precision for lower latency in -# predictions and reduced model size. For example FP32 (4 bytes) in -# trained model to INT8 (1 byte) for each embedding weight. This is also -# necessary given the vast scale of embedding tables, as we want to use as -# few devices as possible for inference to minimize latency. -# -# * **C++ environment**: Inference latency is very important, so in order to ensure -# ample performance, the model is typically ran in a C++ environment, -# along with the situations where we don't have a Python runtime, like on -# device. -# -# TorchRec provides primitives for converting a TorchRec model into being -# inference ready with: -# -# * APIs for quantizing the model, introducing -# optimizations automatically with FBGEMM TBE -# * Sharding embeddings for distributed inference -# * Compiling the model to `TorchScript `__ -# (compatible in C++) -# -# In this section, we will go over this entire workflow of: -# -# * Quantizing the model -# * Sharding the quantized model -# * Compiling the sharded quantized model into TorchScript -# - -ebc - -class InferenceModule(torch.nn.Module): - def __init__(self, ebc: torchrec.EmbeddingBagCollection): - super().__init__() - self.ebc_ = ebc - - def forward(self, kjt: KeyedJaggedTensor): - return self.ebc_(kjt) - -module = InferenceModule(ebc) -for name, param in module.named_parameters(): - # Here, the parameters should still be FP32, as we are using a standard EBC - # FP32 is default, regularly used for training - print(name, param.shape, param.dtype) - - -###################################################################### -# Quantization -# ~~~~~~~~~~~~ -# -# As you can see above, the normal EBC contains embedding table weights as -# FP32 precision (32 bits for each weight). 
Here, we will use the TorchRec -# inference library to quantize the embedding weights of the model to INT8 -# - -from torch import quantization as quant -from torchrec.modules.embedding_configs import QuantConfig -from torchrec.quant.embedding_modules import ( - EmbeddingBagCollection as QuantEmbeddingBagCollection, -) - - -quant_dtype = torch.int8 - - -qconfig = QuantConfig( - # dtype of the result of the embedding lookup, post activation - # torch.float generally for compatibility with rest of the model - # as rest of the model here usually isn't quantized - activation=quant.PlaceholderObserver.with_args(dtype=torch.float), - # quantized type for embedding weights, aka parameters to actually quantize - weight=quant.PlaceholderObserver.with_args(dtype=quant_dtype), -) -qconfig_spec = { - # Map of module type to qconfig - torchrec.EmbeddingBagCollection: qconfig, -} -mapping = { - # Map of module type to quantized module type - torchrec.EmbeddingBagCollection: QuantEmbeddingBagCollection, -} - - -module = InferenceModule(ebc) - -# Quantize the module -qebc = quant.quantize_dynamic( - module, - qconfig_spec=qconfig_spec, - mapping=mapping, - inplace=False, -) - - -print(f"Quantized EBC: {qebc}") - -kjt = kjt.to("cpu") - -qebc(kjt) - -# Once quantized, goes from parameters -> buffers, as no longer trainable -for name, buffer in qebc.named_buffers(): - # The shapes of the tables should be the same but the dtype should be int8 now - # post quantization - print(name, buffer.shape, buffer.dtype) - - -###################################################################### -# Shard -# ~~~~~ -# -# Here we perform sharding of the TorchRec quantized model. This is to -# ensure we are using the performant module through FBGEMM TBE. Here we -# are using one device to be consistent with training (1 TBE). -# - -from torchrec import distributed as trec_dist -from torchrec.distributed.shard import _shard_modules - - -sharded_qebc = _shard_modules( - module=qebc, - device=torch.device("cpu"), - env=trec_dist.ShardingEnv.from_local( - 1, - 0, - ), -) - - -print(f"Sharded Quantized EBC: {sharded_qebc}") - -sharded_qebc(kjt) - - -###################################################################### -# Compilation -# ~~~~~~~~~~~ -# -# Now we have the optimized eager TorchRec inference model. The next step -# is to ensure that this model is loadable in C++, as currently it is only -# runnable in a Python runtime. -# -# The recommended method of compilation at Meta is two fold: `torch.fx -# tracing `__ (generate -# intermediate representation of model) and converting the result to -# TorchScript, where TorchScript is C++ compatible. -# - -from torchrec.fx import Tracer - - -tracer = Tracer(leaf_modules=["IntNBitTableBatchedEmbeddingBagsCodegen"]) - -graph = tracer.trace(sharded_qebc) -gm = torch.fx.GraphModule(sharded_qebc, graph) - -print("Graph Module Created!") - -print(gm.code) - -scripted_gm = torch.jit.script(gm) -print("Scripted Graph Module Created!") - -print(scripted_gm.code) - - ###################################################################### # Conclusion # ^^^^^^^^^^ -# -# In this tutorial, you have gone from training a distributed RecSys model all the way -# to making it inference ready. The `TorchRec repo +# In this tutorial, you have done training a distributed RecSys model +# If you are interested in the inference the `TorchRec repo # `__ has a -# full example of how to load a TorchRec TorchScript model into C++ for -# inference. 
-# +# full example of how to run the TorchRec in Inference mode. +# ###################################################################### # See Also # -------------- -# +# # For more information, please see our # `dlrm `__ # example, which includes multinode training on the Criteo 1TB # dataset using the methods described in `Deep Learning Recommendation Model # for Personalization and Recommendation Systems `__. -# +# diff --git a/prototype_source/README.txt b/prototype_source/README.txt index 9428fe3d124..67aab29bb47 100644 --- a/prototype_source/README.txt +++ b/prototype_source/README.txt @@ -4,42 +4,18 @@ Prototype Tutorials Profiling PyTorch RPC-Based Workloads https://github.com/pytorch/tutorials/blob/main/prototype_source/distributed_rpc_profiling.rst -2. graph_mode_static_quantization_tutorial.py - Graph Mode Post Training Static Quantization in PyTorch - https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html - -3. graph_mode_dynamic_bert_tutorial.rst - Graph Mode Dynamic Quantization on BERT - https://github.com/pytorch/tutorials/blob/main/prototype_source/graph_mode_dynamic_bert_tutorial.rst - -4. numeric_suite_tutorial.py - PyTorch Numeric Suite Tutorial - https://github.com/pytorch/tutorials/blob/main/prototype_source/numeric_suite_tutorial.py - -5. torchscript_freezing.py +2. torchscript_freezing.py Model Freezing in TorchScript https://github.com/pytorch/tutorials/blob/main/prototype_source/torchscript_freezing.py -6. vulkan_workflow.rst +3. vulkan_workflow.rst Vulkan Backend User Workflow - https://pytorch.org/tutorials/intermediate/vulkan_workflow.html - -7. fx_graph_mode_ptq_static.rst - FX Graph Mode Post Training Static Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_static.html - -8. fx_graph_mode_ptq_dynamic.py - FX Graph Mode Post Training Dynamic Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html - -9. fx_graph_mode_quant_guide.py - FX Graph Mode Quantization User Guide - https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html - -10 flight_recorder_tutorial.rst + https://pytorch.org/tutorials/prototype/vulkan_workflow.html + +4. flight_recorder_tutorial.rst Flight Recorder User Guide https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html -11 python_extension_autoload.rst +5. 
python_extension_autoload.rst Autoloading Out-of-Tree Extension https://pytorch.org/tutorials/prototype/python_extension_autoload.html diff --git a/prototype_source/gpu_quantization_torchao_tutorial.py b/prototype_source/gpu_quantization_torchao_tutorial.py index f901f8abd31..2cea60b39d3 100644 --- a/prototype_source/gpu_quantization_torchao_tutorial.py +++ b/prototype_source/gpu_quantization_torchao_tutorial.py @@ -31,7 +31,7 @@ # > conda create -n myenv python=3.10 # > pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 # > pip install git+https://github.com/facebookresearch/segment-anything.git -# > pip install git+https://github.com/pytorch-labs/ao.git +# > pip install git+https://github.com/pytorch/ao.git # # Segment Anything Model checkpoint setup: # @@ -44,7 +44,7 @@ # import torch -from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight +from torchao.quantization.quant_api import quantize_, Int8DynamicActivationInt8WeightConfig from torchao.utils import unwrap_tensor_subclass, TORCH_VERSION_AT_LEAST_2_5 from segment_anything import sam_model_registry from torch.utils.benchmark import Timer @@ -143,7 +143,7 @@ def get_sam_model(only_one_block=False, batchsize=1): # for improvements. # # Next, let's apply quantization. Quantization for GPUs comes in three main forms -# in `torchao `_ which is just native +# in `torchao `_ which is just native # pytorch+python code. This includes: # # * int8 dynamic quantization @@ -157,9 +157,9 @@ def get_sam_model(only_one_block=False, batchsize=1): # in memory bound situations where the benefit comes from loading less # weight data, rather than doing less computation. The torchao APIs: # -# ``int8_dynamic_activation_int8_weight()``, -# ``int8_weight_only()`` or -# ``int4_weight_only()`` +# ``Int8DynamicActivationInt8WeightConfig()``, +# ``Int8WeightOnlyConfig()`` or +# ``Int4WeightOnlyConfig()`` # # can be used to easily apply the desired quantization technique and then # once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is @@ -171,7 +171,7 @@ def get_sam_model(only_one_block=False, batchsize=1): # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two # above (no replacement for int4). # -# The difference between the two APIs is that ``int8_dynamic_activation`` API +# The difference between the two APIs is that the ``Int8DynamicActivationInt8WeightConfig`` API # alters the weight tensor of the linear module so instead of doing a # normal linear, it does a quantized operation. This is helpful when you # have non-standard linear ops that do more than one thing. 
The ``apply`` @@ -186,7 +186,7 @@ def get_sam_model(only_one_block=False, batchsize=1): model, image = get_sam_model(only_one_block, batchsize) model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) -quantize_(model, int8_dynamic_activation_int8_weight()) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) if not TORCH_VERSION_AT_LEAST_2_5: # needed for subclass + compile to work on older versions of pytorch unwrap_tensor_subclass(model) @@ -224,7 +224,7 @@ def get_sam_model(only_one_block=False, batchsize=1): model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) torch._inductor.config.force_fuse_int_mm_with_mul = True -quantize_(model, int8_dynamic_activation_int8_weight()) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) if not TORCH_VERSION_AT_LEAST_2_5: # needed for subclass + compile to work on older versions of pytorch unwrap_tensor_subclass(model) @@ -258,7 +258,7 @@ def get_sam_model(only_one_block=False, batchsize=1): torch._inductor.config.coordinate_descent_tuning = True torch._inductor.config.coordinate_descent_check_all_directions = True torch._inductor.config.force_fuse_int_mm_with_mul = True -quantize_(model, int8_dynamic_activation_int8_weight()) +quantize_(model, Int8DynamicActivationInt8WeightConfig()) if not TORCH_VERSION_AT_LEAST_2_5: # needed for subclass + compile to work on older versions of pytorch unwrap_tensor_subclass(model) @@ -290,7 +290,7 @@ def get_sam_model(only_one_block=False, batchsize=1): model, image = get_sam_model(False, batchsize) model = model.to(torch.bfloat16) image = image.to(torch.bfloat16) - quantize_(model, int8_dynamic_activation_int8_weight()) + quantize_(model, Int8DynamicActivationInt8WeightConfig()) if not TORCH_VERSION_AT_LEAST_2_5: # needed for subclass + compile to work on older versions of pytorch unwrap_tensor_subclass(model) @@ -315,6 +315,6 @@ def get_sam_model(only_one_block=False, batchsize=1): # the model. For example, this can be done with some form of flash attention. # # For more information visit -# `torchao `_ and try it on your own +# `torchao `_ and try it on your own # models. # diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst deleted file mode 100644 index 949002a55dc..00000000000 --- a/prototype_source/graph_mode_dynamic_bert_tutorial.rst +++ /dev/null @@ -1,544 +0,0 @@ -(prototype) Graph Mode Dynamic Quantization on BERT -=================================================== - - -**Author**: `Supriya Rao `_ - -Introduction ------------- - -This tutorial introduces the steps to do post training Dynamic Quantization with Graph Mode Quantization. Dynamic quantization converts a float model to a quantized model with static int8 data types for the weights and dynamic quantization for the activations. The activations are quantized dynamically (per batch) to int8 while the weights are statically quantized to int8. Graph Mode Quantization flow operates on the model graph and requires minimal user intervention to quantize the model. To be able to use graph mode, the float model needs to be either traced or scripted first. - -Advantages of graph mode quantization are: - -- In graph mode, we can inspect the code that is executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. -- Simple quantization flow, minimal manual steps. -- Unlocks the possibility of doing higher level optimizations like automatic precision selection. 
- -For additional details on Graph Mode Quantization please refer to the `Graph Mode Static Quantization Tutorial `_. - -tl;dr The Graph Mode Dynamic `Quantization API `_: - -.. code:: python - - import torch - from torch.quantization import per_channel_dynamic_qconfig - from torch.quantization import quantize_dynamic_jit - - ts_model = torch.jit.script(float_model) # or torch.jit.trace(float_model, input) - - quantized = quantize_dynamic_jit(ts_model, {'': per_channel_dynamic_qconfig}) - -1. Quantizing BERT Model ------------------------- - -The installaion steps and details about the model are identical to the steps in the Eager Mode Tutorial. Please refer to the tutorial `here `_ for more details. - -1.1 Setup -^^^^^^^^^ -Once all the necesessary packages are downloaded and installed we setup the code. We first start with the necessary imports and setup for the model. - -.. code:: python - - import logging - import numpy as np - import os - import random - import sys - import time - import torch - - from argparse import Namespace - from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) - from tqdm import tqdm - from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) - from transformers import glue_compute_metrics as compute_metrics - from transformers import glue_output_modes as output_modes - from transformers import glue_processors as processors - from transformers import glue_convert_examples_to_features as convert_examples_to_features - from torch.quantization import per_channel_dynamic_qconfig - from torch.quantization import quantize_dynamic_jit - - def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') - - # Setup logging - logger = logging.getLogger(__name__) - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.WARN) - - logging.getLogger("transformers.modeling_utils").setLevel( - logging.WARN) # Reduce logging - - print(torch.__version__) - - torch.set_num_threads(1) - print(torch.__config__.parallel_info()) - -1.2 Download GLUE dataset -^^^^^^^^^^^^^^^^^^^^^^^^^ -Before running MRPC tasks we download the GLUE data by running this script and unpack it to a directory glue_data. - -.. code:: shell - - python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' - -1.3 Set global BERT configurations -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To run this experiment we first need a fine tuned BERT model. We provide the fined-tuned BERT model for MRPC task `here `_. To save time, you can download the model file (~400 MB) directly into your local folder $OUT_DIR. - - -.. code:: python - - configs = Namespace() - - # The output directory for the fine-tuned model, $OUT_DIR. - configs.output_dir = "./MRPC/" - - # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. - configs.data_dir = "./glue_data/MRPC" - - # The model name or path for the pre-trained model. - configs.model_name_or_path = "bert-base-uncased" - # The maximum length of an input sequence - configs.max_seq_length = 128 - - # Prepare GLUE task. 
- configs.task_name = "MRPC".lower() - configs.processor = processors[configs.task_name]() - configs.output_mode = output_modes[configs.task_name] - configs.label_list = configs.processor.get_labels() - configs.model_type = "bert".lower() - configs.do_lower_case = True - - # Set the device, batch size, topology, and caching flags. - configs.device = "cpu" - configs.per_gpu_eval_batch_size = 8 - configs.n_gpu = 0 - configs.local_rank = -1 - configs.overwrite_cache = False - - # Set random seed for reproducibility. - def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - set_seed(42) - - tokenizer = BertTokenizer.from_pretrained( - configs.output_dir, do_lower_case=configs.do_lower_case) - - model = BertForSequenceClassification.from_pretrained(configs.output_dir, torchscript=True) - model.to(configs.device) - -1.4 Quantizing BERT model with Graph Mode Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1.4.1 Script/Trace the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The input for graph mode quantization is a TorchScript model, so you'll need to either script or trace the model first. Currently, scripting the BERT model is not supported so we trace the model here. - -We first identify the inputs to be passed to the model. Here, we trace the model with the largest possible input size that will be passed during the evaluation. -We choose a batch size of 8 and sequence lenght of 128 based on the input sizes passed in during the evaluation step below. Using the max possible shape during inference while tracing is a limitation of the huggingface BERT model as mentioned `here `_. - -We trace the model using ``torch.jit.trace``. - -.. code:: python - - input_ids = ids_tensor([8, 128], 2) - token_type_ids = ids_tensor([8, 128], 2) - attention_mask = ids_tensor([8, 128], vocab_size=2) - dummy_input = (input_ids, attention_mask, token_type_ids) - traced_model = torch.jit.trace(model, dummy_input) - -1.4.2 Specify qconfig_dict -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code:: - - qconfig_dict = {'': per_channel_dynamic_qconfig} - -qconfig is a named tuple of the observers for activation and weight. For dynamic quantization we use a dummy activation observer to mimic the dynamic quantization process that happens in the operator during runtime. For the weight tensors we recommend using per-channel quantization which helps improve the final accuracy. -``qconfig_dict`` is a dictionary with names of sub modules as key and qconfig for that module as value, empty key means the qconfig will be applied to whole model unless it’s overwritten by more specific configurations, the qconfig for each module is either found in the dictionary or fallback to the qconfig of parent module. - -Right now qconfig_dict is the only way to configure how the model is quantized, and it is done in the granularity of module, that is, we only support one type of qconfig for each module, and the qconfig for sub module will override the qconfig for parent module. For example, if we have - -.. code:: - - qconfig = { - '' : qconfig_global, - 'sub' : qconfig_sub, - 'sub.fc1' : qconfig_fc, - 'sub.fc2': None - } - -Module ``sub.fc1`` will be configured with ``qconfig_fc``, and all other child modules in ``sub`` will be configured with ``qconfig_sub`` and ``sub.fc2`` will not be quantized. All other modules in the model will be quantized with qconfig_global - -.. 
code:: python - - qconfig_dict = {'': per_channel_dynamic_qconfig} - -1.4.3 Quantize the model (one-line API) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We call the one line API (similar to eager mode) to perform quantization as follows. - -.. code:: python - - quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict) - -2. Evaluation -------------- - -We reuse the tokenize and evaluation function from Huggingface. - -.. code:: python - - def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1]} - labels = batch[3] - if args.model_type != 'distilbert': - inputs['input'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids - outputs = model(**inputs) - logits = outputs[0] - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = labels.detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0) - - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - 
-            list(filter(None, args.model_name_or_path.split('/'))).pop(),
-            str(args.max_seq_length),
-            str(task)))
-        if os.path.exists(cached_features_file) and not args.overwrite_cache:
-            logger.info("Loading features from cached file %s", cached_features_file)
-            features = torch.load(cached_features_file)
-        else:
-            logger.info("Creating features from dataset file at %s", args.data_dir)
-            label_list = processor.get_labels()
-            if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
-                # HACK(label indices are swapped in RoBERTa pretrained model)
-                label_list[1], label_list[2] = label_list[2], label_list[1]
-            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-            features = convert_examples_to_features(examples,
-                                                    tokenizer,
-                                                    label_list=label_list,
-                                                    max_length=args.max_seq_length,
-                                                    output_mode=output_mode,)
-            if args.local_rank in [-1, 0]:
-                logger.info("Saving features into cached file %s", cached_features_file)
-                torch.save(features, cached_features_file)
-
-        if args.local_rank == 0 and not evaluate:
-            torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
-
-        # Convert to Tensors and build dataset
-        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-        if output_mode == "classification":
-            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
-        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
-        return dataset
-
-    def time_model_evaluation(model, configs, tokenizer):
-        eval_start_time = time.time()
-        result = evaluate(configs, model, tokenizer, prefix="")
-        eval_end_time = time.time()
-        eval_duration_time = eval_end_time - eval_start_time
-        print(result)
-        print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))
-
-
-2.1 Check Model Size
-^^^^^^^^^^^^^^^^^^^^
-
-We print the model size to quantify the savings from quantization:
-
-.. code:: python
-
-    def print_size_of_model(model):
-        if isinstance(model, torch.jit.RecursiveScriptModule):
-            torch.jit.save(model, "temp.p")
-        else:
-            torch.jit.save(torch.jit.script(model), "temp.p")
-        print('Size (MB):', os.path.getsize("temp.p")/1e6)
-        os.remove("temp.p")
-
-    print("Size of model before quantization")
-    print_size_of_model(traced_model)
-    print("Size of model after quantization")
-    print_size_of_model(quantized_model)
-
-.. code::
-
-    Size of model before quantization
-    Size (MB): 438.242141
-    Size of model after quantization
-    Size (MB): 184.354759
-
-2.2 Run the evaluation
-^^^^^^^^^^^^^^^^^^^^^^
-
-We evaluate the FP32 and quantized models and compare their F1 scores. Note that the performance numbers below are from a dev machine and would likely improve on a production server.
-
-.. code:: python
-
-    time_model_evaluation(traced_model, configs, tokenizer)
-    time_model_evaluation(quantized_model, configs, tokenizer)
-
-.. code::
-
-    FP32 model results -
-    'f1': 0.901
-    Time taken - 188.0s
-
-    INT8 model results -
-    'f1': 0.902
-    Time taken - 157.4s
-
-3. Debugging the Quantized Model
---------------------------------
-
-We can debug the quantized model by passing in the ``debug`` option.
-
-.. code:: python
-
-    quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True)
-
-If ``debug`` is set to ``True``:
-
-- We can access the attributes of the quantized model the same way as in a TorchScript model, for example, ``model.fc1.weight`` (this might be harder if you use a module list or sequential).
-- The arithmetic operations all occur in floating point, with numerics identical to the final quantized model, which allows for debugging.
-
-.. code:: python
-
-    quantized_model_debug = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True)
-
-Calling ``quantize_dynamic_jit`` is equivalent to calling ``prepare_dynamic_jit`` followed by ``convert_dynamic_jit``. We recommend the one-line API, but the multi-step API is useful if you wish to debug or analyze the model after each step.
-
-3.1. Evaluate the Debug Model
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    # Evaluate the debug model
-    time_model_evaluation(quantized_model_debug, configs, tokenizer)
-
-.. code::
-
-    Size (MB): 438.406429
-
-    INT8 (debug=True) model results -
-
-    'f1': 0.897
-
-Note that the accuracy of the debug version is close to, but not exactly the same as, the non-debug version: the debug version uses floating point ops to emulate quantized ops, so the numerics only match approximately. This is the case only for per-channel quantization (we are working on improving this). Per-tensor quantization (using ``default_dynamic_qconfig``) has an exact numerics match between the debug and non-debug versions.
-
-.. code:: python
-
-    print(str(quantized_model_debug.graph))
-
-A snippet of the printed graph:
-
-.. code::
-
-    %111 : Tensor = prim::GetAttr[name="bias"](%110)
-    %112 : Tensor = prim::GetAttr[name="weight"](%110)
-    %113 : Float(768:1) = prim::GetAttr[name="4_scale_0"](%110)
-    %114 : Int(768:1) = prim::GetAttr[name="4_zero_point_0"](%110)
-    %115 : int = prim::GetAttr[name="4_axis_0"](%110)
-    %116 : int = prim::GetAttr[name="4_scalar_type_0"](%110)
-    %4.quant.6 : Tensor = aten::quantize_per_channel(%112, %113, %114, %115, %116)
-    %4.dequant.6 : Tensor = aten::dequantize(%4.quant.6)
-    %1640 : bool = prim::Constant[value=1]()
-    %input.5.scale.1 : float, %input.5.zero_point.1 : int = aten::_choose_qparams_per_tensor(%input.5, %1640)
-    %input.5.quant.1 : Tensor = aten::quantize_per_tensor(%input.5, %input.5.scale.1, %input.5.zero_point.1, %74)
-    %input.5.dequant.1 : Float(8:98304, 128:768, 768:1) = aten::dequantize(%input.5.quant.1)
-    %119 : Tensor = aten::linear(%input.5.dequant.1, %4.dequant.6, %111)
-
-We can see that there is no ``quantized::linear_dynamic`` in the model; instead, there is the numerically equivalent pattern ``aten::_choose_qparams_per_tensor`` - ``aten::quantize_per_tensor`` - ``aten::dequantize`` - ``aten::linear``.
-
-.. code:: python
-
-    # Get the size of the debug model
-    print_size_of_model(quantized_model_debug)
-
-.. code::
-
-    Size (MB): 438.406429
-
-The size of the debug model is close to that of the floating point model because all the weights are still in floating point and have not yet been quantized and frozen, which allows you to inspect them. You may access the weight attributes directly in the debug model, just as in any TorchScript model:
-
-.. code:: python
-
-    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.weight)
-
-.. code::
-
-    tensor([[-0.0157,  0.0257, -0.0269,  ...,  0.0158,  0.0764,  0.0548],
-            [-0.0325,  0.0345, -0.0423,  ..., -0.0528,  0.1382,  0.0069],
-            [ 0.0106,  0.0335,  0.0113,  ..., -0.0275,  0.0253, -0.0457],
-            ...,
-            [-0.0090,  0.0512,  0.0555,  ...,  0.0277,  0.0543, -0.0539],
-            [-0.0195,  0.0943,  0.0619,  ..., -0.1040,  0.0598,  0.0465],
-            [ 0.0009, -0.0949,  0.0097,  ..., -0.0183, -0.0511, -0.0085]],
-           grad_fn=)
-
-The scale and zero point for the corresponding weight can be accessed as follows:
-
-.. code:: python
-
-    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_scale_0'))
-    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_zero_point_0'))
-
-Since we use per-channel quantization, we get a per-channel scales tensor.
-
-.. code::
-
-    tensor([0.0009, 0.0011, 0.0010, 0.0011, 0.0034, 0.0013, 0.0010, 0.0010, 0.0013,
-            0.0012, 0.0011, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0009, 0.0015,
-            0.0016, 0.0036, 0.0012, 0.0009, 0.0010, 0.0014, 0.0008, 0.0008, 0.0008,
-            ...,
-            0.0019, 0.0023, 0.0013, 0.0018, 0.0012, 0.0031, 0.0015, 0.0013, 0.0014,
-            0.0022, 0.0011, 0.0024])
-
-Zero-point tensor:
-
-.. code::
-
-    tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            ...,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-           dtype=torch.int32)
-
-4. Comparing Results with Eager Mode
-------------------------------------
-
-The following results show the F1 score and model size for eager mode quantization of the same model, obtained by following the steps in the `tutorial `_. The results show that eager and graph mode quantization produce very similar results on this model.
-
-.. code::
-
-    FP32 model results -
-    Size (MB): 438.016605
-    'f1': 0.901
-
-    INT8 model results -
-    Size (MB): 182.878029
-    'f1': 0.902
-
-5. Benchmarking the Model
--------------------------
-
-We benchmark the models with dummy inputs, comparing the float model with the eager mode and graph mode quantized models on a production server machine.
-
-.. code:: python
-
-    def benchmark(model):
-        model = torch.jit.load(model)
-        model.eval()
-        torch.set_num_threads(1)
-        input_ids = ids_tensor([8, 128], 2)
-        token_type_ids = ids_tensor([8, 128], 2)
-        attention_mask = ids_tensor([8, 128], vocab_size=2)
-        elapsed = 0
-        for _i in range(50):
-            start = time.time()
-            output = model(input_ids, token_type_ids, attention_mask)
-            end = time.time()
-            elapsed = elapsed + (end - start)
-        print('Elapsed time: ', (elapsed / 50), ' s')
-        return
-
-    print("Running benchmark for Float model")
-    benchmark(args.jit_model_path_float)
-    print("Running benchmark for Eager Mode Quantized model")
-    benchmark(args.jit_model_path_eager)
-    print("Running benchmark for Graph Mode Quantized model")
-    benchmark(args.jit_model_path_graph)
-
-.. code::
-
-    Running benchmark for Float model
-    Elapsed time:  4.49 s
-    Running benchmark for Eager Mode Quantized model
-    Elapsed time:  2.67 s
-    Running benchmark for Graph Mode Quantized model
-    Elapsed time:  2.69 s
-
-As we can see, both the graph mode and eager mode quantized models achieve a similar speedup over the floating point model.
-
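-The ``args.jit_model_path_*`` values above are placeholders for wherever the TorchScript models were saved. As a minimal, hypothetical sketch (the file names are illustrative, and the eager mode quantized model comes from the eager mode tutorial referenced above), the float and graph mode quantized models produced in this tutorial could be saved with ``torch.jit.save`` before running the benchmark:
-
-.. code:: python
-
-    # Save the TorchScript models produced earlier in this tutorial
-    # (hypothetical file names; use any paths you like).
-    torch.jit.save(traced_model, "bert_mrpc_float.pt")          # FP32 traced model
-    torch.jit.save(quantized_model, "bert_mrpc_graph_int8.pt")  # graph mode dynamically quantized model
-
-    print("Running benchmark for Float model")
-    benchmark("bert_mrpc_float.pt")
-    print("Running benchmark for Graph Mode Quantized model")
-    benchmark("bert_mrpc_graph_int8.pt")
-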
-Conclusion
-----------
-
-In this tutorial, we demonstrated how to convert a well-known state-of-the-art NLP model like BERT into a dynamically quantized model using graph mode quantization, with the same performance as eager mode. Dynamic quantization can reduce the size of the model while having only a limited impact on accuracy.
-
-Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any.
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index d8839020633..d1f802adfb4 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -31,85 +31,6 @@ Prototype features are not available as part of binary distributions like PyPI o
 
 .. Add prototype tutorial cards below this line
 
-.. Quantization
-
-.. customcarditem::
-   :header: FX Graph Mode Quantization User Guide
-   :card_description: Learn about FX Graph Mode Quantization.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/fx_graph_mode_quant_guide.html
-   :tags: FX,Quantization
-
-.. customcarditem::
-   :header: FX Graph Mode Post Training Dynamic Quantization
-   :card_description: Learn how to do post training dynamic quantization in graph mode based on torch.fx.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/fx_graph_mode_ptq_dynamic.html
-   :tags: FX,Quantization
-
-.. customcarditem::
-   :header: FX Graph Mode Post Training Static Quantization
-   :card_description: Learn how to do post training static quantization in graph mode based on torch.fx.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/fx_graph_mode_ptq_static.html
-   :tags: FX,Quantization
-
-.. customcarditem::
-   :header: Graph Mode Dynamic Quantization on BERT
-   :card_description: Learn how to do post training dynamic quantization with graph mode quantization on BERT models.
-   :image: ../_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png
-   :link: ../prototype/graph_mode_dynamic_bert_tutorial.html
-   :tags: Text,Quantization
-
-.. customcarditem::
-   :header: PyTorch Numeric Suite Tutorial
-   :card_description: Learn how to use the PyTorch Numeric Suite to support quantization debugging efforts.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/numeric_suite_tutorial.html
-   :tags: Debugging,Quantization
-
-.. customcarditem::
-   :header: How to Write a Quantizer for PyTorch 2 Export Quantization
-   :card_description: Learn how to implement a Quantizer for PT2 Export Quantization
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/pt2e_quantizer.html
-   :tags: Quantization
-
-.. customcarditem::
-   :header: PyTorch 2 Export Post Training Quantization
-   :card_description: Learn how to use Post Training Quantization in PyTorch 2 Export.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/pt2e_quant_ptq.html
-   :tags: Quantization
-
-.. customcarditem::
-   :header: PyTorch 2 Export Quantization-Aware Training
-   :card_description: Learn how to use Quantization-Aware-Training in PyTorch 2 Export.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/pt2e_quant_qat.html
-   :tags: Quantization
-
-.. customcarditem::
-   :header: PyTorch 2 Export Quantization with X86 Backend through Inductor
-   :card_description: Learn how to use PT2 Export Quantization with X86 Backend through Inductor.
-   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/pt2e_quant_x86_inductor.html
-   :tags: Quantization
-
-.. 
customcarditem:: - :header: PyTorch 2 Export Quantization for OpenVINO torch.compile Backend - :card_description: Learn how to use PT2 Export Quantization with OpenVINO torch.compile Backend. - :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../prototype/openvino_quantizer.html - :tags: Quantization - -.. customcarditem:: - :header: PyTorch 2 Export Quantization with Intel GPU Backend through Inductor - :card_description: Learn how to use PT2 Export Quantization with Intel GPU Backend through Inductor. - :image: _static/img/thumbnails/cropped/pytorch-logo.png - :link: ../prototype/pt2e_quant_xpu_inductor.html - :tags: Quantization - .. Sparsity .. customcarditem:: @@ -295,16 +216,9 @@ Prototype features are not available as part of binary distributions like PyPI o :hidden: prototype/context_parallel.html - prototype/fx_graph_mode_quant_guide.html - prototype/fx_graph_mode_ptq_dynamic.html - prototype/fx_graph_mode_ptq_static.html prototype/flight_recorder_tutorial.html - prototype/graph_mode_dynamic_bert_tutorial.html prototype/inductor_cpp_wrapper_tutorial.html prototype/inductor_windows.html - prototype/pt2e_quantizer.html - prototype/pt2e_quant_ptq.html - prototype/pt2e_quant_qat.html prototype/ios_gpu_workflow.html prototype/nnapi_mobilenetv2.html prototype/tracing_based_selective_build.html diff --git a/redirects.py b/redirects.py index b0566c63f56..c66bbc088af 100644 --- a/redirects.py +++ b/redirects.py @@ -11,9 +11,11 @@ "advanced/dynamic_quantization_tutorial.html": "../index.html", "intermediate/dynamic_quantization_bert_tutorial.html": "../index.html", "intermediate/quantized_transfer_learning_tutorial.html": "../index.html", + "prototype/graph_mode_dynamic_bert_tutorial.html": "../index.html", "prototype/fx_graph_mode_ptq_dynamic.html": "../index.html", "prototype/fx_graph_mode_ptq_static.html": "../index.html", "prototype/fx_graph_mode_quant_guide.html": "../index.html", + "prototype/backend_config_tutorial.html": "../index.html", "prototype/numeric_suite_tutorial.html": "../index.html", "prototype/quantization_in_pytorch_2_0_export_tutorial.html": "../index.html", "prototype/pt2e_quant_ptq.html": "https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_ptq.html",