
Commit 6ca1922

Update tutorial spelling
1 parent b285551 commit 6ca1922

File tree

2 files changed: +67 / -41 lines changed


en-wordlist.txt

Lines changed: 26 additions & 0 deletions
@@ -619,3 +619,29 @@ warmup
 webp
 wsi
 wsis
+Meta's
+criteo
+RecSys
+TorchRec
+sharding
+TBE
+dtype
+EBC
+sharder
+hyperoptimized
+DMP
+unsharded
+lookups
+KJTs
+amongst
+async
+everytime
+prototyped
+GBs
+HBM
+gloo
+nccl
+Localhost
+gpu
+torchmetrics
+url

intermediate_source/torchrec_interactive_tutorial.py

Lines changed: 41 additions & 41 deletions
@@ -43,7 +43,7 @@
 # ~~~~~~~~~~
 #
 # When building recommendation systems, categorical features typically
-# have massive cardinalities, posts, users, ads, etc.
+# have massive cardinality, posts, users, ads, etc.
 #
 # In order to represent these entities and model these relationships,
 # **embeddings** are used. In machine learning, **embeddings are a vectors
@@ -213,7 +213,7 @@


 ######################################################################
-# From EmbeddingBag to EmbeddingBagCollection
+# From ``EmbeddingBag`` to ``EmbeddingBagCollection``
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # We have already explored
@@ -229,7 +229,7 @@
 # We will use ``EmbeddingBagCollection`` to represent a group of
 # EmbeddingBags.
 #
-# Here, we create an EmbeddingBagCollection (EBC) with two embedding bags,
+# Here, we create an ``EmbeddingBagCollection`` (EBC) with two embedding bags,
 # 1 representing **products** and 1 representing **users**. Each table,
 # ``product_table`` and ``user_table``, is represented by 64 dimension
 # embedding of size 4096.
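For context, the hunk above describes building an ``EmbeddingBagCollection`` with two 4096-row, 64-dimensional tables. A minimal sketch of that construction, assuming ``torchrec`` is installed and reusing the table and feature names from the comment (the ``meta`` device choice is an assumption to avoid allocating real memory):

import torch
from torchrec import EmbeddingBagCollection, EmbeddingBagConfig

# Two tables, one per categorical feature; each is 4096 rows x 64 dimensions.
ebc = EmbeddingBagCollection(
    device=torch.device("meta"),
    tables=[
        EmbeddingBagConfig(
            name="product_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["product"],
        ),
        EmbeddingBagConfig(
            name="user_table",
            embedding_dim=64,
            num_embeddings=4096,
            feature_names=["user"],
        ),
    ],
)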
@@ -264,8 +264,8 @@

 import inspect

-# Let's look at the EmbeddingBagCollection forward method
-# What is a KeyedJaggedTensor and KeyedTensor?
+# Let's look at the ``EmbeddingBagCollection`` forward method
+# What is a ``KeyedJaggedTensor`` and ``KeyedTensor``?
 print(inspect.getsource(ebc.forward))


@@ -333,9 +333,9 @@

 from torchrec import KeyedJaggedTensor

-# JaggedTensor represents IDs for 1 feature, but we have multiple features in an EmbeddingBagCollection
-# That's where KeyedJaggedTensor comes in! KeyedJaggedTensor is just multiple JaggedTensors for multiple id_list_feature_offsets
-# From before, we have our two features "product" and "user". Let's create JaggedTensors for both!
+# ``JaggedTensor`` represents IDs for 1 feature, but we have multiple features in an ``EmbeddingBagCollection``
+# That's where ``KeyedJaggedTensor`` comes in! ``KeyedJaggedTensor`` is just multiple ``JaggedTensors`` for multiple id_list_feature_offsets
+# From before, we have our two features "product" and "user". Let's create ``JaggedTensors`` for both!

 product_jt = JaggedTensor(
     values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1])
@@ -345,32 +345,32 @@
 # Q1: How many batches are there, and which values are in the first batch for product_jt and user_jt?
 kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt})

-# Look at our feature keys for the KeyedJaggedTensor
+# Look at our feature keys for the ``KeyedJaggedTensor``
 print("Keys: ", kjt.keys())

-# Look at the overall lengths for the KeyedJaggedTensor
+# Look at the overall lengths for the ``KeyedJaggedTensor``
 print("Lengths: ", kjt.lengths())

-# Look at all values for KeyedJaggedTensor
+# Look at all values for ``KeyedJaggedTensor``
 print("Values: ", kjt.values())

 # Can convert KJT to dictionary representation
 print("to_dict: ", kjt.to_dict())

-# KeyedJaggedTensor(KJT) string representation
+# ``KeyedJaggedTensor`` (KJT) string representation
 print(kjt)

-# Q2: What are the offsets for the KeyedJaggedTensor?
+# Q2: What are the offsets for the ``KeyedJaggedTensor``?

 # Now we can run a forward pass on our ebc from before
 result = ebc(kjt)
 result

-# Result is a KeyedTensor, which contains a list of the feature names and the embedding results
+# Result is a ``KeyedTensor``, which contains a list of the feature names and the embedding results
 print(result.keys())

 # The results shape is [2, 128], as batch size of 2. Reread previous section if you need a refresher on how the batch size is determined
-# 128 for dimension of embedding. If you look at where we initialized the EmbeddingBagCollection, we have two tables "product" and "user" of dimension 64 each
+# 128 for dimension of embedding. If you look at where we initialized the ``EmbeddingBagCollection``, we have two tables "product" and "user" of dimension 64 each
 # meaning emebddings for both features are of size 64. 64 + 64 = 128
 print(result.values().shape)
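A side note on Q2 above: ``offsets`` are just the running sum of ``lengths`` with a leading zero, so they can be derived with plain PyTorch. The ``user`` lengths below are illustrative, since ``user_jt`` is not shown in this hunk:

import torch

# lengths per (feature, batch) slot: product -> [3, 1], user (illustrative) -> [2, 2]
lengths = torch.tensor([3, 1, 2, 2])

# offsets = cumulative sum of lengths, prefixed with 0
offsets = torch.cat([torch.zeros(1, dtype=lengths.dtype), lengths.cumsum(dim=0)])
print(offsets)  # tensor([0, 3, 4, 6, 8])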

@@ -392,7 +392,7 @@
 # Now that we have a grasp on TorchRec modules and data types, it's time
 # to take it to the next level.
 #
-# Remember, TorchRec's main purpose is to provide primitives for
+# Remember, the main purpose of TorchRec is to provide primitives for
 # distributed embeddings. So far, we've only worked with embedding tables
 # on 1 device. This has been possible given how small the embedding tables
 # have been, but in a production setting this isn't generally the case.
@@ -420,7 +420,7 @@
 # Set up environment variables for distributed training
 # RANK is which GPU we are on, default 0
 os.environ["RANK"] = "0"
-# How many devices in our "world", since Bento can only handle 1 process, 1 GPU
+# How many devices in our "world", notebook can only handle 1 process
 os.environ["WORLD_SIZE"] = "1"
 # Localhost as we are training locally
 os.environ["MASTER_ADDR"] = "localhost"
@@ -447,7 +447,7 @@
 # are able to do magnitudes more floating point operations/s
 # (`FLOPs <https://en.wikipedia.org/wiki/FLOPS>`__) than CPU. However,
 # GPUs come with the limitation of scarce fast memory (HBM which is
-# analgous to RAM for CPU), typically ~10s of GBs.
+# analogous to RAM for CPU), typically ~10s of GBs.
 #
 # A RecSys model can contain embedding tables that far exceed the memory
 # limit for 1 GPU, hence the need for distribution of the embedding tables
@@ -496,22 +496,22 @@
 # distributed training/inference.
 #
 # The sharded versions of TorchRec modules, for example
-# EmbeddingBagCollection, will handle everything that is needed for Model
+# ``EmbeddingBagCollection``, will handle everything that is needed for Model
 # Parallelism, such as communication between GPUs for distributing
 # embeddings to the correct GPUs.
 #

-# Refresher of our EmbeddingBagCollection module
+# Refresher of our ``EmbeddingBagCollection`` module
 ebc

 from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
 from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
 from torchrec.distributed.types import ShardingEnv

-# Corresponding sharder for EmbeddingBagCollection module
+# Corresponding sharder for ``EmbeddingBagCollection`` module
 sharder = EmbeddingBagCollectionSharder()

-# ProcessGroup from torch.distributed initialized 2 cells above
+# ``ProcessGroup`` from torch.distributed initialized 2 cells above
 pg = dist.GroupMember.WORLD
 assert pg is not None, "Process group is not initialized"
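A hedged sketch of how the planner imported above can produce the ``plan`` used later in this diff (argument names follow TorchRec's public planner API; the tutorial's own cell may differ slightly):

from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology

# Describe the hardware: a single CUDA device in this 1-process example
planner = EmbeddingShardingPlanner(
    topology=Topology(world_size=1, compute_device="cuda"),
)

# Propose a sharding plan for the unsharded EBC, given the sharder and process group
plan = planner.collective_plan(ebc, [sharder], pg)
print(plan)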

@@ -589,7 +589,7 @@

 env = ShardingEnv.from_process_group(pg)

-# Shard the EmbeddingBagCollection module using the EmbeddingBagCollectionSharder
+# Shard the ``EmbeddingBagCollection`` module using the ``EmbeddingBagCollectionSharder``
 sharded_ebc = sharder.shard(ebc, plan.plan[""], env, torch.device("cuda"))

 print(f"Sharded EBC Module: {sharded_ebc}")
@@ -601,7 +601,7 @@


 ######################################################################
-# Awaitable
+# ``Awaitable``
 # ^^^^^^^^^
 #
 # Remember that TorchRec is a highly optimized library for distributed
@@ -618,7 +618,7 @@
 from torchrec.distributed.types import LazyAwaitable


-# Demonstrate a LazyAwaitable type
+# Demonstrate a ``LazyAwaitable`` type
 class ExampleAwaitable(LazyAwaitable[torch.Tensor]):
     def __init__(self, size: List[int]) -> None:
         super().__init__()
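The class definition above is cut off by the diff context window; a self-contained sketch of what a complete toy ``LazyAwaitable`` subclass could look like (the ``_wait_impl`` body is illustrative, not necessarily the tutorial's exact code):

from typing import List

import torch
from torchrec.distributed.types import LazyAwaitable


class ExampleAwaitable(LazyAwaitable[torch.Tensor]):
    def __init__(self, size: List[int]) -> None:
        super().__init__()
        self._size = size

    def _wait_impl(self) -> torch.Tensor:
        # The result is only materialized when .wait() is called (or the value is used)
        return torch.ones(self._size)


awaitable = ExampleAwaitable([3, 2])
print(awaitable.wait().shape)  # torch.Size([3, 2])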
@@ -633,20 +633,20 @@ def _wait_impl(self) -> torch.Tensor:

 kjt = kjt.to("cuda")
 output = sharded_ebc(kjt)
-# The output of our sharded EmbeddingBagCollection module is a an Awaitable?
+# The output of our sharded ``EmbeddingBagCollection`` module is an `Awaitable`?
 print(output)

 kt = output.wait()
 # Now we have out KeyedTensor after calling .wait()
 # If you are confused as to why we have a KeyedTensor output,
-# give yourself a refresher on the unsharded EmbeddingBagCollection module
+# give yourself a refresher on the unsharded ``EmbeddingBagCollection`` module
 print(type(kt))

 print(kt.keys())

 print(kt.values().shape)

-# Same output format as unsharded EmbeddingBagCollection
+# Same output format as unsharded ``EmbeddingBagCollection``
 result_dict = kt.to_dict()
 for key, embedding in result_dict.items():
     print(key, embedding.shape)
@@ -656,7 +656,7 @@ def _wait_impl(self) -> torch.Tensor:
 # Anatomy of Sharded TorchRec modules
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# We have now successfully sharded an EmbeddingBagCollection given a
+# We have now successfully sharded an ``EmbeddingBagCollection`` given a
 # sharding plan that we generated! The sharded module has common APIs from
 # TorchRec which abstract away distributed communication/compute amongst
 # multiple GPUs. In fact, these APIs are highly optimized for performance
@@ -691,7 +691,7 @@ def _wait_impl(self) -> torch.Tensor:
 # Distribute input KJTs to all other GPUs and receive KJTs
 sharded_ebc._input_dists

-# Distribute output embeddingts to all other GPUs and receive embeddings
+# Distribute output embeddings to all other GPUs and receive embeddings
 sharded_ebc._output_dists


@@ -702,11 +702,11 @@ def _wait_impl(self) -> torch.Tensor:
 # In performing lookups for a collection of embedding tables, a trivial
 # solution would be to iterate through all the ``nn.EmbeddingBags`` and do
 # a lookup per table. This is exactly what the standard, unsharded
-# TorchRec's ``EmbeddingBagCollection`` does. However, while this solution
+# ``EmbeddingBagCollection`` does. However, while this solution
 # is simple, it is extremely slow.
 #
 # `FBGEMM <https://github.com/pytorch/FBGEMM/tree/main/fbgemm_gpu>`__ is a
-# library that provides GPU operators (otherewise known as kernels) that
+# library that provides GPU operators (otherwise known as kernels) that
 # are very optimized. One of these operators is known as **Table Batched
 # Embedding** (TBE), provides two major optimizations:
 #
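To see the effect of TBE in practice, the sharded module's lookup submodules can be printed; a hedged sketch (``_lookups`` is an internal attribute, like the ``_input_dists``/``_output_dists`` shown earlier, so its exact contents depend on the TorchRec version):

# With FBGEMM available, the lookups wrap a Table Batched Embedding (TBE) kernel
# that fuses the per-table lookups into one batched operation
print(sharded_ebc._lookups)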
@@ -724,10 +724,10 @@ def _wait_impl(self) -> torch.Tensor:


 ######################################################################
-# DistributedModelParallel
+# ``DistributedModelParallel``
 # ~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# We have now explored sharding a single EmbeddingBagCollection! We were
+# We have now explored sharding a single ``EmbeddingBagCollection``! We were
 # able to take the ``EmbeddingBagCollectionSharder`` and use the unsharded
 # ``EmbeddingBagCollection`` to generate a
 # ``ShardedEmbeddingBagCollection`` module. This workflow is fine, but
@@ -738,14 +738,14 @@ def _wait_impl(self) -> torch.Tensor:
 #
 # 1. Decide how to shard the model. DMP will collect the available
 # ‘sharders’ and come up with a ‘plan’ of the optimal way to shard the
-# embedding table(s) (i.e, the EmbeddingBagCollection)
+# embedding table(s) (i.e, the ``EmbeddingBagCollection``)
 # 2. Actually shard the model. This includes allocating memory for each
 # embedding table on the appropriate device(s).
 #
 # DMP takes in everything that we've just experimented with, like a static
 # sharding plan, a list of sharders, etc. However, it also has some nice
 # defaults to seamlessly shard a TorchRec model. In this toy example,
-# since we have two EmbeddingTables and one GPU, TorchRec will place both
+# since we have two embedding tables and one GPU, TorchRec will place both
 # on the single GPU.
 #
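A hedged sketch of the ``DistributedModelParallel`` entry point this section introduces (arguments follow TorchRec's public API; with defaults, DMP collects the available sharders and generates the plan itself):

import torch
from torchrec.distributed.model_parallel import DistributedModelParallel

# Wrap the unsharded EBC; DMP plans and shards it onto the single GPU
model = DistributedModelParallel(ebc, device=torch.device("cuda"))
print(model)
print(model.plan)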

@@ -824,7 +824,7 @@ def _wait_impl(self) -> torch.Tensor:
 # ``CombinedOptimizer`` that you can use in your training loop to
 # ``zero_grad`` and ``step`` through.
 #
-# Let's add an optimizer to our EmbeddingBagCollection
+# Let's add an optimizer to our ``EmbeddingBagCollection``
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
 # We will do this in two ways, which are equivalent, but give you options
@@ -847,13 +847,13 @@ def _wait_impl(self) -> torch.Tensor:
     "eps": 0.002,
 }

-# Init sharder with fused_params
+# Initialize sharder with fused_params
 sharder_with_fused_params = EmbeddingBagCollectionSharder(fused_params=fused_params)

 # We'll use same plan and unsharded EBC as before but this time with our new sharder
 sharded_ebc_fused_params = sharder_with_fused_params.shard(ebc, plan.plan[""], env, torch.device("cuda"))

-# Looking at the optimizer of each, we can see that the learning rate changed, which indicates our optimizer has been applied correclty.
+# Looking at the optimizer of each, we can see that the learning rate changed, which indicates our optimizer has been applied correctly.
 # If seen, we can also look at the TBE logs of the cell to see that our new optimizer is indeed being applied
 print(f"Original Sharded EBC fused optimizer: {sharded_ebc.fused_optimizer}")
 print(f"Sharded EBC with fused parameters fused optimizer: {sharded_ebc_fused_params.fused_optimizer}")
@@ -880,7 +880,7 @@ def _wait_impl(self) -> torch.Tensor:
 print(type(sharded_ebc_apply_opt.fused_optimizer))

 # We can also check through the filter other parameters that aren't associated with the "fused" optimizer(s)
-# Pratically, just non TorchRec module parameters. Since our module is just a TorchRec EBC
+# Practically, just non TorchRec module parameters. Since our module is just a TorchRec EBC
 # there are no other parameters that aren't associated with TorchRec
 print("Non Fused Model Parameters:")
 print(dict(in_backward_optimizer_filter(sharded_ebc_fused_params.named_parameters())).keys())
@@ -972,7 +972,7 @@ def forward(self, kjt: KeyedJaggedTensor):

 qconfig = QuantConfig(
     # dtype of the result of the embedding lookup, post activation
-    # torch.float generally for compatability with rest of the model
+    # torch.float generally for compatibility with rest of the model
     # as rest of the model here usually isn't quantized
     activation=quant.PlaceholderObserver.with_args(dtype=torch.float),
     # quantized type for embedding weights, aka parameters to actually quantize
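For context on the observer pattern above: the activation observer keeps lookup outputs in ``torch.float``, while the embedding weights are the part that actually gets quantized, typically to an 8-bit dtype. A standalone sketch of such a pair of observers (an assumption about the truncated config, not the tutorial's exact code):

import torch
import torch.quantization as quant

# Post-lookup activations stay in float for compatibility with the unquantized rest of the model
activation_observer = quant.PlaceholderObserver.with_args(dtype=torch.float)

# Embedding weights are quantized, for example to 8-bit integers
weight_observer = quant.PlaceholderObserver.with_args(dtype=torch.qint8)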

0 commit comments
