diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst
index 0ae159c37..1417d7aea 100644
--- a/docs/source/api_ref_decoders.rst
+++ b/docs/source/api_ref_decoders.rst
@@ -19,6 +19,12 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_decoding_a
     VideoDecoder
     AudioDecoder
 
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+    :template: function.rst
+
+    set_cuda_backend
 
 .. autosummary::
     :toctree: generated/
diff --git a/examples/decoding/basic_cuda_example.py b/examples/decoding/basic_cuda_example.py
index 7c29e4475..8f82940c0 100644
--- a/examples/decoding/basic_cuda_example.py
+++ b/examples/decoding/basic_cuda_example.py
@@ -94,9 +94,10 @@
 #
 # To use CUDA decoder, you need to pass in a cuda device to the decoder.
 #
-from torchcodec.decoders import VideoDecoder
+from torchcodec.decoders import set_cuda_backend, VideoDecoder
 
-decoder = VideoDecoder(video_file, device="cuda")
+with set_cuda_backend("beta"):  # Use the BETA backend, it's faster!
+    decoder = VideoDecoder(video_file, device="cuda")
 frame = decoder[0]
 
 # %%
@@ -120,7 +121,8 @@
 # against equivalent results from the CPU decoders.
 timestamps = [12, 19, 45, 131, 180]
 cpu_decoder = VideoDecoder(video_file, device="cpu")
-cuda_decoder = VideoDecoder(video_file, device="cuda")
+with set_cuda_backend("beta"):
+    cuda_decoder = VideoDecoder(video_file, device="cuda")
 cpu_frames = cpu_decoder.get_frames_played_at(timestamps).data
 cuda_frames = cuda_decoder.get_frames_played_at(timestamps).data
 
diff --git a/src/torchcodec/decoders/_decoder_utils.py b/src/torchcodec/decoders/_decoder_utils.py
index 549756b81..2619acd24 100644
--- a/src/torchcodec/decoders/_decoder_utils.py
+++ b/src/torchcodec/decoders/_decoder_utils.py
@@ -66,20 +66,29 @@ def set_cuda_backend(backend: str) -> Generator[None, None, None]:
 
     This context manager allows you to specify which CUDA backend implementation
     to use when creating :class:`~torchcodec.decoders.VideoDecoder` instances
-    with CUDA devices.  This is thread-safe and async-safe.
+    with CUDA devices.
 
-    Note that you still need to pass `device="cuda"` when creating the
-    :class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
-    specified, this context manager will have no effect.
+    .. note::
+        **We recommend trying the "beta" backend instead of the default "ffmpeg"
+        backend!** The beta backend is faster, and will eventually become the
+        default. It may have rough edges that we'll polish
+        over time, but it's already quite stable and ready for adoption. Let us
+        know what you think!
 
     Only the creation of the decoder needs to be inside the context manager, the
-    decoding methods can be called outside of it.
+    decoding methods can be called outside of it. You still need to pass
+    ``device="cuda"`` when creating the
+    :class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
+    specified, this context manager will have no effect. See the example below.
+
+    This is thread-safe and async-safe.
 
     Args:
-        backend (str): The CUDA backend to use. Can be "ffmpeg" or "beta". Default is "ffmpeg".
+        backend (str): The CUDA backend to use. Can be "ffmpeg" (the default
+            backend) or "beta". We recommend trying "beta" as it's faster!
 
     Example:
-        >>> with torchcodec.set_cuda_backend("beta"):
+        >>> with set_cuda_backend("beta"):
         ...     decoder = VideoDecoder("video.mp4", device="cuda")
         ...
         ... # Only the decoder creation needs to be part of the context manager.
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index f22f5a3fc..331c7ba79 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -56,6 +56,8 @@ class VideoDecoder:
             Passing 0 lets FFmpeg decide on the number of threads.
             Default: 1.
         device (str or torch.device, optional): The device to use for decoding. Default: "cpu".
+            If you pass a CUDA device, we recommend trying the "beta" CUDA
+            backend, which is faster! See :func:`~torchcodec.decoders.set_cuda_backend`.
         seek_mode (str, optional): Determines if frame access will be "exact" or
             "approximate". Exact guarantees that requesting frame i will always
             return frame i, but doing so requires an initial :term:`scan` of the