README.md (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ https://pytorch.org/examples/
- [Variational Auto-Encoders](./vae/README.md)
- [Superresolution using an efficient sub-pixel convolutional neural network](./super_resolution/README.md)
- [Hogwild training of shared ConvNets across multiple processes on MNIST](mnist_hogwild)
-- [Training a CartPole to balance in OpenAI Gym with actor-critic](./reinforcement_learning/README.md)
+- [Training a CartPole to balance with actor-critic](./reinforcement_learning/README.md)
- [Natural Language Inference (SNLI) with GloVe vectors, LSTMs, and torchtext](snli)
- [Time sequence prediction - use an LSTM to learn Sine waves](./time_sequence_prediction/README.md)
- [Implement the Neural Style Transfer algorithm on images](./fast_neural_style/README.md)
distributed/rpc/batch/reinforce.py (11 changes: 5 additions & 6 deletions)
@@ -1,5 +1,5 @@
import argparse
-import gym
+import gymnasium as gym
import os
import threading
import time
@@ -68,7 +68,7 @@ class Observer:
def __init__(self, batch=True):
self.id = rpc.get_worker_info().id - 1
self.env = gym.make('CartPole-v1')
-self.env.seed(args.seed)
+self.env.reset(seed=args.seed)
self.select_action = Agent.select_action_batch if batch else Agent.select_action

def run_episode(self, agent_rref, n_steps):
@@ -92,10 +92,10 @@ def run_episode(self, agent_rref, n_steps):
)

# apply the action to the environment, and get the reward
-state, reward, done, _ = self.env.step(action)
+state, reward, terminated, truncated, _ = self.env.step(action)
rewards[step] = reward

-if done or step + 1 >= n_steps:
+if terminated or truncated or step + 1 >= n_steps:
curr_rewards = rewards[start_step:(step + 1)]
R = 0
for i in range(curr_rewards.numel() -1, -1, -1):
Expand Down Expand Up @@ -226,8 +226,7 @@ def run_worker(rank, world_size, n_episode, batch, print_log=True):
last_reward, running_reward = agent.run_episode(n_steps=NUM_STEPS)

if print_log:
-print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
-i_episode, last_reward, running_reward))
+print(f'Episode {i_episode}\tLast reward: {last_reward:.2f}\tAverage reward: {running_reward:.2f}')
else:
# other ranks are the observer
rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)
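For reference, a minimal sketch of the Gymnasium calling convention the hunks above adopt; the environment id matches the example, while the seed value and the random policy are purely illustrative and not part of this PR:

import gymnasium as gym

env = gym.make('CartPole-v1')

# Seeding moved from env.seed(seed) to reset(seed=...), and reset() now returns
# an (observation, info) pair instead of just the observation.
state, info = env.reset(seed=543)

for _ in range(500):
    action = env.action_space.sample()  # a trained agent would choose the action here
    # step() returns five values: the old single `done` flag is split into
    # `terminated` (a terminal state was reached) and `truncated` (the time limit was hit)
    state, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        state, info = env.reset()

env.close()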
distributed/rpc/batch/requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
torch==2.2.0
torchvision==0.7.0
numpy
-gym
+gymnasium
docs/source/index.rst (6 changes: 3 additions & 3 deletions)
@@ -88,12 +88,12 @@ experiment with PyTorch.
`GO TO EXAMPLE <https://github.com/pytorch/examples/blob/main/mnist_hogwild>`__ :opticon:`link-external`

---
-Training a CartPole to balance in OpenAI Gym with actor-critic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Training a CartPole to balance with actor-critic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This reinforcement learning tutorial demonstrates how to train a
CartPole to balance
-in the `OpenAI Gym <https://gym.openai.com/>`__ toolkit by using the
+in the `Gymnasium <https://gymnasium.farama.org/>`__ toolkit by using the
`Actor-Critic <https://proceedings.neurips.cc/paper/1999/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf>`__ method.

`GO TO EXAMPLE <https://github.com/pytorch/examples/blob/main/reinforcement_learning>`__ :opticon:`link-external`
reinforcement_learning/actor_critic.py (19 changes: 8 additions & 11 deletions)
@@ -1,5 +1,5 @@
import argparse
-import gym
+import gymnasium as gym
import numpy as np
from itertools import count
from collections import namedtuple
@@ -24,7 +24,8 @@
args = parser.parse_args()


-env = gym.make('CartPole-v1')
+render_mode = "human" if args.render else None
+env = gym.make('CartPole-v1', render_mode=render_mode)
env.reset(seed=args.seed)
torch.manual_seed(args.seed)

@@ -152,14 +153,11 @@ def main():
action = select_action(state)

# take the action
-state, reward, done, _, _ = env.step(action)
-
-if args.render:
-    env.render()
+state, reward, terminated, truncated, _ = env.step(action)

model.rewards.append(reward)
ep_reward += reward
-if done:
+if terminated or truncated:
break

# update cumulative reward
@@ -170,13 +168,12 @@

# log results
if i_episode % args.log_interval == 0:
-print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
-i_episode, ep_reward, running_reward))
+print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}')

# check if we have "solved" the cart pole problem
if running_reward > env.spec.reward_threshold:
print("Solved! Running reward is now {} and "
"the last episode runs to {} time steps!".format(running_reward, t))
print(f"Solved! Running reward is now {running_reward} and "
f"the last episode runs to {t} time steps!")
break


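A note on the rendering change in this file: Gymnasium selects the render backend when the environment is constructed, so the per-step env.render() call becomes unnecessary. A minimal sketch of the pattern, in which the flag only stands in for the example's --render argument and the seed is illustrative:

import gymnasium as gym

render = False  # stands in for args.render from the example's argparse
env = gym.make('CartPole-v1', render_mode="human" if render else None)

# With render_mode="human", a window is drawn automatically on every
# reset()/step() call, so no explicit env.render() is needed in the loop.
state, _ = env.reset(seed=543)
state, reward, terminated, truncated, _ = env.step(env.action_space.sample())
env.close()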
reinforcement_learning/reinforce.py (15 changes: 7 additions & 8 deletions)
@@ -1,5 +1,5 @@
import argparse
-import gym
+import gymnasium as gym
import numpy as np
from itertools import count
from collections import deque
@@ -22,7 +22,8 @@
args = parser.parse_args()


-env = gym.make('CartPole-v1')
+render_mode = "human" if args.render else None
+env = gym.make('CartPole-v1', render_mode=render_mode)
env.reset(seed=args.seed)
torch.manual_seed(args.seed)

@@ -85,22 +86,20 @@ def main():
ep_reward = 0
for t in range(1, 10000): # Don't infinite loop while learning
action = select_action(state)
-state, reward, done, _, _ = env.step(action)
+state, reward, terminated, truncated, _ = env.step(action)
if args.render:
env.render()
policy.rewards.append(reward)
ep_reward += reward
-if done:
+if terminated or truncated:
break

running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
finish_episode()
if i_episode % args.log_interval == 0:
-print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
-i_episode, ep_reward, running_reward))
+print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}')
if running_reward > env.spec.reward_threshold:
print("Solved! Running reward is now {} and "
"the last episode runs to {} time steps!".format(running_reward, t))
print(f"Solved! Running reward is now {running_reward} and the last episode runs to {t} time steps!")
break


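One behavioural note on the done -> terminated/truncated split used in both training loops: on CartPole-v1 an episode can end either because the pole fell (terminated) or because the default 500-step time limit was reached (truncated), and training must stop stepping in both cases. A small sketch, assuming only the TimeLimit wrapper that gym.make applies by default; the random policy is illustrative:

import gymnasium as gym

env = gym.make('CartPole-v1')  # gym.make wraps CartPole-v1 in a 500-step TimeLimit
state, _ = env.reset(seed=0)

while True:
    state, reward, terminated, truncated, _ = env.step(env.action_space.sample())
    # terminated: the pole fell over or the cart left the track (an MDP terminal state)
    # truncated: the episode was cut short by the time limit
    if terminated or truncated:
        break
env.close()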
reinforcement_learning/requirements.txt (5 changes: 2 additions & 3 deletions)
@@ -1,4 +1,3 @@
torch
numpy<2
gym
pygame
numpy
gymnasium[classic-control]