Skip to content

Commit b9e932a

Browse files
William Fedus and psc-g
authored and committed
Explicitly set the terminal_dtype in the replay memories.
PiperOrigin-RevId: 256687171
1 parent 491e786 commit b9e932a

File tree

3 files changed

+29
-4
lines changed

3 files changed

+29
-4
lines changed

dopamine/replay_memory/circular_replay_buffer.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def __init__(self,
105105
max_sample_attempts=1000,
106106
extra_storage_types=None,
107107
observation_dtype=np.uint8,
108+
terminal_dtype=np.uint8,
108109
action_shape=(),
109110
action_dtype=np.int32,
110111
reward_shape=(),
@@ -124,6 +125,8 @@ def __init__(self,
124125
contents that will be stored and returned by sample_transition_batch.
125126
observation_dtype: np.dtype, type of the observations. Defaults to
126127
np.uint8 for Atari 2600.
128+
terminal_dtype: np.dtype, type of the terminals. Defaults to np.uint8 for
129+
Atari 2600.
127130
action_shape: tuple of ints, the shape for the action vector. Empty tuple
128131
means the action is a scalar.
129132
action_dtype: np.dtype, type of elements in the action.
@@ -145,6 +148,7 @@ def __init__(self,
145148
self.__class__.__name__)
146149
tf.logging.info('\t observation_shape: %s', str(observation_shape))
147150
tf.logging.info('\t observation_dtype: %s', str(observation_dtype))
151+
tf.logging.info('\t terminal_dtype: %s', str(terminal_dtype))
148152
tf.logging.info('\t stack_size: %d', stack_size)
149153
tf.logging.info('\t replay_capacity: %d', replay_capacity)
150154
tf.logging.info('\t batch_size: %d', batch_size)
@@ -163,6 +167,7 @@ def __init__(self,
163167
self._update_horizon = update_horizon
164168
self._gamma = gamma
165169
self._observation_dtype = observation_dtype
170+
self._terminal_dtype = terminal_dtype
166171
self._max_sample_attempts = max_sample_attempts
167172
if extra_storage_types:
168173
self._extra_storage_types = extra_storage_types
@@ -210,7 +215,7 @@ def get_storage_signature(self):
210215
self._observation_dtype),
211216
ReplayElement('action', self._action_shape, self._action_dtype),
212217
ReplayElement('reward', self._reward_shape, self._reward_dtype),
213-
ReplayElement('terminal', (), np.uint8)
218+
ReplayElement('terminal', (), self._terminal_dtype)
214219
]
215220

216221
for extra_replay_element in self._extra_storage_types:
@@ -241,7 +246,7 @@ def add(self, observation, action, reward, terminal, *args):
241246
observation: np.array with shape observation_shape.
242247
action: int, the action in the transition.
243248
reward: float, the reward received in the transition.
244-
terminal: A uint8 acting as a boolean indicating whether the transition
249+
terminal: np.dtype, acts as a boolean indicating whether the transition
245250
was terminal (1) or not (0).
246251
*args: extra contents with shapes and dtypes according to
247252
extra_storage_types.
@@ -555,7 +560,7 @@ def get_transition_elements(self, batch_size=None):
555560
self._action_dtype),
556561
ReplayElement('next_reward', (batch_size,) + self._reward_shape,
557562
self._reward_dtype),
558-
ReplayElement('terminal', (batch_size,), np.uint8),
563+
ReplayElement('terminal', (batch_size,), self._terminal_dtype),
559564
ReplayElement('indices', (batch_size,), np.int32)
560565
]
561566
for element in self._extra_storage_types:
@@ -687,6 +692,7 @@ def __init__(self,
687692
max_sample_attempts=1000,
688693
extra_storage_types=None,
689694
observation_dtype=np.uint8,
695+
terminal_dtype=np.uint8,
690696
action_shape=(),
691697
action_dtype=np.int32,
692698
reward_shape=(),
@@ -710,6 +716,8 @@ def __init__(self,
710716
contents that will be stored and returned by sample_transition_batch.
711717
observation_dtype: np.dtype, type of the observations. Defaults to
712718
np.uint8 for Atari 2600.
719+
terminal_dtype: np.dtype, type of the terminals. Defaults to np.uint8 for
720+
Atari 2600.
713721
action_shape: tuple of ints, the shape for the action vector. Empty tuple
714722
means the action is a scalar.
715723
action_dtype: np.dtype, type of elements in the action.
@@ -745,6 +753,7 @@ def __init__(self,
745753
gamma,
746754
max_sample_attempts,
747755
observation_dtype=observation_dtype,
756+
terminal_dtype=terminal_dtype,
748757
extra_storage_types=extra_storage_types,
749758
action_shape=action_shape,
750759
action_dtype=action_dtype,
@@ -765,7 +774,7 @@ def add(self, observation, action, reward, terminal, *args):
765774
observation: np.array with shape observation_shape.
766775
action: int, the action in the transition.
767776
reward: float, the reward received in the transition.
768-
terminal: A uint8 acting as a boolean indicating whether the transition
777+
terminal: np.dtype, acts as a boolean indicating whether the transition
769778
was terminal (1) or not (0).
770779
*args: extra contents with shapes and dtypes according to
771780
extra_storage_types.

dopamine/replay_memory/prioritized_replay_buffer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def __init__(self,
5050
max_sample_attempts=1000,
5151
extra_storage_types=None,
5252
observation_dtype=np.uint8,
53+
terminal_dtype=np.uint8,
5354
action_shape=(),
5455
action_dtype=np.int32,
5556
reward_shape=(),
@@ -69,6 +70,8 @@ def __init__(self,
6970
contents that will be stored and returned by sample_transition_batch.
7071
observation_dtype: np.dtype, type of the observations. Defaults to
7172
np.uint8 for Atari 2600.
73+
terminal_dtype: np.dtype, type of the terminals. Defaults to np.uint8 for
74+
Atari 2600.
7275
action_shape: tuple of ints, the shape for the action vector. Empty tuple
7376
means the action is a scalar.
7477
action_dtype: np.dtype, type of elements in the action.
@@ -86,6 +89,7 @@ def __init__(self,
8689
max_sample_attempts=max_sample_attempts,
8790
extra_storage_types=extra_storage_types,
8891
observation_dtype=observation_dtype,
92+
terminal_dtype=terminal_dtype,
8993
action_shape=action_shape,
9094
action_dtype=action_dtype,
9195
reward_shape=reward_shape,
@@ -274,6 +278,7 @@ def __init__(self,
274278
max_sample_attempts=1000,
275279
extra_storage_types=None,
276280
observation_dtype=np.uint8,
281+
terminal_dtype=np.uint8,
277282
action_shape=(),
278283
action_dtype=np.int32,
279284
reward_shape=(),
@@ -295,6 +300,8 @@ def __init__(self,
295300
contents that will be stored and returned by sample_transition_batch.
296301
observation_dtype: np.dtype, type of the observations. Defaults to
297302
np.uint8 for Atari 2600.
303+
terminal_dtype: np.dtype, type of the terminals. Defaults to np.uint8 for
304+
Atari 2600.
298305
action_shape: tuple of ints, the shape for the action vector. Empty tuple
299306
means the action is a scalar.
300307
action_dtype: np.dtype, type of elements in the action.
@@ -322,6 +329,7 @@ def __init__(self,
322329
wrapped_memory=memory,
323330
extra_storage_types=extra_storage_types,
324331
observation_dtype=observation_dtype,
332+
terminal_dtype=terminal_dtype,
325333
action_shape=action_shape,
326334
action_dtype=action_dtype,
327335
reward_shape=reward_shape,

tests/dopamine/replay_memory/circular_replay_buffer_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ def testConstructor(self):
7979
batch_size=BATCH_SIZE)
8080
self.assertEqual(memory._observation_shape, (4, 20))
8181
self.assertEqual(memory.add_count, 0)
82+
# Test with terminal datatype of np.int32
83+
memory = circular_replay_buffer.OutOfGraphReplayBuffer(
84+
observation_shape=OBSERVATION_SHAPE,
85+
stack_size=STACK_SIZE,
86+
terminal_dtype=np.int32,
87+
replay_capacity=5,
88+
batch_size=BATCH_SIZE)
89+
self.assertEqual(memory._terminal_dtype, np.int32)
8290

8391
def testAdd(self):
8492
memory = circular_replay_buffer.OutOfGraphReplayBuffer(

0 commit comments

Comments (0)