@@ -1,14 +1,14 @@
-import numpy as np
 import tensorflow as tf
 import tensorflow.keras as keras
 from tensorflow.keras.optimizers import Adam
 from buffer import ReplayBuffer
 from networks import ActorNetwork, CriticNetwork
 
+
 class Agent:
     def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
-            gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
-            fc1=400, fc2=300, batch_size=64, noise=0.1):
+                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
+                 fc1=400, fc2=300, batch_size=64, noise=0.1):
         self.gamma = gamma
         self.tau = tau
         self.memory = ReplayBuffer(max_size, input_dims, n_actions)
@@ -17,11 +17,12 @@ def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
         self.noise = noise
         self.max_action = env.action_space.high[0]
         self.min_action = env.action_space.low[0]
-
+
         self.actor = ActorNetwork(n_actions=n_actions, name='actor')
-        self.critic = CriticNetwork(n_actions=n_actions, name='critic')
-        self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
-        self.target_critic = CriticNetwork(n_actions=n_actions, name='target_critic')
+        self.critic = CriticNetwork(name='critic')
+        self.target_actor = ActorNetwork(n_actions=n_actions,
+                                         name='target_actor')
+        self.target_critic = CriticNetwork(name='target_critic')
 
         self.actor.compile(optimizer=Adam(learning_rate=alpha))
         self.critic.compile(optimizer=Adam(learning_rate=beta))
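For orientation, a minimal usage sketch of this constructor (not part of the commit): the action bounds are read from env.action_space, so a Gym-style continuous-control environment has to be supplied. gym, Pendulum-v1, and the module name ddpg_tf2 are assumptions, not taken from this diff.

```python
import gym
from ddpg_tf2 import Agent  # hypothetical module name for the file patched above

env = gym.make('Pendulum-v1')          # any continuous-action Gym env works
agent = Agent(input_dims=env.observation_space.shape, env=env,
              n_actions=env.action_space.shape[0])
```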
@@ -68,8 +69,8 @@ def choose_action(self, observation, evaluate=False):
         actions = self.actor(state)
         if not evaluate:
             actions += tf.random.normal(shape=[self.n_actions],
-                    mean=0.0, stddev=self.noise)
-        # note that if the environment has an action > 1, we have to multiply by
+                                        mean=0.0, stddev=self.noise)
+        # note that if the env has an action > 1, we have to multiply by
         # max action at some point
         actions = tf.clip_by_value(actions, self.min_action, self.max_action)
 
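The in-code note above is about action scaling: if the actor's output is bounded to [-1, 1] (for example by a tanh output layer, which networks.py is assumed to use but is not shown in this diff), environments with |action| > 1 need the output rescaled by max_action before clipping. A hedged sketch of that scaling:

```python
actions = self.actor(state)
actions = actions * self.max_action   # rescale from the assumed [-1, 1] range to the env's bounds
if not evaluate:
    actions += tf.random.normal(shape=[self.n_actions],
                                mean=0.0, stddev=self.noise)
actions = tf.clip_by_value(actions, self.min_action, self.max_action)
```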
@@ -80,7 +81,7 @@ def learn(self):
             return
 
         state, action, reward, new_state, done = \
-                self.memory.sample_buffer(self.batch_size)
+            self.memory.sample_buffer(self.batch_size)
 
         states = tf.convert_to_tensor(state, dtype=tf.float32)
         states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
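buffer.py is not part of this diff; judging by how the return values are used here, sample_buffer is assumed to draw a random minibatch and return five NumPy arrays (states, actions, rewards, next states, terminal flags). A sketch under that assumption, with made-up attribute names:

```python
import numpy as np

def sample_buffer(self, batch_size):
    filled = min(self.mem_cntr, self.mem_size)        # how many slots hold real data
    batch = np.random.choice(filled, batch_size)      # random minibatch indices
    return (self.state_memory[batch], self.action_memory[batch],
            self.reward_memory[batch], self.new_state_memory[batch],
            self.terminal_memory[batch])
```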
@@ -92,11 +93,11 @@ def learn(self):
             critic_value_ = tf.squeeze(self.target_critic(
                                 states_, target_actions), 1)
             critic_value = tf.squeeze(self.critic(states, actions), 1)
-            target = reward + self.gamma * critic_value_ * (1 - done)
+            target = rewards + self.gamma * critic_value_ * (1 - done)
             critic_loss = keras.losses.MSE(target, critic_value)
 
         critic_network_gradient = tape.gradient(critic_loss,
-                self.critic.trainable_variables)
+                                                self.critic.trainable_variables)
         self.critic.optimizer.apply_gradients(zip(
             critic_network_gradient, self.critic.trainable_variables))
 
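To make the target line concrete, a small worked example with made-up numbers; the (1 - done) factor is what drops the bootstrap term on terminal transitions:

```python
gamma = 0.99
reward, q_next = 1.0, 5.0                                 # illustrative values only
target_nonterminal = reward + gamma * q_next * (1 - 0)    # 5.95: bootstraps from the target critic
target_terminal = reward + gamma * q_next * (1 - 1)       # 1.00: no bootstrap at episode end
```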
@@ -105,8 +106,8 @@ def learn(self):
             actor_loss = -self.critic(states, new_policy_actions)
             actor_loss = tf.math.reduce_mean(actor_loss)
 
-        actor_network_gradient = tape.gradient(actor_loss,
-                self.actor.trainable_variables)
+        actor_network_gradient = tape.gradient(actor_loss,
+                                               self.actor.trainable_variables)
         self.actor.optimizer.apply_gradients(zip(
             actor_network_gradient, self.actor.trainable_variables))
 
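The hunks shown end before the target networks are refreshed. In DDPG that is done with a soft (Polyak) update driven by tau; a sketch of such an update_network_parameters method, assuming it follows the usual Keras get/set-weights pattern (the actual method lies outside this diff):

```python
def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau

    # target_w <- tau * online_w + (1 - tau) * target_w, for actor and critic
    weights = []
    for w, target_w in zip(self.actor.weights, self.target_actor.weights):
        weights.append(tau * w + (1 - tau) * target_w)
    self.target_actor.set_weights(weights)

    weights = []
    for w, target_w in zip(self.critic.weights, self.target_critic.weights):
        weights.append(tau * w + (1 - tau) * target_w)
    self.target_critic.set_weights(weights)
```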