1+ import tensorflow as tf
2+ import numpy as np
3+
class MarketEnvironment:
    """Portfolio-allocation environment driven by a matrix of asset returns.

    Parameters
    ----------
    returns : np.ndarray
        2-D array of per-period asset returns, shape (timesteps, n_assets)
        (assumed 2-D — reset() reads ``returns.shape[1]``).
    initial_balance : float
        Starting account balance.
    transaction_cost : float
        Proportional cost charged on the L1 change in allocation weights.
    """

    def __init__(self, returns, initial_balance=10000, transaction_cost=0.001):
        self.returns = returns
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost
        self.reset()

    def reset(self):
        """Restore the initial balance and a flat position; return the first state."""
        self.balance = self.initial_balance
        self.position = np.zeros(self.returns.shape[1])
        self.time = 0
        return self._get_state()

    def step(self, action):
        """Apply ``action`` (allocation weights) and advance one period.

        Returns
        -------
        (next_state, reward, done)

        Raises
        ------
        IndexError
            If called after the episode has ended (guarded up front so the
            balance is not corrupted by a partial update before the crash).
        """
        if self.time >= len(self.returns):
            raise IndexError("step() called after episode end; call reset() first")

        old_position = self.position
        # Copy the action: the original stored a reference, so a caller that
        # mutated its array afterwards would corrupt next step's turnover.
        self.position = np.array(action, dtype=float)

        # Proportional transaction cost on turnover (L1 change in weights).
        turnover = np.sum(np.abs(self.position - old_position))
        self.balance -= turnover * self.balance * self.transaction_cost

        # Apply this period's portfolio return.
        self.balance *= 1 + np.sum(self.position * self.returns[self.time])

        self.time += 1
        done = self.time >= len(self.returns)

        return self._get_state(), self._get_reward(), done

    def _get_state(self):
        # State = current weights, current balance, and this period's asset
        # returns (zeros once past the final period).
        return np.concatenate([
            self.position,
            [self.balance],
            self.returns[self.time] if self.time < len(self.returns)
            else np.zeros_like(self.returns[0]),
        ])

    def _get_reward(self):
        # Cumulative log return since reset (NOT a per-step increment).
        return np.log(self.balance / self.initial_balance)
41+
class Actor(tf.keras.Model):
    """Policy network: maps a state vector to a softmax distribution over actions."""

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_layers = [
            tf.keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
            tf.keras.layers.Dense(64, activation='relu'),
        ]
        output_head = tf.keras.layers.Dense(action_dim, activation='softmax')
        self.model = tf.keras.Sequential(hidden_layers + [output_head])

    def call(self, state):
        """Return action probabilities, shape (batch, action_dim)."""
        return self.model(state)
53+
class Critic(tf.keras.Model):
    """Value network: maps a state vector to a scalar state-value estimate."""

    def __init__(self, state_dim):
        super().__init__()
        hidden_layers = [
            tf.keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
            tf.keras.layers.Dense(64, activation='relu'),
        ]
        value_head = tf.keras.layers.Dense(1)
        self.model = tf.keras.Sequential(hidden_layers + [value_head])

    def call(self, state):
        """Return the value estimate, shape (batch, 1)."""
        return self.model(state)
65+
class RLDynamicAllocation(tf.keras.Model):
    """One-step (online TD) actor-critic agent for dynamic portfolio allocation."""

    def __init__(self, state_dim, action_dim, lr_actor=0.0001, lr_critic=0.001):
        super().__init__()
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)
        self.actor_optimizer = tf.keras.optimizers.Adam(lr_actor)
        self.critic_optimizer = tf.keras.optimizers.Adam(lr_critic)

    def train(self, env, episodes=1000):
        """Train online for ``episodes`` episodes.

        ``env`` must expose ``reset() -> state`` and
        ``step(action) -> (next_state, reward, done)``.
        """
        for episode in range(episodes):
            state = env.reset()
            done = False
            while not done:
                with tf.GradientTape() as tape_actor, tf.GradientTape() as tape_critic:
                    action_probs = self.actor(tf.convert_to_tensor([state], dtype=tf.float32))
                    action = tf.random.categorical(tf.math.log(action_probs), 1)[0, 0]
                    # BUG FIX: derive the action count from the policy output
                    # instead of env.action_space.n, which MarketEnvironment
                    # does not define (the original raised AttributeError here).
                    num_actions = action_probs.shape[-1]
                    action_onehot = tf.one_hot(action, num_actions)

                    next_state, reward, done = env.step(action_onehot.numpy())

                    critic_value = self.critic(tf.convert_to_tensor([state], dtype=tf.float32))
                    next_critic_value = self.critic(tf.convert_to_tensor([next_state], dtype=tf.float32))

                    # One-step TD error. The bootstrap target is held fixed
                    # (stop_gradient) so the critic regresses toward it rather
                    # than differentiating through its own moving target.
                    td_target = reward + 0.99 * tf.stop_gradient(next_critic_value) * (1.0 - float(done))
                    advantage = td_target - critic_value
                    # The advantage is a constant w.r.t. the policy (it is a
                    # baseline-corrected weight), so detach it in the actor loss.
                    actor_loss = -tf.math.log(action_probs[0, action]) * tf.stop_gradient(advantage)
                    critic_loss = advantage ** 2

                actor_grads = tape_actor.gradient(actor_loss, self.actor.trainable_variables)
                critic_grads = tape_critic.gradient(critic_loss, self.critic.trainable_variables)

                self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
                self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

                state = next_state

    def get_action(self, state):
        """Return the policy's action-probability vector for a single state."""
        action_probs = self.actor(tf.convert_to_tensor([state], dtype=tf.float32))
        return action_probs.numpy()[0]
# (removed stray "0 commit comments" web-page artifact that made the file invalid Python)