Binary file added __pycache__/actor_critic.cpython-36.pyc
Binary file added __pycache__/dqn.cpython-36.pyc
Binary file added __pycache__/simulator.cpython-36.pyc
Binary file added __pycache__/train.cpython-36.pyc
29 changes: 18 additions & 11 deletions simulator.py
@@ -59,8 +59,8 @@ class UE:
state_dim = len(state_var['scalar']) + RBG_NUM * len(state_var['vector'])
attr_range = {
'buffer': (int(1e3), int(1e6)),
- 'rsrp': (-120, -90),
- 'avg_snr': (1, 31),
+ 'rsrp': (-120, -89),
+ 'avg_snr': (1, 32),
'avg_thp': (0, BANDWIDTH * 0.9 * TTI * np.log2(1+29**2)),
'avg_cqi': (1, 29),
'sched_rbg_num': (0, RBG_NUM),
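The rsrp and avg_snr upper bounds each grow by one (-90 -> -89, 31 -> 32), which is consistent with numpy's exclusive high bound in np.random.randint, though that reading is an inference from the diff. The (low, high) pairs also read like min-max scaling ranges for the state features; a minimal self-contained sketch of that use, with a hypothetical normalize helper that is not part of the repo:

import numpy as np

attr_range = {
    'rsrp': (-120, -89),
    'avg_snr': (1, 32),
}

def normalize(name, value):
    # Min-max scale a raw attribute into [0, 1] using its (low, high) range.
    low, high = attr_range[name]
    return (np.asarray(value, dtype=float) - low) / (high - low)

print(normalize('rsrp', -100))   # ~0.645
print(normalize('avg_snr', 16))  # ~0.484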
@@ -195,7 +195,7 @@ def step(self, action):
action[i] represents the MCS level of RBG[i]

Returns:
- state (np.array): state.shape == (RBG_NUM, UE.state_dim)
+ state (np.array): state.shape == (ue_num, UE.state_dim)
reward (float): reward returned after taking the action
done (bool): whether the episode has ended
info (dict): extra information
@@ -276,12 +276,13 @@ def _calc_prior(self, ues):
" for every ue, update rbg_cqi per 'sqi_report_interval' "
for ue in ues:
dt = self.sim_time - ue.arrive
- if np.isclose(dt % self.cqi_report_interval, 0.0) or np.isclose(dt, self.cqi_report_interval):
- ue.cqi = ue.avg_snr + np.random.randint(-2, 2, size=RBG_NUM)
- np.clip(ue.cqi, *ue.attr_range['cqi'], out=ue.cqi)
- logging.debug(f"{self.sim_time}: cqi reported for {ue}")
+ # if np.isclose(dt % self.cqi_report_interval, 0.0) or np.isclose(dt, self.cqi_report_interval):
+ ue.cqi = ue.avg_snr + np.random.randint(-2, 3, size=RBG_NUM)
+ np.clip(ue.cqi, *ue.attr_range['cqi'], out=ue.cqi)
+ logging.debug(f"{self.sim_time}: cqi reported for {ue}")

np.copyto(ue.mcs, ue.cqi)
- ue.mcs[np.isnan(ue.mcs)] = DEFAULT_MCS # set default mcs
+ # ue.mcs[np.isnan(ue.mcs)] = DEFAULT_MCS # set default mcs
np.log2(1 + ue.mcs**2.0, out=ue.se)
ue.prior = ue.se / max(1, ue.avg_thp/PRIOR_THRESHOLD)
return ues
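The priority computed in _calc_prior is a proportional-fair style metric: instantaneous spectral efficiency divided by the UE's (thresholded) average throughput. A small standalone sketch of the same computation; PRIOR_THRESHOLD and the numbers are stand-ins, not the repo's values:

import numpy as np

PRIOR_THRESHOLD = 1e4  # stand-in; the repo defines its own constant

def calc_prior(mcs, avg_thp):
    # Spectral efficiency from the MCS estimate, same formula as above.
    se = np.log2(1 + mcs ** 2.0)
    # Proportional fairness: heavy past throughput lowers the priority.
    return se / max(1, avg_thp / PRIOR_THRESHOLD)

print(calc_prior(np.array([15.0]), avg_thp=5e3))  # denominator clamps to 1
print(calc_prior(np.array([15.0]), avg_thp=5e5))  # denominator is 50

A UE that has already received a lot of throughput gets a proportionally smaller priority for the same channel quality, which is the usual proportional-fair trade-off.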
@@ -320,21 +321,27 @@ def _rm_no_buffer_ue(self):

def _is_ack(self, ue, sched_mcs):
" whether the ue successfully sends the package in the given sched_mcs "
- is_ack = ue.avg_snr + np.random.randint(-2, 2) - sched_mcs
+ is_ack = ue.avg_snr + np.random.randint(-2, 3) - sched_mcs
if is_ack > 0:
is_ack = 1
elif is_ack < 0:
is_ack = 0
else:
- is_ack = np.random.randint(0, 1)
+ is_ack = np.random.randint(0, 2)
return is_ack
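With these changes the SNR perturbation is symmetric (randint(-2, 3) draws from {-2, ..., +2}) and an exact tie is a fair coin (randint(0, 2) draws from {0, 1}); the old randint(-2, 2) / randint(0, 1) version drew noise from {-2, ..., +1} and always NACKed ties. A quick Monte Carlo sketch of the resulting ACK probability as a function of avg_snr - sched_mcs, mirroring the logic above:

import numpy as np

def is_ack(avg_snr, sched_mcs, rng=np.random):
    margin = avg_snr + rng.randint(-2, 3) - sched_mcs  # noise in {-2..+2}
    if margin > 0:
        return 1
    if margin < 0:
        return 0
    return rng.randint(0, 2)  # fair coin on a tie

for diff in range(-3, 4):
    acks = sum(is_ack(10 + diff, 10) for _ in range(100000))
    print(f"snr - mcs = {diff:+d}: P(ack) ~ {acks / 100000:.2f}")

The estimated curve is now symmetric, with P(ack) = 0.5 when the scheduled MCS exactly matches the average SNR.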


class Policy:
" default policy (taking floor after mean) "

+ def __init__(self, mode="avg"):
+ self.mode = mode

def decide(self, ues):
- return np.array([np.floor(np.sum(ue.mcs*ue.sched_rbg)/ue.sched_rbg.sum()) for ue in ues])
+ if self.mode == "avg":
+ return np.array([np.floor(np.sum(ue.mcs*ue.sched_rbg)/ue.sched_rbg.sum()) for ue in ues])
+ assert self.mode == "snr"
+ return np.array([ue.avg_snr for ue in ues])

def learn(self, *args):
pass
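A usage sketch for the two baseline modes, with a stub UE standing in for simulator.py's real UE class (the stub fields are illustrative):

import numpy as np
from types import SimpleNamespace

# Stub UE: 4 RBGs, two of them scheduled, with per-RBG MCS estimates.
ue = SimpleNamespace(
    mcs=np.array([10.0, 12.0, 20.0, 22.0]),
    sched_rbg=np.array([1, 0, 1, 0]),
    avg_snr=15,
)

print(Policy(mode="avg").decide([ue]))  # floor((10 + 20) / 2) -> [15.]
print(Policy(mode="snr").decide([ue]))  # [15]

Note that the 'avg' branch assumes every scheduled UE has at least one RBG; sched_rbg.sum() == 0 would divide by zero.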
216 changes: 172 additions & 44 deletions train.py
@@ -2,24 +2,15 @@
import torch.optim as optim
from torch.distributions import Categorical


from simulator import *
from actor_critic import *
from dqn import *

import matplotlib.pyplot as plt
from concurrent import futures
import time

env = Airview(episode_length=10, ue_arrival_rate=0.1)

state_dim = env.observation_space.shape[1]
action_dim = 29

model = ActorCritic(state_dim, action_dim, {
'share': [128, ], 'critic': [32, ], 'actor': [64, ]})
model = DQN(state_dim, action_dim, [128, 64])
# with open("model.pkl", 'rb') as f:
# model = torch.load(f)
optimizer = optim.Adam(model.parameters())
buffer = ReplayBuffer(10000)


def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, replay_size=20):
@@ -50,7 +41,7 @@ def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, replay_size=20):
total_reward += reward
average_rewards.append(total_reward / frame_idx)
total_deliver += info['total']
- success_rate.append(total_reward/total_deliver)
+ success_rate.append(total_reward/total_deliver if total_deliver > 0 else 0)
if replay is not None:
replay.push((state, action, reward, done, next_state))
else:
@@ -62,12 +53,12 @@ def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, replay_size=20):

state = next_state
if frame_idx % 1000 == 0:
- print(average_rewards[-1])
+ print("%.2f" % average_rewards[-1])

if replay is not None:
if len(replay) > replay_size:
- actor_loss, critic_loss, entropy = replay_loss(
- model, optimizer, *replay.sample(replay_size))
+ actor_loss, critic_loss, entropy = replay_loss_AC(
+ model, *replay.sample(replay_size))
else:
continue

@@ -91,53 +82,46 @@ def trainDQN(env, model, optimizer, max_frames=50000, num_steps=5, epsilon=0.9, replay=None, replay_size=20):
success_rate = []
state = env.reset()
state = torch.FloatTensor(state)
- plt.ion()
while frame_idx < max_frames:
- log_probs = []
- values = []
- rewards = []
- masks = []
- actor_loss = 0
- critic_loss = 0
- entropy = 0
for _ in range(num_steps):
frame_idx += 1
value = model(state)
if np.random.uniform() < epsilon:
- action = value.argmax().unsqueeze(0)
+ action = value.argmax(dim=-1)

else:
- action = np.random.uniform(
- size=value.shape[0]) * value.shape[-1]
+ action = torch.randint(1, value.shape[1], (value.shape[0],))

next_state, reward, done, info = env.step(action)
next_state = torch.FloatTensor(next_state)
total_reward += reward
average_rewards.append(total_reward / frame_idx)
total_deliver += info['total']
- success_rate.append(total_reward/total_deliver)
+ success_rate.append(total_reward/total_deliver if total_deliver > 0 else 0)
if replay is not None:
replay.push((state, action, reward, done, next_state))
else:
- advantage = reward - value
- actor_loss += -(log_prob*advantage.detach()).mean()
- critic_loss += advantage.pow(2).mean()
+ pass # TODO

state = next_state
if frame_idx % 1000 == 0:
- print(average_rewards[-1])
+ print("%.2f" % average_rewards[-1])

+ if frame_idx % 10000 == 0 and frame_idx > 30000:
+ print('saving model')
+ torch.save(model.state_dict(), "DQN_model.pt")

if replay is not None:
- if len(replay) > replay_size:
- actor_loss, critic_loss, entropy = replay_loss(
- model, optimizer, *replay.sample(replay_size))
+ if len(replay) == replay.capacity:
+ loss = replay_loss_DQN(model, *replay.sample(replay_size))
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()

else:
continue

- loss = actor_loss + 0.5*critic_loss - 0.01 * entropy
-
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()

if done:
state = env.reset()
state = torch.FloatTensor(state)
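Note that epsilon here is the probability of acting greedily (epsilon=0.9 exploits 90% of the time), the opposite of the usual epsilon-greedy convention where epsilon is the exploration rate. A compact sketch of the selection rule used above, assuming value is a (ue_num, action_dim) tensor of Q-values:

import numpy as np
import torch

def select_actions(value, epsilon=0.9):
    # Greedy with probability epsilon; otherwise one uniform-random action
    # per row, drawn from 1..n_actions-1 as in the loop above.
    if np.random.uniform() < epsilon:
        return value.argmax(dim=-1)
    return torch.randint(1, value.shape[1], (value.shape[0],))

q = torch.randn(4, 29)  # e.g. 4 scheduled UEs, 29 MCS levels
print(select_actions(q))

The random branch starting at 1 rather than 0 presumably skips an invalid MCS index, but that is a guess; if action 0 is legal, torch.randint(0, ...) would cover it.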
@@ -153,14 +137,14 @@ def baseline(env, policy, max_frames=10000, alter=0):
state = env.reset()
while frame_idx < max_frames:
action = policy.decide(env.sched_ue_count.keys())
- next_state, reward, done, info = env.step(action+alter)
+ next_state, reward, done, info = env.step(np.clip(action+alter,1,29))
total_reward += reward
total_deliver += info['total']
success_rate.append(total_reward/total_deliver)
frame_idx += 1
average_rewards.append(total_reward/frame_idx)
if frame_idx % 1000 == 0:
- print(average_rewards[-1])
+ print("%.2f" % average_rewards[-1])
if done:
env.reset()
return average_rewards, success_rate
@@ -184,13 +168,13 @@ def random_test(env, max_frames=10000):
frame_idx += 1
average_rewards.append(total_reward/frame_idx)
if frame_idx % 1000 == 0:
- print(average_rewards[-1])
+ print("%.2f" % average_rewards[-1])
if done:
env.reset()
return average_rewards, success_rate


- def replay_loss(model, optimizer, state, action, reward, done, next_state):
+ def replay_loss_AC(model, state, action, reward, done, next_state):
actor_loss = 0
critic_loss = 0
entropy = 0
@@ -203,3 +187,147 @@ def replay_loss(model, optimizer, state, action, reward, done, next_state):
actor_loss += -(log_prob*advantage.detach()).mean()
critic_loss += advantage.pow(2).mean()
return actor_loss, critic_loss, entropy


def replay_loss_DQN(model, batch_state, batch_action, batch_reward, batch_done, batch_next_state):
loss = 0
batch_reward = torch.FloatTensor(np.vstack(batch_reward))

for state, action, reward in zip(batch_state,batch_action,batch_reward):
value = model(state)
eval_ = torch.sum(value.gather(1,action.unsqueeze(dim=1)))
loss += (eval_-reward)**2

return loss
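As written, replay_loss_DQN regresses Q(s, a) onto the immediate reward only; next_state is sampled but unused, so this is effectively a contextual-bandit loss rather than full Q-learning. For contrast, a one-step TD-target variant is sketched below; gamma, the done mask, and the detached target are the standard DQN ingredients, not something this PR implements:

import torch

def replay_loss_DQN_td(model, batch_state, batch_action, batch_reward,
                       batch_done, batch_next_state, gamma=0.99):
    # Hypothetical variant: target = r + gamma * (1 - done) * max_a' Q(s', a').
    loss = 0
    for state, action, reward, done, next_state in zip(
            batch_state, batch_action, batch_reward, batch_done, batch_next_state):
        q = model(state).gather(1, action.unsqueeze(dim=1)).squeeze(1)
        with torch.no_grad():  # never backprop through the bootstrap target
            q_next = model(next_state).max(dim=1).values
        target = reward + gamma * (1.0 - float(done)) * q_next
        loss += ((q - target) ** 2).mean()
    return loss

Whether the bandit simplification is intended depends on how much of the scheduler's reward is immediate; with episode_length=10 the discounted tail is short but not zero.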


def plot_fig(rewards, title, start_step=5000, save=False, labels=None):
plt.figure()
for i,average_rewards in enumerate(rewards):
plt.plot(average_rewards[start_step:],label=labels[i] if labels is not None else f"run {i}")
plt.title(title,fontsize=15)
plt.xlabel("Steps",fontsize=10)
plt.ylabel("Average_rewards",fontsize=10)
plt.legend(loc="best")
if save: plt.savefig(f"{title}.png")
plt.show()


def run_model(train_fun,inputs):
"""
Use multi-threading to run train_fun on the given inputs.
Args:
train_fun: the training function, e.g. trainDQN/trainAC/random_test
inputs: list(args of train_fun)

Returns:
rewards (list(rewards)): model rewards of the different inputs
success_rates (list(success_rates)): model success_rates of the different inputs

rewards and success_rates are ordered the same way as inputs:
[inputs1, inputs2, ...]

Examples:
inputs = [(env1,model1,opt1), (env2,model2,opt2),...]
rewards, success_rates = run_model(trainAC,inputs)
plot_fig(rewards, "AC runs")
"""
rewards = []
success_rates = []

with futures.ThreadPoolExecutor(max_workers=30) as executor:
future_list = []
for input_ in inputs:
future = executor.submit(train_fun,*input_)
future_list.append(future)

for future in futures.as_completed(future_list):
reward_curve, success_curve = future.result()
rewards.append(reward_curve)
success_rates.append(success_curve)

return rewards, success_rates
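One caveat on this design: ThreadPoolExecutor gives thread-level parallelism, and CPU-bound PyTorch training steps largely serialize on the GIL (threads help mainly while the simulator or BLAS releases it). A process pool is the usual alternative; a sketch, assuming train_fun and its arguments are picklable and that the call sits under an if __name__ == "__main__": guard:

from concurrent import futures

def run_model_procs(train_fun, inputs, max_workers=4):
    # Same contract as run_model, but with processes instead of threads.
    rewards, success_rates = [], []
    with futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_list = [executor.submit(train_fun, *input_) for input_ in inputs]
        for future in futures.as_completed(future_list):
            reward_curve, success_curve = future.result()
            rewards.append(reward_curve)
            success_rates.append(success_curve)
    return rewards, success_rates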


# train experiment
env = Airview(episode_length=10, ue_arrival_rate=0.05)
state_dim = env.observation_space.shape[1]
action_dim = 29
net_hidden = [126,64,32,16]


max_frames = 100000
num_steps = 5
epsilon = 0.9
replay_size = 20
replay_buffer_size = 2000
replay = ReplayBuffer(replay_buffer_size)

default_para = (max_frames,num_steps,epsilon,replay,replay_size)

inputs = []
all_rewards = []

# DQN
model = DQN(state_dim,action_dim)
opt = torch.optim.Adam(model.parameters())
env = Airview(episode_length=10, ue_arrival_rate=0.005)
replay = ReplayBuffer(replay_buffer_size)
inputs = [(env,model,opt,*default_para)]
rewards,success_rate = run_model(trainDQN,inputs)
all_rewards.append(rewards[0])

# AC
model = ActorCritic(state_dim,action_dim)
opt = torch.optim.Adam(model.parameters())
env = Airview(episode_length=10, ue_arrival_rate=0.005)
replay = ReplayBuffer(replay_buffer_size)
inputs = [(env,model,opt,max_frames,num_steps,replay,replay_size)]
rewards,success_rate = run_model(trainAC,inputs)
all_rewards.append(rewards[0])

# AVG
model = Policy(mode="avg")
env = Airview(episode_length=10, ue_arrival_rate=0.005)
rewards,success_rate = baseline(env,model,max_frames=max_frames)
all_rewards.append(rewards)

# AVG-3
model = Policy(mode="avg")
env = Airview(episode_length=10, ue_arrival_rate=0.005)
rewards,success_rate = baseline(env,model,max_frames=max_frames,alter=-3)
all_rewards.append(rewards)

# SNR-3
model = Policy(mode="snr")
env = Airview(episode_length=10, ue_arrival_rate=0.005)
rewards,success_rate = baseline(env,model,max_frames=max_frames,alter=-3)
all_rewards.append(rewards)


model_names = ["DQN","AC","AVG","AVG-3","SNR-3"]
plot_fig(all_rewards,"performance comparison",save=True,labels=model_names)




# ---------- experiment on baseline --------------------
# alters = list(range(-3,4))
# avg_rewards = []
# model = Policy()

# for alter in alters:
# print(f"alter:{alter}")
# average_rewards, _ = baseline(env,model,max_frames=200000,alter=alter)
# avg_rewards.append(average_rewards)

# plt.figure()

# for (alter, average_rewards) in zip(alters, avg_rewards):
# plt.plot(average_rewards[3000:],label=f'alter:{alter}')

# plt.xlabel('step')
# plt.ylabel('average reward')
# plt.title('SNR')
# plt.legend(loc='best')
# plt.show()