diff --git a/__pycache__/actor_critic.cpython-36.pyc b/__pycache__/actor_critic.cpython-36.pyc
new file mode 100644
index 0000000..b0189dd
Binary files /dev/null and b/__pycache__/actor_critic.cpython-36.pyc differ
diff --git a/__pycache__/dqn.cpython-36.pyc b/__pycache__/dqn.cpython-36.pyc
new file mode 100644
index 0000000..4ea5054
Binary files /dev/null and b/__pycache__/dqn.cpython-36.pyc differ
diff --git a/__pycache__/simulator.cpython-36.pyc b/__pycache__/simulator.cpython-36.pyc
new file mode 100644
index 0000000..87c9201
Binary files /dev/null and b/__pycache__/simulator.cpython-36.pyc differ
diff --git a/__pycache__/train.cpython-36.pyc b/__pycache__/train.cpython-36.pyc
new file mode 100644
index 0000000..19ff327
Binary files /dev/null and b/__pycache__/train.cpython-36.pyc differ
diff --git a/simulator.py b/simulator.py
index ead1146..e06a9e2 100755
--- a/simulator.py
+++ b/simulator.py
@@ -59,8 +59,8 @@ class UE:
     state_dim = len(state_var['scalar']) + RBG_NUM * len(state_var['vector'])
     attr_range = {
         'buffer': (int(1e3), int(1e6)),
-        'rsrp': (-120, -90),
-        'avg_snr': (1, 31),
+        'rsrp': (-120, -89),
+        'avg_snr': (1, 32),
         'avg_thp': (0, BANDWIDTH * 0.9 * TTI * np.log2(1+29**2)),
         'avg_cqi': (1, 29),
         'sched_rbg_num': (0, RBG_NUM),
@@ -195,7 +195,7 @@ def step(self, action):
             action[i] represents the MCS level of RBG[i]
 
         Returns:
-            state (np.array): state.shape == (RBG_NUM, UE.state_dim)
+            state (np.array): state.shape == (ue_num, UE.state_dim)
             reward (float): reward returned after taking the action
             done (bool): whether the episode has ended
             info (dict): extra informaction
@@ -276,12 +276,13 @@ def _calc_prior(self, ues):
         " for every ue, update rbg_cqi per 'sqi_report_interval' "
         for ue in ues:
             dt = self.sim_time - ue.arrive
-            if np.isclose(dt % self.cqi_report_interval, 0.0) or np.isclose(dt, self.cqi_report_interval):
-                ue.cqi = ue.avg_snr + np.random.randint(-2, 2, size=RBG_NUM)
-                np.clip(ue.cqi, *ue.attr_range['cqi'], out=ue.cqi)
-                logging.debug(f"{self.sim_time}: cqi reported for {ue}")
+            # if np.isclose(dt % self.cqi_report_interval, 0.0) or np.isclose(dt, self.cqi_report_interval):
+            ue.cqi = ue.avg_snr + np.random.randint(-2, 3, size=RBG_NUM)
+            np.clip(ue.cqi, *ue.attr_range['cqi'], out=ue.cqi)
+            logging.debug(f"{self.sim_time}: cqi reported for {ue}")
+
             np.copyto(ue.mcs, ue.cqi)
-            ue.mcs[np.isnan(ue.mcs)] = DEFAULT_MCS  # set default mcs
+            # ue.mcs[np.isnan(ue.mcs)] = DEFAULT_MCS  # set default mcs
             np.log2(1 + ue.mcs**2.0, out=ue.se)
             ue.prior = ue.se / max(1, ue.avg_thp/PRIOR_THRESHOLD)
         return ues
@@ -320,21 +321,27 @@ def _rm_no_buffer_ue(self):
 
     def _is_ack(self, ue, sched_mcs):
         " whether the ue successfully sends the package in the given sched_mcs "
-        is_ack = ue.avg_snr + np.random.randint(-2, 2) - sched_mcs
+        is_ack = ue.avg_snr + np.random.randint(-2, 3) - sched_mcs
         if is_ack > 0:
             is_ack = 1
         elif is_ack < 0:
             is_ack = 0
         else:
-            is_ack = np.random.randint(0, 1)
+            is_ack = np.random.randint(0,2)
         return is_ack
 
 
 class Policy:
     " default policy (taking floor after mean) "
+    def __init__(self, mode="avg"):
+        self.mode = mode
+
     def decide(self, ues):
-        return np.array([np.floor(np.sum(ue.mcs*ue.sched_rbg)/ue.sched_rbg.sum()) for ue in ues])
+        if self.mode == "avg":
+            return np.array([np.floor(np.sum(ue.mcs*ue.sched_rbg)/ue.sched_rbg.sum()) for ue in ues])
+        assert self.mode == "snr"
+        return np.array([ue.avg_snr for ue in ues])
 
     def learn(self, *args):
         pass
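A minimal usage sketch for the two Policy modes introduced above, assuming simulator.py exposes Airview and Policy as in this diff; the loop mirrors baseline() in train.py, and the clip to the 1..29 MCS range matches the AVG-3 / SNR-3 runs further down:

    import numpy as np
    from simulator import Airview, Policy

    env = Airview(episode_length=10, ue_arrival_rate=0.005)
    state = env.reset()
    policy = Policy(mode="snr")              # or mode="avg": floor of the mean MCS over scheduled RBGs

    for _ in range(5):
        ues = env.sched_ue_count.keys()      # UEs scheduled in this TTI, as in baseline()
        action = policy.decide(ues)          # one MCS value per scheduled UE
        state, reward, done, info = env.step(np.clip(action - 3, 1, 29))
        if done:
            state = env.reset()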
diff --git a/train.py b/train.py
index f3b41f2..14af2d7 100644
--- a/train.py
+++ b/train.py
@@ -2,24 +2,15 @@ import torch.optim as optim
 from torch.distributions import Categorical
+
 from simulator import *
 from actor_critic import *
 from dqn import *
 import matplotlib.pyplot as plt
+from concurrent import futures
+import time
 
-env = Airview(episode_length=10, ue_arrival_rate=0.1)
-
-state_dim = env.observation_space.shape[1]
-action_dim = 29
-
-model = ActorCritic(state_dim, action_dim, {
-    'share': [128, ], 'critic': [32, ], 'actor': [64, ]})
-model = DQN(state_dim, action_dim, [128, 64])
-# with open("model.pkl", 'rb') as f:
-#     model = torch.load(f)
-optimizer = optim.Adam(model.parameters())
-buffer = ReplayBuffer(10000)
 
 
 def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, replay_size=20):
@@ -50,7 +41,7 @@ def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, r
             total_reward += reward
             average_rewards.append(total_reward / frame_idx)
             total_deliver += info['total']
-            success_rate.append(total_reward/total_deliver)
+            success_rate.append(total_reward/total_deliver if total_deliver > 0 else 0)
             if replay is not None:
                 replay.push((state, action, reward, done, next_state))
             else:
@@ -62,12 +53,12 @@ def trainAC(env, model, optimizer, max_frames=50000, num_steps=5, replay=None, r
             state = next_state
 
         if frame_idx % 1000 == 0:
-            print(average_rewards[-1])
+            print("%.2f" % average_rewards[-1])
 
         if replay is not None:
            if len(replay) > replay_size:
-                actor_loss, critic_loss, entropy = replay_loss(
-                    model, optimizer, *replay.sample(replay_size))
+                actor_loss, critic_loss, entropy = replay_loss_AC(
+                    model, *replay.sample(replay_size))
             else:
                 continue
 
@@ -91,53 +82,46 @@ def trainDQN(env, model, optimizer, max_frames=50000, num_steps=5, epsilon=0.9, 
     success_rate = []
     state = env.reset()
     state = torch.FloatTensor(state)
+    plt.ion()
 
     while frame_idx < max_frames:
-        log_probs = []
-        values = []
-        rewards = []
-        masks = []
-        actor_loss = 0
-        critic_loss = 0
-        entropy = 0
 
         for _ in range(num_steps):
            frame_idx += 1
            value = model(state)
 
            if np.random.uniform() < epsilon:
-                action = value.argmax().unsqueeze(0)
+                action = value.argmax(dim=-1)
+
            else:
-                action = np.random.uniform(
-                    size=value.shape[0]) * value.shape[-1]
+                action = torch.randint(1,value.shape[1],(value.shape[0],))
 
            next_state, reward, done, info = env.step(action)
            next_state = torch.FloatTensor(next_state)
            total_reward += reward
            average_rewards.append(total_reward / frame_idx)
            total_deliver += info['total']
-            success_rate.append(total_reward/total_deliver)
+            success_rate.append(total_reward/total_deliver if total_deliver > 0 else 0)
            if replay is not None:
                replay.push((state, action, reward, done, next_state))
            else:
-                advantage = reward - value
-                actor_loss += -(log_prob*advantage.detach()).mean()
-                critic_loss += advantage.pow(2).mean()
+                pass  # TODO
 
            state = next_state
 
        if frame_idx % 1000 == 0:
-            print(average_rewards[-1])
+            print("%.2f" % average_rewards[-1])
+
+        if frame_idx % 10000 == 0 and frame_idx > 30000:
+            print('saving model')
+            torch.save(model.state_dict(), "DQN_model.pt")
 
        if replay is not None:
-            if len(replay) > replay_size:
-                actor_loss, critic_loss, entropy = replay_loss(
-                    model, optimizer, *replay.sample(replay_size))
+            if len(replay) == replay.capacity:
+                loss = replay_loss_DQN(model, *replay.sample(replay_size))
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
            else:
                continue
 
-        loss = actor_loss + 0.5*critic_loss - 0.01 * entropy
-
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
        if done:
            state = env.reset()
            state = torch.FloatTensor(state)
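trainDQN now checkpoints to DQN_model.pt every 10000 frames; a short reload-and-evaluate sketch, assuming the DQN constructor accepts (state_dim, action_dim) as in the experiment block at the bottom of train.py (the checkpoint name and the greedy argmax come from this diff, the rest is standard PyTorch):

    import torch
    from simulator import Airview
    from dqn import DQN

    env = Airview(episode_length=10, ue_arrival_rate=0.005)
    state_dim = env.observation_space.shape[1]
    action_dim = 29

    model = DQN(state_dim, action_dim)
    model.load_state_dict(torch.load("DQN_model.pt"))
    model.eval()

    state = torch.FloatTensor(env.reset())
    with torch.no_grad():
        action = model(state).argmax(dim=-1)     # greedy MCS index per scheduled UE
    next_state, reward, done, info = env.step(action)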
@@ -153,14 +137,14 @@ def baseline(env, policy, max_frames=10000, alter=0):
     state = env.reset()
     while frame_idx < max_frames:
         action = policy.decide(env.sched_ue_count.keys())
-        next_state, reward, done, info = env.step(action+alter)
+        next_state, reward, done, info = env.step(np.clip(action+alter,1,29))
         total_reward += reward
         total_deliver += info['total']
         success_rate.append(total_reward/total_deliver)
         frame_idx += 1
         average_rewards.append(total_reward/frame_idx)
         if frame_idx % 1000 == 0:
-            print(average_rewards[-1])
+            print("%.2f" % average_rewards[-1])
         if done:
             env.reset()
     return average_rewards, success_rate
@@ -184,13 +168,13 @@ def random_test(env, max_frames=10000):
         frame_idx += 1
         average_rewards.append(total_reward/frame_idx)
         if frame_idx % 1000 == 0:
-            print(average_rewards[-1])
+            print("%.2f" % average_rewards[-1])
         if done:
             env.reset()
     return average_rewards, success_rate
 
 
-def replay_loss(model, optimizer, state, action, reward, done, next_state):
+def replay_loss_AC(model, state, action, reward, done, next_state):
     actor_loss = 0
     critic_loss = 0
     entropy = 0
@@ -203,3 +187,147 @@ def replay_loss(model, optimizer, state, action, reward, done, next_state):
         actor_loss += -(log_prob*advantage.detach()).mean()
         critic_loss += advantage.pow(2).mean()
     return actor_loss, critic_loss, entropy
+
+
+def replay_loss_DQN(model, batch_state, batch_action, batch_reward, batch_done, batch_next_state):
+    loss = 0
+    batch_reward = torch.FloatTensor(np.vstack(batch_reward))
+
+    for state, action, reward in zip(batch_state,batch_action,batch_reward):
+        value = model(state)
+        eval_ = torch.sum(value.gather(1,action.unsqueeze(dim=1)))
+        loss += (eval_-reward)**2
+
+    return loss
+
+
+def plot_fig(rewards, title, start_step=5000, save=False, labels=None):
+    plt.figure()
+    for i,average_rewards in enumerate(rewards):
+        plt.plot(average_rewards[start_step:], label=labels[i] if labels is not None else f"{i}th run")
+    plt.title(title,fontsize=15)
+    plt.xlabel("Steps",fontsize=10)
+    plt.ylabel("Average_rewards",fontsize=10)
+    plt.legend(loc="best")
+    if save: plt.savefig(f"{title}.png")  # save before show so the written figure is not blank
+    plt.show()
+
+
+def run_model(train_fun,inputs):
+    """
+    Use multi-threading to run train_fun on the given inputs.
+    Args:
+        train_fun: the training function, e.g. trainDQN/trainAC/random_test.
+        inputs: list of argument tuples for train_fun
+
+    Returns:
+        rewards(list(rewards)): model rewards of the different inputs
+        success_rates(list(success_rates)): model success_rates of the different inputs
+
+        both rewards and success_rates are ordered as [inputs1, inputs2, ...]
+
+    Examples:
+        inputs = [(env1,model1,opt1), (env2,model2,opt2),...]
+        rewards, success_rate = run_model(trainAC,inputs)
+        plot_fig(rewards)
+    """
+    rewards = []
+    success_rates = []
+
+    with futures.ThreadPoolExecutor(max_workers=30) as executor:
+        future_list = []
+        for input_ in inputs:
+            future = executor.submit(train_fun,*input_)
+            future_list.append(future)
+
+        for future in future_list:  # iterate in submission order so results line up with inputs
+            result = future.result()
+            rewards.append(result[0])
+            success_rates.append(result[1])
+
+    return rewards, success_rates
+
+
+# train experiment
+env = Airview(episode_length=10, ue_arrival_rate=0.05)
+state_dim = env.observation_space.shape[1]
+action_dim = 29
+net_hidden = [126,64,32,16]
+
+
+max_frames = 100000
+num_steps = 5
+epsilon = 0.9
+replay_size = 20
+replay_buffer_size = 2000
+replay = ReplayBuffer(replay_buffer_size)
+
+default_para = (max_frames,num_steps,epsilon,replay,replay_size)
+
+inputs = []
+all_rewards = []
+
+# DQN
+model = DQN(state_dim,action_dim)
+opt = torch.optim.Adam(model.parameters())
+env = Airview(episode_length=10, ue_arrival_rate=0.005)
+replay = ReplayBuffer(replay_buffer_size)
+inputs = [(env,model,opt,*default_para)]
+rewards,success_rate = run_model(trainDQN,inputs)
+all_rewards.append(rewards[0])
+
+# AC
+model = ActorCritic(state_dim,action_dim)
+opt = torch.optim.Adam(model.parameters())
+env = Airview(episode_length=10, ue_arrival_rate=0.005)
+replay = ReplayBuffer(replay_buffer_size)
+inputs = [(env,model,opt,max_frames,num_steps,replay,replay_size)]
+rewards,success_rate = run_model(trainAC,inputs)
+all_rewards.append(rewards[0])
+
+# AVG
+model = Policy(mode="avg")
+env = Airview(episode_length=10, ue_arrival_rate=0.005)
+rewards,success_rate = baseline(env,model,max_frames=max_frames)
+all_rewards.append(rewards)
+
+# AVG-3
+model = Policy(mode="avg")
+env = Airview(episode_length=10, ue_arrival_rate=0.005)
+rewards,success_rate = baseline(env,model,max_frames=max_frames,alter=-3)
+all_rewards.append(rewards)
+
+# SNR-3
+model = Policy(mode="snr")
+env = Airview(episode_length=10, ue_arrival_rate=0.005)
+rewards,success_rate = baseline(env,model,max_frames=max_frames,alter=-3)
+all_rewards.append(rewards)
+
+
+model_names = ["DQN","AC","AVG","AVG-3","SNR-3"]
+plot_fig(all_rewards,"performance comparison",save=True,labels=model_names)
+
+
+
+
+# ---------- experiment on baseline --------------------
+# alters = list(range(-3,4))
+# avg_rewards = []
+# model = Policy()
+
+# for alter in alters:
+#     print(f"alter:{alter}")
+#     average_rewards, _ = baseline(env,model,max_frames=200000,alter=alter)
+#     avg_rewards.append(average_rewards)
+
+# plt.figure()
+
+# for (alter, average_rewards) in zip(alters, avg_rewards):
+#     plt.plot(average_rewards[3000:],label=f'alter:{alter}')
+
+# plt.xlabel('step')
+# plt.ylabel('average reward')
+# plt.title('SNR')
+# plt.legend(loc='best')
+# plt.show()
\ No newline at end of file
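replay_loss_DQN above fits Q(s, a) to the immediate reward only; batch_done and batch_next_state are accepted but never used. If a bootstrapped one-step TD target is wanted later, a sketch of the standard variant (hypothetical: GAMMA and the function name below are not part of this change) could look like:

    import torch

    GAMMA = 0.9  # assumed discount factor, not defined anywhere in this diff

    def replay_loss_DQN_td(model, batch_state, batch_action, batch_reward, batch_done, batch_next_state):
        loss = 0
        for state, action, reward, done, next_state in zip(
                batch_state, batch_action, batch_reward, batch_done, batch_next_state):
            value = model(state)                                # Q(s, .), shape (ue_num, action_dim)
            eval_ = torch.sum(value.gather(1, action.unsqueeze(dim=1)))
            with torch.no_grad():
                next_q = model(next_state).max(dim=1)[0].sum()  # max_a' Q(s', a'), summed over UEs
            target = reward + (0.0 if done else GAMMA * next_q)
            loss += (eval_ - target) ** 2
        return loss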