Commit e118d27 (1 parent: e999f81)

Better handling of learning parameters.
New file called "exploit_PPO_perso.py" to evaluate PPO learning.

8 files changed: +357 −35 lines

environment/gymnasium_envs/quadmesh_env/envs/mesh_conv.py

Lines changed: 81 additions & 14 deletions
@@ -7,7 +7,8 @@
 def get_x(state: Mesh, n_darts_selected: int, deep :int, degree: bool, restricted:bool, nodes_scores: list[int], nodes_adjacency: list[int]):
     mesh = state
     if degree:
-        template, darts_id = get_template_deg(mesh, deep, nodes_scores, nodes_adjacency)
+        deep = int(deep / 2)
+        template, darts_id = get_template_boundary(mesh, deep, nodes_scores, nodes_adjacency)
     else:
         template, darts_id = get_template(mesh, deep, nodes_scores)
 
@@ -126,19 +127,85 @@ def get_template_deg(mesh: Mesh, deep: int, nodes_scores, nodes_adjacency):
     if deep > 4:
         while len(E) < deep:
             df = F.pop(0)
-            df1 = df.get_beta(1)
-            df11 = df1.get_beta(1)
-            df111 = df11.get_beta(1)
-            F.append(df1)
-            F.append(df11)
-            F.append(df111)
-            N1, N2 = df11.get_node(), df111.get_node()
-            E.append(N1)
-            template[n_darts - 1, len(E)] = nodes_scores[N1.id]
-            template[n_darts - 1, deep + len(E)] = nodes_adjacency[N1.id]
-            E.append(N2)
-            template[n_darts - 1, len(E)] = nodes_scores[N2.id]
-            template[n_darts - 1, deep + len(E)] = nodes_adjacency[N2.id]
+            if df is not None:
+                df1 = df.get_beta(1)
+                df11 = df1.get_beta(1)
+                df111 = df11.get_beta(1)
+                F.append(df1)
+                F.append(df11)
+                F.append(df111)
+                N1, N2 = df11.get_node(), df111.get_node()
+                E.append(N1)
+                template[n_darts-1, len(E)-1] = nodes_scores[N1.id]
+                template[n_darts-1, deep + len(E)-1] = nodes_adjacency[N1.id]
+                E.append(N2)
+                template[n_darts - 1, len(E)-1] = nodes_scores[N2.id]
+                template[n_darts - 1, deep + len(E)-1] = nodes_adjacency[N2.id]
+            else:
+                E.extend([None, None])
+                #template[n_darts - 1, len(E) - 1] = -500 # dummy vertices are assigned to -500
+                #template[n_darts - 1, len(E) - 2] = -500 # dummy vertices are assigned to -500
+
+    template = template[:n_darts, :]
+    return template, dart_ids
+
+def get_template_boundary(mesh: Mesh, deep: int, nodes_scores, nodes_adjacency):
+    size = len(mesh.dart_info)
+    template = np.zeros((size, deep*2), dtype=np.int64)
+    dart_ids = []
+    n_darts = 0
+
+    for d_info in mesh.active_darts():
+        n_darts += 1
+        d_id = d_info[0]
+        dart_ids.append(d_id)
+        d = Dart(mesh, d_id)
+        A = d.get_node()
+        d1 = d.get_beta(1)
+        B = d1.get_node()
+        d11 = d1.get_beta(1)
+        C = d11.get_node()
+        d111 = d11.get_beta(1)
+        D = d111.get_node()
+
+        # Level-1 template
+        template[n_darts - 1, 0] = nodes_scores[A.id]
+        template[n_darts - 1, deep] = 1
+        template[n_darts - 1, 1] = nodes_scores[B.id]
+        template[n_darts - 1, deep+1] = 1
+        template[n_darts - 1, 2] = nodes_scores[C.id]
+        template[n_darts - 1, deep+2] = 1
+        template[n_darts - 1, 3] = nodes_scores[D.id]
+        template[n_darts - 1, deep + 3] = 1
+
+        E = [A, B, C, D]
+        deep_captured = len(E)
+        d2 = d.get_beta(2)
+        d12 = d1.get_beta(2)
+        d112 = d11.get_beta(2)
+        d1112 = d111.get_beta(2)
+        F = [d2, d12, d112, d1112]
+        if deep > 4:
+            while len(E) < deep:
+                df = F.pop(0)
+                if df is not None:
+                    df1 = df.get_beta(1)
+                    df11 = df1.get_beta(1)
+                    df111 = df11.get_beta(1)
+                    F.append(df1)
+                    F.append(df11)
+                    F.append(df111)
+                    N1, N2 = df11.get_node(), df111.get_node()
+                    E.append(N1)
+                    template[n_darts-1, len(E)-1] = nodes_scores[N1.id]
+                    template[n_darts-1, deep + len(E)-1] = 1
+                    E.append(N2)
+                    template[n_darts - 1, len(E)-1] = nodes_scores[N2.id]
+                    template[n_darts - 1, deep + len(E)-1] = 1
+                else:
+                    E.extend([None, None])
+                    #template[n_darts - 1, len(E) - 1] = -500 # dummy vertices are assigned to -500
+                    #template[n_darts - 1, len(E) - 2] = -500 # dummy vertices are assigned to -500
 
     template = template[:n_darts, :]
     return template, dart_ids
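
For context, a minimal sketch (not code from the repository) of the per-dart row layout that get_template_boundary appears to build: the first `deep` columns hold node scores and the next `deep` columns hold a constant flag of 1 (presumably a boundary/presence indicator, where get_template_deg stored node adjacency), with get_x now halving `deep` before the call. The values and helper names below are assumptions.

import numpy as np

deep = 8                       # "deep" as passed to get_x
half = int(deep / 2)           # mirrors `deep = int(deep / 2)` in get_x
nodes_scores = [2, -1, 0, 1]   # hypothetical scores for the level-1 nodes A, B, C, D

row = np.zeros(half * 2, dtype=np.int64)
row[0:4] = nodes_scores        # columns [0, half): node scores
row[half:half + 4] = 1         # columns [half, 2*half): flag set to 1 per captured node
print(row)                     # [ 2 -1  0  1  1  1  1  1]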

environment/gymnasium_envs/quadmesh_env/envs/quadmesh.py

Lines changed: 8 additions & 4 deletions
@@ -145,18 +145,22 @@ def step(self, action: np.ndarray):
             # An episode is done if the actual score is the same as the ideal
             next_nodes_score, self.next_mesh_score, _, next_nodes_adjacency = global_score(self.mesh)
             terminated = np.array_equal(self._ideal_score, self.next_mesh_score)
-            mesh_reward = (self._mesh_score - self.next_mesh_score)*10
-            reward = mesh_reward
+            if terminated:
+                mesh_reward = (self._mesh_score - self.next_mesh_score)*10
+                reward = mesh_reward
+            else:
+                mesh_reward = (self._mesh_score - self.next_mesh_score)*10
+                reward = mesh_reward
             self._nodes_scores, self._mesh_score, self._nodes_adjacency = next_nodes_score, self.next_mesh_score, next_nodes_adjacency
             self.observation = self._get_obs()
             self.nb_invalid_actions = 0
         elif not valid_topo:
-            reward = -10
+            reward = -3
             mesh_reward = 0
             terminated = False
             self.nb_invalid_actions += 1
         elif not valid_geo:
-            mesh_reward = 0
+            mesh_reward = -1
             terminated = False
             reward = 0
             self.nb_invalid_actions += 1
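
A minimal standalone sketch of the reward scheme after this change (not the environment code; the first branch is assumed to be the valid-action case, and prev_score / next_score stand in for self._mesh_score / self.next_mesh_score; the new if terminated / else branches currently assign the same value, so the sketch collapses them): a valid action earns ten times the score improvement, a topologically invalid action is now penalised with -3 instead of -10, and a geometrically invalid action keeps reward 0 but records a mesh_reward of -1.

def step_reward(valid_topo: bool, valid_geo: bool, prev_score: int, next_score: int):
    """Return (reward, mesh_reward) for one step, per the hunk above."""
    if valid_topo and valid_geo:
        mesh_reward = (prev_score - next_score) * 10
        return mesh_reward, mesh_reward      # reward == mesh_reward for valid actions
    if not valid_topo:
        return -3, 0                         # penalty softened from -10 to -3
    return 0, -1                             # invalid geometry: reward 0, mesh_reward -1

print(step_reward(True, True, 5, 2))    # (30, 30)
print(step_reward(False, True, 5, 2))   # (-3, 0)
print(step_reward(True, False, 5, 2))   # (0, -1)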

model_RL/PPO_model_pers.py

Lines changed: 16 additions & 6 deletions
@@ -137,10 +137,11 @@ def learn(self, critic_loss):
 
 
 class PPO:
-    def __init__(self, env, lr, gamma, nb_iterations, nb_episodes_per_iteration, nb_epochs, batch_size):
+    def __init__(self, env, obs_size, max_steps, lr, gamma, nb_iterations, nb_episodes_per_iteration, nb_epochs, batch_size):
         self.env = env
-        self.actor = Actor(env, 10*8, 4*10, lr=0.0001)
-        self.critic = Critic(8*10, lr=0.0001)
+        self.max_steps = max_steps
+        self.actor = Actor(self.env, obs_size, 4*10, lr=lr)
+        self.critic = Critic(obs_size, lr=lr)
         self.lr = lr
         self.gamma = gamma
         self.nb_iterations = nb_iterations
@@ -205,6 +206,7 @@ def learn(self, writer):
         rewards = []
         wins = []
         len_ep = []
+        valid_actions = []
         global_step = 0
         nb_episodes = 0
 
@@ -218,11 +220,12 @@
                 trajectory = []
                 ep_reward = 0
                 ep_mesh_reward = 0
+                ep_valid_actions = 0
                 ideal_reward = info["mesh_ideal_rewards"]
                 G = 0
                 done = False
                 step = 0
-                while step < 40:
+                while step < self.max_steps:
                     state = copy.deepcopy(info["mesh"])
                     obs = next_obs
                     action, prob = self.actor.select_action(obs, info)
@@ -233,6 +236,7 @@
                     next_obs, reward, terminated, truncated, info = self.env.step(gym_action)
                     ep_reward += reward
                     ep_mesh_reward += info["mesh_reward"]
+                    ep_valid_actions += info["valid_action"]
                     G = info["mesh_reward"] + 0.9 * G
                     if terminated:
                         if truncated:
@@ -247,14 +251,20 @@
                     step += 1
                 if len(trajectory) != 0:
                     rewards.append(ep_reward)
+                    valid_actions.append(ep_valid_actions)
                     rollouts.append(trajectory)
                     dataset.extend(trajectory)
                     len_ep.append(len(trajectory))
                 nb_episodes += 1
                 writer.add_scalar("episode_reward", ep_reward, nb_episodes)
                 writer.add_scalar("episode_mesh_reward", ep_mesh_reward, nb_episodes)
-                writer.add_scalar("normalized return", (ep_mesh_reward/ideal_reward), nb_episodes)
-                writer.add_scalar("len_episodes", len(trajectory), nb_episodes)
+                if ideal_reward != 0:
+                    writer.add_scalar("normalized return", (ep_mesh_reward/ideal_reward), nb_episodes)
+                else:
+                    writer.add_scalar("normalized return", ep_mesh_reward, nb_episodes)
+                if len(trajectory) != 0:
+                    writer.add_scalar("len_episodes", len(trajectory), nb_episodes)
+                    writer.add_scalar("valid_actions", ep_valid_actions*100/len(trajectory), nb_episodes)
 
             self.train(dataset)
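
A hedged usage sketch for the reworked constructor: obs_size and max_steps are now injected instead of the previous hard-coded Actor(env, 10*8, 4*10, lr=0.0001) / Critic(8*10, lr=0.0001) and `while step < 40`. The config keys mirror environment_config.json as used elsewhere in this commit; the obs_size formula and the training hyperparameters below are assumptions, not values from the repository.

import json

import gymnasium as gym

from mesh_model.reader import read_gmsh
from model_RL.PPO_model_pers import PPO

with open("../environment/environment_config.json", "r") as f:
    env_config = json.load(f)

env = gym.make(
    env_config["env_name"],
    mesh=read_gmsh("../mesh_files/medium_quad.msh"),
    max_episode_steps=env_config["max_episode_steps"],
    n_darts_selected=env_config["n_darts_selected"],
    deep=env_config["deep"],
    action_restriction=env_config["action_restriction"],
    with_degree_obs=env_config["with_degree_observation"]
)

# Assumed observation layout: n_darts_selected rows of `deep` features each,
# flattened (10*8 = 80 in the previous hard-coded version).
obs_size = env_config["n_darts_selected"] * env_config["deep"]

ppo = PPO(
    env,
    obs_size=obs_size,
    max_steps=env_config["max_episode_steps"],
    lr=1e-4,                        # hypothetical hyperparameters
    gamma=0.9,
    nb_iterations=10,
    nb_episodes_per_iteration=100,
    nb_epochs=5,
    batch_size=64,
)
# ppo.learn(writer) would then run training, with writer being e.g. a TensorBoard SummaryWriter.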

training/exploit_PPO_perso.py

Lines changed: 127 additions & 0 deletions (new file)

from numpy import ndarray

import gymnasium as gym
import json
import torch
from torch.distributions import Categorical
from model_RL.PPO_model_pers import Actor
from stable_baselines3 import PPO
from mesh_model.mesh_analysis.global_mesh_analysis import global_score
from mesh_model.mesh_struct.mesh import Mesh
from mesh_model.reader import read_gmsh
from view.mesh_plotter.create_plots import plot_test_results
from view.mesh_plotter.mesh_plots import plot_dataset
from environment.actions.smoothing import smoothing_mean
import mesh_model.random_quadmesh as QM
from environment.gymnasium_envs.quadmesh_env.envs.quadmesh import QuadMeshEnv
import numpy as np
import copy
from tqdm import tqdm


def testPolicy(
        actor,
        n_eval_episodes: int,
        env_config,
        dataset: list[Mesh]
) -> tuple[ndarray, ndarray, ndarray, ndarray, list[Mesh]]:
    """
    Tests the policy on each mesh of a dataset with n_eval_episodes.

    :param actor: the actor (policy network) to test
    :param n_eval_episodes: number of evaluation episodes on each mesh
    :param env_config: environment configuration used to build the evaluation env
    :param dataset: list of mesh objects
    :return: average episode length, number of wins, average mesh reward,
        average normalized return per mesh, and the dataset of best modified meshes
    """
    print('Testing policy')
    avg_length = np.zeros(len(dataset))
    avg_mesh_rewards = np.zeros(len(dataset))
    avg_normalized_return = np.zeros(len(dataset))
    nb_wins = np.zeros(len(dataset))
    final_meshes = []
    for i, mesh in tqdm(enumerate(dataset, 1)):
        best_mesh = mesh
        env = gym.make(
            env_config["env_name"],
            max_episode_steps=30,
            mesh=mesh,
            n_darts_selected=env_config["n_darts_selected"],
            deep=env_config["deep"],
            action_restriction=env_config["action_restriction"],
            with_degree_obs=env_config["with_degree_observation"]
        )
        for _ in range(n_eval_episodes):
            terminated = False
            truncated = False
            ep_mesh_rewards: int = 0
            ep_length: int = 0
            observation, info = env.reset(options={"mesh": copy.deepcopy(mesh)})
            while not terminated and not truncated:
                obs = torch.tensor(observation.flatten(), dtype=torch.float32)
                pmf = actor.forward(obs)
                dist = Categorical(pmf)
                action = dist.sample()
                action = action.tolist()
                action_dart = int(action / 4)
                action_type = action % 4
                gymnasium_action = [action_type, action_dart]
                if action is None:
                    env.terminal = True
                    break
                observation, reward, terminated, truncated, info = env.step(gymnasium_action)
                ep_mesh_rewards += info['mesh_reward']
                ep_length += 1
            if terminated:
                nb_wins[i-1] += 1
            if isBetterMesh(best_mesh, info['mesh']):
                best_mesh = copy.deepcopy(info['mesh'])
            avg_length[i-1] += ep_length
            avg_mesh_rewards[i-1] += ep_mesh_rewards
            avg_normalized_return[i-1] += 0 if info['mesh_ideal_rewards'] == 0 else ep_mesh_rewards/info['mesh_ideal_rewards']
        final_meshes.append(best_mesh)
        avg_length[i-1] = avg_length[i-1]/n_eval_episodes
        avg_mesh_rewards[i-1] = avg_mesh_rewards[i-1]/n_eval_episodes
        avg_normalized_return[i-1] = avg_normalized_return[i-1]/n_eval_episodes
    return avg_length, nb_wins, avg_mesh_rewards, avg_normalized_return, final_meshes


def isBetterPolicy(actual_best_policy, policy_to_test):
    if actual_best_policy is None:
        return True

def isBetterMesh(best_mesh, actual_mesh):
    if best_mesh is None or global_score(best_mesh)[1] > global_score(actual_mesh)[1]:
        return True
    else:
        return False

if __name__ == '__main__':

    # Create a dataset of 9 meshes
    mesh = read_gmsh("../mesh_files/medium_quad.msh")
    dataset = [mesh for _ in range(9)]
    with open("../environment/environment_config.json", "r") as f:
        env_config = json.load(f)
    plot_dataset(dataset)

    env = gym.make(
        env_config["env_name"],
        mesh=mesh,
        max_episode_steps=env_config["max_episode_steps"],
        n_darts_selected=env_config["n_darts_selected"],
        deep=env_config["deep"],
        action_restriction=env_config["action_restriction"],
        with_degree_obs=env_config["with_degree_observation"]
    )

    # Load the model
    actor = Actor(env, 10*8, 4*10, lr=0.0001)
    actor.load_state_dict(torch.load('policy_saved/quad-perso/medium_quad_perso.pth'))
    avg_steps, avg_wins, avg_rewards, normalized_return, final_meshes = testPolicy(actor, 15, env_config, dataset)

    plot_test_results(avg_rewards, avg_wins, avg_steps, normalized_return)
    plot_dataset(final_meshes)
    for m in final_meshes:
        smoothing_mean(m)
    plot_dataset(final_meshes)
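
One detail worth spelling out from testPolicy above: the actor samples a single flat index from a Categorical over 4*10 outputs and decodes it into an action type and a dart index. A small sketch of that decoding (the 4-actions-per-dart layout is inferred from the `/ 4` and `% 4` above; the helper name is hypothetical):

def decode_action(flat_action: int, n_action_types: int = 4) -> list[int]:
    # Split a flat Categorical sample into [action_type, action_dart],
    # matching the gymnasium_action ordering used in testPolicy.
    action_dart = flat_action // n_action_types
    action_type = flat_action % n_action_types
    return [action_type, action_dart]

assert decode_action(0) == [0, 0]    # action type 0 on dart 0
assert decode_action(7) == [3, 1]    # action type 3 on dart 1
assert decode_action(38) == [2, 9]   # action type 2 on dart 9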

training/exploit_trimesh.py

Lines changed: 63 additions & 0 deletions (new file)

import mesh_model.random_trimesh as TM
import torch
import json
import gymnasium as gym
from environment.trimesh_env import TriMesh
from model_RL.utilities.actor_critic_networks import Actor
from mesh_model.reader import read_gmsh
from view.mesh_plotter.create_plots import plot_test_results
from view.mesh_plotter.mesh_plots import plot_dataset
from environment.actions.smoothing import smoothing_mean
from model_RL.evaluate_model import testPolicy

LOCAL_MESH_FEAT = 0


def exploit():
    mesh_size = 12
    feature = LOCAL_MESH_FEAT

    dataset = [TM.random_mesh(30) for _ in range(9)]
    plot_dataset(dataset)

    env = TriMesh(None, mesh_size, max_steps=60, feat=feature)

    actor = Actor(env, 30, 15, lr=0.0001)
    actor.load_state_dict(torch.load('policy_saved/actor_network.pth'))

    avg_steps, avg_wins, avg_rewards, final_meshes = testPolicy(actor, 30, dataset, 100)

    if avg_steps is not None:
        plot_test_results(avg_rewards, avg_wins, avg_steps)
        plot_dataset(final_meshes)

if __name__ == '__main__':
    mesh = read_gmsh("../mesh_files/t1_quad.msh")

    # Create a dataset of 9 meshes
    dataset = [mesh for _ in range(9)]
    with open("../environment/environment_config.json", "r") as f:
        env_config = json.load(f)
    plot_dataset(dataset)

    env = gym.make(
        env_config["env_name"],
        mesh=read_gmsh("mesh_files/simple_quad.msh"),
        max_episode_steps=env_config["max_episode_steps"],
        n_darts_selected=env_config["n_darts_selected"],
        deep=env_config["deep"],
        action_restriction=env_config["action_restriction"],
        with_degree_obs=env_config["with_degree_observation"]
    )

    # Load the model
    actor = Actor(env, 10*8, 4*10, lr=0.0001)
    actor.load_state_dict(torch.load('policy_saved/actor_network.pth'))
    avg_steps, avg_wins, avg_rewards, final_meshes = testPolicy(actor, 15, dataset, 20)

    plot_test_results(avg_rewards, avg_wins, avg_steps)
    plot_dataset(final_meshes)
    for m in final_meshes:
        smoothing_mean(m)
    plot_dataset(final_meshes)
