@@ -21,13 +21,25 @@ class MARL_SIM(SIM_ENV):
2121 y_range (tuple): World y-range.
2222 """
2323
24- def __init__ (self , world_file = "multi_robot_world.yaml" , disable_plotting = False ):
24+ def __init__ (
25+ self ,
26+ world_file = "multi_robot_world.yaml" ,
27+ disable_plotting = False ,
28+ reward_phase = 1 ,
29+ ):
2530 """
2631 Initialize the MARL_SIM environment.
2732
2833 Args:
29- world_file (str, optional): Path to the world configuration YAML file.
30- disable_plotting (bool, optional): If True, disables IRSim rendering and plotting.
34+ world_file (str, optional):
35+ Path to an IRSim YAML world configuration. Defaults to
36+ "multi_robot_world.yaml".
37+ disable_plotting (bool, optional):
38+ If True, disables all IRSim plotting and display windows.
39+ Useful for headless training. Defaults to False.
40+ reward_phase (int, optional):
41+ Selects the reward function variant used by `get_reward`.
42+ Supported values: {1, 2}. Defaults to 1.
3143 """
3244 display = False if disable_plotting else True
3345 self .env = irsim .make (
@@ -38,29 +50,58 @@ def __init__(self, world_file="multi_robot_world.yaml", disable_plotting=False):
3850 self .num_robots = len (self .env .robot_list )
3951 self .x_range = self .env ._world .x_range
4052 self .y_range = self .env ._world .y_range
53+ self .reward_phase = reward_phase
4154
4255 def step (self , action , connection , combined_weights = None ):
4356 """
4457 Perform a simulation step for all robots using the provided actions and connections.
4558
4659 Args:
47- action (list): List of actions for each robot [[lin_vel, ang_vel], ...].
48- connection (Tensor): Tensor of shape (num_robots, num_robots-1) containing logits indicating connections between robots.
49- combined_weights (Tensor or None, optional): Optional weights for each connection, shape (num_robots, num_robots-1).
60+ action (list):
61+ A list of length `num_robots`. Each element is
62+ `[linear_velocity, angular_velocity]` to be applied to the
63+ corresponding robot for this step. Units follow IRSim's
64+ kinematics setup (typically m/s and rad/s).
65+ connection (Tensor):
66+ A torch.Tensor of shape `(num_robots, num_robots-1)` containing
67+ *logits* that indicate pairwise connections per robot *without*
68+ the self-connection column. This is not used for logic here,
69+ but preserved for compatibility (see commented code); you may
70+ use it upstream to form `combined_weights`.
71+ combined_weights (Tensor or None, optional):
72+ A torch.Tensor of shape `(num_robots, num_robots)` **or**
73+ `(num_robots, num_robots-1)` that encodes visualization weights
74+ for robot-to-robot edges. When provided, edges from robot *i* to
75+ *j* are drawn with line width `weight * 2`. If you pass the
76+ `(num_robots, num_robots-1)` form, ensure indexing aligns with
77+ how you constructed it (self-column typically omitted).
5078
5179 Returns:
52- tuple: (
53- poses (list): List of [x, y, theta] for each robot,
54- distances (list): Distance to goal for each robot,
55- coss (list): Cosine of angle to goal for each robot,
56- sins (list): Sine of angle to goal for each robot,
57- collisions (list): Collision status for each robot,
58- goals (list): Goal reached status for each robot,
59- action (list): Actions applied,
60- rewards (list): Rewards computed,
61- positions (list): Current [x, y] for each robot,
62- goal_positions (list): Goal [x, y] for each robot,
63- )
80+ tuple:
81+ (
82+ poses (list[list[float]]):
83+ `[x, y, theta]` for each robot after the step.
84+ distances (list[float]):
85+ Euclidean distance from each robot to its goal.
86+ coss (list[float]):
87+ Cosine of the angle between robot heading and goal vector.
88+ sins (list[float]):
89+ Sine of the angle between robot heading and goal vector.
90+ collisions (list[bool]):
91+ Collision flags for each robot, as reported by IRSim.
92+ goals (list[bool]):
93+ Arrival flags for each robot. If True this step, a new
94+ random goal was scheduled for that robot.
95+ action (list):
96+ Echo of the input `action`.
97+ rewards (list[float]):
98+ Per-robot rewards computed by `get_reward(...)`.
99+ positions (list[list[float]]):
100+ `[x, y]` positions for each robot after the step.
101+ goal_positions (list[list[float]]):
102+ `[x, y]` goal coordinates for each robot after any
103+ arrival updates this step.
104+ )
64105 """
65106 self .env .step (action_id = [i for i in range (self .num_robots )], action = action )
66107 self .env .render ()
@@ -103,7 +144,7 @@ def step(self, action, connection, combined_weights=None):
103144 collision = self .env .robot_list [i ].collision
104145 action_i = action [i ]
105146 reward = self .get_reward (
106- goal , collision , action_i , closest_robots , distance
147+ goal , collision , action_i , closest_robots , distance , self . reward_phase
107148 )
108149
109150 position = [robot_state [0 ].item (), robot_state [1 ].item ()]
@@ -121,23 +162,14 @@ def step(self, action, connection, combined_weights=None):
121162 )
122163 goal_positions .append (goal_position )
123164
124- i_probs = torch .sigmoid (
125- connection [i ]
126- ) # connection[i] is logits for "connect" per pair
127-
128- # Now we need to insert the self-connection (optional, typically skipped)
129- i_connections = i_probs .tolist ()
130- i_connections .insert (i , 0 )
131165 if combined_weights is not None :
132166 i_weights = combined_weights [i ].tolist ()
133- i_weights .insert (i , 0 )
134167
135168 for j in range (self .num_robots ):
136- if i_connections [j ] > 0.5 :
137- if combined_weights is not None :
138- weight = i_weights [j ]
139- else :
140- weight = 1
169+ if combined_weights is not None :
170+ weight = i_weights [j ]
171+ # else:
172+ # weight = 1
141173 other_robot_state = self .env .robot_list [j ].state
142174 other_pos = [
143175 other_robot_state [0 ].item (),
@@ -146,7 +178,7 @@ def step(self, action, connection, combined_weights=None):
146178 rx = [position [0 ], other_pos [0 ]]
147179 ry = [position [1 ], other_pos [1 ]]
148180 self .env .draw_trajectory (
149- np .array ([rx , ry ]), refresh = True , linewidth = weight
181+ np .array ([rx , ry ]), refresh = True , linewidth = weight * 2
150182 )
151183
152184 if goal :
@@ -183,10 +215,13 @@ def reset(
183215 Reset the simulation environment and optionally set robot and obstacle positions.
184216
185217 Args:
186- robot_state (list or None, optional): Initial state for robots as [x, y, theta, speed].
187- robot_goal (list or None, optional): Goal position(s) for the robots.
188- random_obstacles (bool, optional): If True, randomly position obstacles.
189- random_obstacle_ids (list or None, optional): IDs of obstacles to randomize.
218+ robot_state (list or None, optional): Initial robot state(s) as [[x], [y], [theta]].
219+ If None, random states are assigned ensuring minimum spacing between robots.
220+ robot_goal (list or None, optional): Fixed goal position(s) for robots. If None, random
221+ goals are generated within the environment boundaries.
222+ random_obstacles (bool, optional): If True, reposition obstacles randomly within bounds.
223+ random_obstacle_ids (list or None, optional): IDs of obstacles to randomize. Defaults to
224+ seven obstacles starting from index equal to the number of robots.
190225
191226 Returns:
192227 tuple: (
@@ -278,18 +313,22 @@ def reset(
278313 @staticmethod
279314 def get_reward (goal , collision , action , closest_robots , distance , phase = 1 ):
280315 """
281- Calculate the reward for a robot given the current state and action.
316+ Compute the reward for a single robot based on goal progress, collisions,
317+ control effort, and proximity to other robots.
282318
283319 Args:
284320 goal (bool): Whether the robot reached its goal.
285- collision (bool): Whether a collision occurred .
286- action (list): [linear_velocity, angular_velocity] applied.
287- closest_robots (list): Distances to the closest other robots.
288- distance (float): Distance to the goal.
289- phase (int, optional): Reward phase/function selector (default: 1) .
321+ collision (bool): Whether the robot collided with an obstacle or another robot .
322+ action (list): [linear_velocity, angular_velocity] applied by the robot .
323+ closest_robots (list): Distances to other robots.
324+ distance (float): Current distance to the goal.
325+ phase (int, optional): Reward configuration (1 or 2). Default is 1 .
290326
291327 Returns:
292- float: Computed reward.
328+ float: Computed scalar reward.
329+
330+ Raises:
331+ ValueError: If an unknown reward phase is specified.
293332 """
294333
295334 match phase :
@@ -302,7 +341,7 @@ def get_reward(goal, collision, action, closest_robots, distance, phase=1):
302341 r_dist = 1.5 / distance
303342 cl_pen = 0
304343 for rob in closest_robots :
305- add = 1.5 - rob if rob < 1.5 else 0
344+ add = ( 1.25 - rob ) ** 2 if rob < 1.25 else 0
306345 cl_pen += add
307346
308347 return action [0 ] - 0.5 * abs (action [1 ]) - cl_pen + r_dist
0 commit comments