diff --git a/controllers/mat_rapid/algorithms/__init__.py b/controllers/mat_rapid/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/controllers/mat_rapid/algorithms/mat/__init__.py b/controllers/mat_rapid/algorithms/mat/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/controllers/mat_rapid/algorithms/mat/algorithm/ma_transformer.py b/controllers/mat_rapid/algorithms/mat/algorithm/ma_transformer.py new file mode 100644 index 0000000..f879684 --- /dev/null +++ b/controllers/mat_rapid/algorithms/mat/algorithm/ma_transformer.py @@ -0,0 +1,312 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import math +import numpy as np +from torch.distributions import Categorical +from algorithms.utils.util import check, init +from algorithms.utils.transformer_act import discrete_autoregreesive_act +from algorithms.utils.transformer_act import discrete_parallel_act +from algorithms.utils.transformer_act import continuous_autoregreesive_act +from algorithms.utils.transformer_act import continuous_parallel_act + +def init_(m, gain=0.01, activate=False): + if activate: + gain = nn.init.calculate_gain('relu') + return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), gain=gain) + + +class SelfAttention(nn.Module): + + def __init__(self, n_embd, n_head, n_agent, masked=False): + super(SelfAttention, self).__init__() + + assert n_embd % n_head == 0 + self.masked = masked + self.n_head = n_head + # key, query, value projections for all heads + self.key = init_(nn.Linear(n_embd, n_embd)) + self.query = init_(nn.Linear(n_embd, n_embd)) + self.value = init_(nn.Linear(n_embd, n_embd)) + # output projection + self.proj = init_(nn.Linear(n_embd, n_embd)) + # if self.masked: + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer("mask", torch.tril(torch.ones(n_agent + 1, n_agent + 1)) + .view(1, 1, n_agent + 1, n_agent + 1)) + + self.att_bp = 
None + + def forward(self, key, value, query): + B, L, D = query.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = self.key(key).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + q = self.query(query).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + v = self.value(value).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + + # causal attention: (B, nh, L, hs) x (B, nh, hs, L) -> (B, nh, L, L) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + + # self.att_bp = F.softmax(att, dim=-1) + + if self.masked: + att = att.masked_fill(self.mask[:, :, :L, :L] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + + y = att @ v # (B, nh, L, L) x (B, nh, L, hs) -> (B, nh, L, hs) + y = y.transpose(1, 2).contiguous().view(B, L, D) # re-assemble all head outputs side by side + + # output projection + y = self.proj(y) + return y + + +class EncodeBlock(nn.Module): + """ an unassuming Transformer block """ + + def __init__(self, n_embd, n_head, n_agent): + super(EncodeBlock, self).__init__() + + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + # self.attn = SelfAttention(n_embd, n_head, n_agent, masked=True) + self.attn = SelfAttention(n_embd, n_head, n_agent, masked=False) + self.mlp = nn.Sequential( + init_(nn.Linear(n_embd, 1 * n_embd), activate=True), + nn.GELU(), + init_(nn.Linear(1 * n_embd, n_embd)) + ) + + def forward(self, x): + x = self.ln1(x + self.attn(x, x, x)) + x = self.ln2(x + self.mlp(x)) + return x + + +class DecodeBlock(nn.Module): + """ an unassuming Transformer block """ + + def __init__(self, n_embd, n_head, n_agent): + super(DecodeBlock, self).__init__() + + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + self.ln3 = nn.LayerNorm(n_embd) + self.attn1 = SelfAttention(n_embd, n_head, n_agent, masked=True) + self.attn2 = SelfAttention(n_embd, n_head, n_agent, 
masked=True) + self.mlp = nn.Sequential( + init_(nn.Linear(n_embd, 1 * n_embd), activate=True), + nn.GELU(), + init_(nn.Linear(1 * n_embd, n_embd)) + ) + + def forward(self, x, rep_enc): + x = self.ln1(x + self.attn1(x, x, x)) + x = self.ln2(rep_enc + self.attn2(key=x, value=x, query=rep_enc)) + x = self.ln3(x + self.mlp(x)) + return x + + +class Encoder(nn.Module): + + def __init__(self, state_dim, obs_dim, n_block, n_embd, n_head, n_agent, encode_state): + super(Encoder, self).__init__() + + self.state_dim = state_dim + self.obs_dim = obs_dim + self.n_embd = n_embd + self.n_agent = n_agent + self.encode_state = encode_state + # self.agent_id_emb = nn.Parameter(torch.zeros(1, n_agent, n_embd)) + + self.state_encoder = nn.Sequential(nn.LayerNorm(state_dim), + init_(nn.Linear(state_dim, n_embd), activate=True), nn.GELU()) + self.obs_encoder = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU()) + + self.ln = nn.LayerNorm(n_embd) + self.blocks = nn.Sequential(*[EncodeBlock(n_embd, n_head, n_agent) for _ in range(n_block)]) + self.head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, 1))) + + def forward(self, state, obs): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + if self.encode_state: + state_embeddings = self.state_encoder(state) + x = state_embeddings + else: + obs_embeddings = self.obs_encoder(obs) + x = obs_embeddings + + rep = self.blocks(self.ln(x)) + v_loc = self.head(rep) + + return v_loc, rep + + +class Decoder(nn.Module): + + def __init__(self, obs_dim, action_dim, n_block, n_embd, n_head, n_agent, + action_type='Discrete', dec_actor=False, share_actor=False): + super(Decoder, self).__init__() + + self.action_dim = action_dim + self.n_embd = n_embd + self.dec_actor = dec_actor + self.share_actor = share_actor + self.action_type = action_type + + if action_type != 'Discrete': + log_std = 
torch.ones(action_dim) + # log_std = torch.zeros(action_dim) + self.log_std = torch.nn.Parameter(log_std) + # self.log_std = torch.nn.Parameter(torch.zeros(action_dim)) + + if self.dec_actor: + if self.share_actor: + print("mac_dec!!!!!") + self.mlp = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, action_dim))) + else: + self.mlp = nn.ModuleList() + for n in range(n_agent): + actor = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, action_dim))) + self.mlp.append(actor) + else: + # self.agent_id_emb = nn.Parameter(torch.zeros(1, n_agent, n_embd)) + if action_type == 'Discrete': + self.action_encoder = nn.Sequential(init_(nn.Linear(action_dim + 1, n_embd, bias=False), activate=True), + nn.GELU()) + else: + self.action_encoder = nn.Sequential(init_(nn.Linear(action_dim, n_embd), activate=True), nn.GELU()) + self.obs_encoder = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU()) + self.ln = nn.LayerNorm(n_embd) + self.blocks = nn.Sequential(*[DecodeBlock(n_embd, n_head, n_agent) for _ in range(n_block)]) + self.head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, action_dim))) + + def zero_std(self, device): + if self.action_type != 'Discrete': + log_std = torch.zeros(self.action_dim).to(device) + self.log_std.data = log_std + + # state, action, and return + def forward(self, action, obs_rep, obs): + # action: (batch, n_agent, action_dim), one-hot/logits? 
+ # obs_rep: (batch, n_agent, n_embd) + if self.dec_actor: + if self.share_actor: + logit = self.mlp(obs) + else: + logit = [] + for n in range(len(self.mlp)): + logit_n = self.mlp[n](obs[:, n, :]) + logit.append(logit_n) + logit = torch.stack(logit, dim=1) + else: + action_embeddings = self.action_encoder(action) + x = self.ln(action_embeddings) + for block in self.blocks: + x = block(x, obs_rep) + logit = self.head(x) + + return logit + + +class MultiAgentTransformer(nn.Module): + + def __init__(self, state_dim, obs_dim, action_dim, n_agent, + n_block, n_embd, n_head, encode_state=False, device=torch.device("cpu"), + action_type='Discrete', dec_actor=False, share_actor=False): + super(MultiAgentTransformer, self).__init__() + + self.n_agent = n_agent + self.action_dim = action_dim + self.tpdv = dict(dtype=torch.float32, device=device) + self.action_type = action_type + self.device = device + + # state unused + state_dim = 37 + + self.encoder = Encoder(state_dim, obs_dim, n_block, n_embd, n_head, n_agent, encode_state) + self.decoder = Decoder(obs_dim, action_dim, n_block, n_embd, n_head, n_agent, + self.action_type, dec_actor=dec_actor, share_actor=share_actor) + self.to(device) + + def zero_std(self): + if self.action_type != 'Discrete': + self.decoder.zero_std(self.device) + + def forward(self, state, obs, action, available_actions=None): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + # action: (batch, n_agent, 1) + # available_actions: (batch, n_agent, act_dim) + + # state unused + ori_shape = np.shape(state) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + action = check(action).to(**self.tpdv) + + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + batch_size = np.shape(state)[0] + v_loc, obs_rep = self.encoder(state, obs) + if self.action_type == 'Discrete': + action = action.long() + 
action_log, entropy = discrete_parallel_act(self.decoder, obs_rep, obs, action, batch_size, + self.n_agent, self.action_dim, self.tpdv, available_actions) + else: + action_log, entropy = continuous_parallel_act(self.decoder, obs_rep, obs, action, batch_size, + self.n_agent, self.action_dim, self.tpdv) + + return action_log, v_loc, entropy + + def get_actions(self, state, obs, available_actions=None, deterministic=False): + # state unused + ori_shape = np.shape(obs) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + batch_size = np.shape(obs)[0] + v_loc, obs_rep = self.encoder(state, obs) + if self.action_type == "Discrete": + output_action, output_action_log = discrete_autoregreesive_act(self.decoder, obs_rep, obs, batch_size, + self.n_agent, self.action_dim, self.tpdv, + available_actions, deterministic) + else: + output_action, output_action_log = continuous_autoregreesive_act(self.decoder, obs_rep, obs, batch_size, + self.n_agent, self.action_dim, self.tpdv, + deterministic) + + return output_action, output_action_log, v_loc + + def get_values(self, state, obs, available_actions=None): + # state unused + ori_shape = np.shape(state) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + v_tot, obs_rep = self.encoder(state, obs) + return v_tot + + + diff --git a/controllers/mat_rapid/algorithms/mat/algorithm/mat_decoder.py b/controllers/mat_rapid/algorithms/mat/algorithm/mat_decoder.py new file mode 100644 index 0000000..4553bc7 --- /dev/null +++ b/controllers/mat_rapid/algorithms/mat/algorithm/mat_decoder.py @@ -0,0 +1,297 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import math +import numpy as np +from torch.distributions import Categorical, Normal +from 
algorithms.utils.util import check, init + + +def init_(m, gain=0.01, activate=False): + if activate: + gain = nn.init.calculate_gain('relu') + return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), gain=gain) + + +def discrete_autoregreesive_act(decoder, obs_rep, obs, batch_size, n_agent, action_dim, tpdv, + available_actions=None, deterministic=False): + shifted_action = torch.zeros((batch_size, n_agent, action_dim + 1)).to(**tpdv) + shifted_action[:, 0, 0] = 1 + output_action = torch.zeros((batch_size, n_agent, 1), dtype=torch.long) + output_action_log = torch.zeros_like(output_action, dtype=torch.float32) + + for i in range(n_agent): + logit, v_loc = decoder(shifted_action, obs_rep, obs) + logit = logit[:, i, :] + if available_actions is not None: + logit[available_actions[:, i, :] == 0] = -1e10 + + distri = Categorical(logits=logit) + action = distri.probs.argmax(dim=-1) if deterministic else distri.sample() + action_log = distri.log_prob(action) + + output_action[:, i, :] = action.unsqueeze(-1) + output_action_log[:, i, :] = action_log.unsqueeze(-1) + if i + 1 < n_agent: + shifted_action[:, i + 1, 1:] = F.one_hot(action, num_classes=action_dim) + return output_action, output_action_log, v_loc + + +def discrete_parallel_act(decoder, obs_rep, obs, action, batch_size, n_agent, action_dim, tpdv, + available_actions=None): + one_hot_action = F.one_hot(action.squeeze(-1), num_classes=action_dim) # (batch, n_agent, action_dim) + shifted_action = torch.zeros((batch_size, n_agent, action_dim + 1)).to(**tpdv) + shifted_action[:, 0, 0] = 1 + shifted_action[:, 1:, 1:] = one_hot_action[:, :-1, :] + logit, v_loc = decoder(shifted_action, obs_rep, obs) + if available_actions is not None: + logit[available_actions == 0] = -1e10 + + distri = Categorical(logits=logit) + action_log = distri.log_prob(action.squeeze(-1)).unsqueeze(-1) + entropy = distri.entropy().unsqueeze(-1) + return action_log, entropy, v_loc + + +def continuous_autoregreesive_act(decoder, 
obs_rep, obs, batch_size, n_agent, action_dim, tpdv, + deterministic=False): + shifted_action = torch.zeros((batch_size, n_agent, action_dim)).to(**tpdv) + output_action = torch.zeros((batch_size, n_agent, action_dim), dtype=torch.float32) + output_action_log = torch.zeros_like(output_action, dtype=torch.float32) + + for i in range(n_agent): + act_mean, v_loc = decoder(shifted_action, obs_rep, obs) + act_mean = act_mean[:, i, :] + action_std = torch.sigmoid(decoder.log_std) * 0.5 + + # log_std = torch.zeros_like(act_mean).to(**tpdv) + decoder.log_std + # distri = Normal(act_mean, log_std.exp()) + distri = Normal(act_mean, action_std) + action = act_mean if deterministic else distri.sample() + action_log = distri.log_prob(action) + + output_action[:, i, :] = action + output_action_log[:, i, :] = action_log + if i + 1 < n_agent: + shifted_action[:, i + 1, :] = action + + # print("act_mean: ", act_mean) + # print("action: ", action) + + return output_action, output_action_log, v_loc + + +def continuous_parallel_act(decoder, obs_rep, obs, action, batch_size, n_agent, action_dim, tpdv): + shifted_action = torch.zeros((batch_size, n_agent, action_dim)).to(**tpdv) + shifted_action[:, 1:, :] = action[:, :-1, :] + + act_mean, v_loc = decoder(shifted_action, obs_rep, obs) + action_std = torch.sigmoid(decoder.log_std) * 0.5 + distri = Normal(act_mean, action_std) + + # log_std = torch.zeros_like(act_mean).to(**tpdv) + decoder.log_std + # distri = Normal(act_mean, log_std.exp()) + + action_log = distri.log_prob(action) + entropy = distri.entropy() + return action_log, entropy, v_loc + + + +class SelfAttention(nn.Module): + + def __init__(self, n_embd, n_head, n_agent, masked=False): + super(SelfAttention, self).__init__() + + assert n_embd % n_head == 0 + self.masked = masked + self.n_head = n_head + # key, query, value projections for all heads + self.key = init_(nn.Linear(n_embd, n_embd)) + self.query = init_(nn.Linear(n_embd, n_embd)) + self.value = init_(nn.Linear(n_embd, 
n_embd)) + # output projection + self.proj = init_(nn.Linear(n_embd, n_embd)) + # if self.masked: + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer("mask", torch.tril(torch.ones(n_agent + 1, n_agent + 1)) + .view(1, 1, n_agent + 1, n_agent + 1)) + + self.att_bp = None + + def forward(self, key, value, query): + B, L, D = query.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = self.key(key).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + q = self.query(query).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + v = self.value(value).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + + # causal attention: (B, nh, L, hs) x (B, nh, hs, L) -> (B, nh, L, L) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + + # self.att_bp = F.softmax(att, dim=-1) + + if self.masked: + att = att.masked_fill(self.mask[:, :, :L, :L] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + + y = att @ v # (B, nh, L, L) x (B, nh, L, hs) -> (B, nh, L, hs) + y = y.transpose(1, 2).contiguous().view(B, L, D) # re-assemble all head outputs side by side + + # output projection + y = self.proj(y) + return y + + +class DecodeBlock(nn.Module): + + def __init__(self, n_embd, n_head, n_agent): + super(DecodeBlock, self).__init__() + + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + self.ln3 = nn.LayerNorm(n_embd) + self.attn1 = SelfAttention(n_embd, n_head, n_agent, masked=True) + self.attn2 = SelfAttention(n_embd, n_head, n_agent, masked=True) + self.mlp = nn.Sequential( + init_(nn.Linear(n_embd, 1 * n_embd), activate=True), + nn.GELU(), + init_(nn.Linear(1 * n_embd, n_embd)) + ) + + def forward(self, x, rep_enc): + x = self.ln1(x + self.attn1(x, x, x)) + x = self.ln2(rep_enc + self.attn2(key=x, value=x, query=rep_enc)) + x = self.ln3(x + self.mlp(x)) + return x + + 
+class Decoder(nn.Module): + + def __init__(self, obs_dim, action_dim, n_block, n_embd, n_head, n_agent, + action_type='Discrete', dec_actor=False, share_actor=False): + super(Decoder, self).__init__() + + self.action_dim = action_dim + self.n_embd = n_embd + self.dec_actor = dec_actor + self.share_actor = share_actor + self.action_type = action_type + + if action_type == 'Discrete': + self.action_encoder = nn.Sequential(init_(nn.Linear(action_dim + 1, n_embd, bias=False), activate=True), + nn.GELU()) + else: + log_std = torch.ones(action_dim) + # log_std = torch.zeros(action_dim) + self.log_std = torch.nn.Parameter(log_std) + # self.log_std = torch.nn.Parameter(torch.zeros(action_dim)) + self.action_encoder = nn.Sequential(init_(nn.Linear(action_dim, n_embd), activate=True), nn.GELU()) + self.obs_encoder = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU()) + self.ln = nn.LayerNorm(n_embd) + self.blocks = nn.Sequential(*[DecodeBlock(n_embd, n_head, n_agent) for _ in range(n_block)]) + self.head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, action_dim))) + self.val_head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, 1))) + + def zero_std(self, device): + if self.action_type != 'Discrete': + log_std = torch.zeros(self.action_dim).to(device) + self.log_std.data = log_std + + # state, action, and return + def forward(self, action, obs_rep, obs): + # action: (batch, n_agent, action_dim), one-hot/logits? 
+ # obs_rep: (batch, n_agent, n_embd) + obs_embeddings = self.obs_encoder(obs) + action_embeddings = self.action_encoder(action) + x = action_embeddings + x = self.ln(x) + for block in self.blocks: + x = block(x, obs_embeddings) + logit = self.head(x) + val = self.val_head(x) + + return logit, val + + +class MultiAgentDecoder(nn.Module): + + def __init__(self, state_dim, obs_dim, action_dim, n_agent, + n_block, n_embd, n_head, encode_state=False, device=torch.device("cpu"), + action_type='Discrete', dec_actor=False, share_actor=False): + super(MultiAgentDecoder, self).__init__() + + self.n_agent = n_agent + self.action_dim = action_dim + self.tpdv = dict(dtype=torch.float32, device=device) + self.action_type = action_type + self.device = device + + self.decoder = Decoder(obs_dim, action_dim, n_block, n_embd, n_head, n_agent, + self.action_type, dec_actor=dec_actor, share_actor=share_actor) + self.to(device) + + def zero_std(self): + if self.action_type != 'Discrete': + self.decoder.zero_std(self.device) + + def forward(self, state, obs, action, available_actions=None): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + # action: (batch, n_agent, 1) + # available_actions: (batch, n_agent, act_dim) + + # state unused + ori_shape = np.shape(state) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + action = check(action).to(**self.tpdv) + + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + batch_size = np.shape(state)[0] + if self.action_type == 'Discrete': + action = action.long() + action_log, entropy, v_loc = discrete_parallel_act(self.decoder, None, obs, action, batch_size, + self.n_agent, self.action_dim, self.tpdv, available_actions) + else: + action_log, entropy, v_loc = continuous_parallel_act(self.decoder, None, obs, action, batch_size, + self.n_agent, self.action_dim, self.tpdv) + return action_log, 
v_loc, entropy + + def get_actions(self, state, obs, available_actions=None, deterministic=False): + # state unused + ori_shape = np.shape(obs) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + batch_size = np.shape(obs)[0] + if self.action_type == "Discrete": + output_action, output_action_log, v_loc = discrete_autoregreesive_act(self.decoder, None, obs, batch_size, + self.n_agent, self.action_dim, self.tpdv, + available_actions, deterministic) + else: + output_action, output_action_log, v_loc = continuous_autoregreesive_act(self.decoder, None, obs, batch_size, + self.n_agent, self.action_dim, self.tpdv, + deterministic) + + return output_action, output_action_log, v_loc + + def get_values(self, state, obs, available_actions=None): + _, __, v_loc = self.get_actions(state, obs, available_actions) + + return v_loc + + + diff --git a/controllers/mat_rapid/algorithms/mat/algorithm/mat_encoder.py b/controllers/mat_rapid/algorithms/mat/algorithm/mat_encoder.py new file mode 100644 index 0000000..10fa7e6 --- /dev/null +++ b/controllers/mat_rapid/algorithms/mat/algorithm/mat_encoder.py @@ -0,0 +1,240 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import math +import numpy as np +from torch.distributions import Categorical, Normal +from algorithms.utils.util import check, init +from algorithms.utils.transformer_act import discrete_autoregreesive_act +from algorithms.utils.transformer_act import discrete_parallel_act +from algorithms.utils.transformer_act import continuous_autoregreesive_act +from algorithms.utils.transformer_act import continuous_parallel_act + +def init_(m, gain=0.01, activate=False): + if activate: + gain = nn.init.calculate_gain('relu') + return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), gain=gain) + + +class 
SelfAttention(nn.Module): + + def __init__(self, n_embd, n_head, n_agent, masked=False): + super(SelfAttention, self).__init__() + + assert n_embd % n_head == 0 + self.masked = masked + self.n_head = n_head + # key, query, value projections for all heads + self.key = init_(nn.Linear(n_embd, n_embd)) + self.query = init_(nn.Linear(n_embd, n_embd)) + self.value = init_(nn.Linear(n_embd, n_embd)) + # output projection + self.proj = init_(nn.Linear(n_embd, n_embd)) + # if self.masked: + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer("mask", torch.tril(torch.ones(n_agent + 1, n_agent + 1)) + .view(1, 1, n_agent + 1, n_agent + 1)) + + self.att_bp = None + + def forward(self, key, value, query): + B, L, D = query.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = self.key(key).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + q = self.query(query).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + v = self.value(value).view(B, L, self.n_head, D // self.n_head).transpose(1, 2) # (B, nh, L, hs) + + # causal attention: (B, nh, L, hs) x (B, nh, hs, L) -> (B, nh, L, L) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + + # self.att_bp = F.softmax(att, dim=-1) + + if self.masked: + att = att.masked_fill(self.mask[:, :, :L, :L] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + + y = att @ v # (B, nh, L, L) x (B, nh, L, hs) -> (B, nh, L, hs) + y = y.transpose(1, 2).contiguous().view(B, L, D) # re-assemble all head outputs side by side + + # output projection + y = self.proj(y) + return y + + +class EncodeBlock(nn.Module): + """ an unassuming Transformer block """ + + def __init__(self, n_embd, n_head, n_agent): + super(EncodeBlock, self).__init__() + + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + # self.attn = SelfAttention(n_embd, n_head, n_agent, 
masked=True) + self.attn = SelfAttention(n_embd, n_head, n_agent, masked=False) + self.mlp = nn.Sequential( + init_(nn.Linear(n_embd, 1 * n_embd), activate=True), + nn.GELU(), + init_(nn.Linear(1 * n_embd, n_embd)) + ) + + def forward(self, x): + x = self.ln1(x + self.attn(x, x, x)) + x = self.ln2(x + self.mlp(x)) + return x + + +class Encoder(nn.Module): + + def __init__(self, state_dim, obs_dim, action_dim, n_block, n_embd, + n_head, n_agent, encode_state, action_type='Discrete'): + super(Encoder, self).__init__() + + self.state_dim = state_dim + self.obs_dim = obs_dim + self.action_dim = action_dim + self.n_embd = n_embd + self.n_agent = n_agent + self.encode_state = encode_state + self.action_type = action_type + + self.state_encoder = nn.Sequential(nn.LayerNorm(state_dim), + init_(nn.Linear(state_dim, n_embd), activate=True), nn.GELU()) + self.obs_encoder = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU()) + + self.ln = nn.LayerNorm(n_embd) + self.blocks = nn.Sequential(*[EncodeBlock(n_embd, n_head, n_agent) for _ in range(n_block)]) + self.head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, 1))) + self.act_head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, action_dim))) + if action_type != 'Discrete': + log_std = torch.ones(action_dim) + # log_std = torch.zeros(action_dim) + self.log_std = torch.nn.Parameter(log_std) + # self.log_std = torch.nn.Parameter(torch.zeros(action_dim)) + + def zero_std(self, device): + if self.action_type != 'Discrete': + log_std = torch.zeros(self.action_dim).to(device) + self.log_std.data = log_std + + def forward(self, state, obs): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + if self.encode_state: + state_embeddings = self.state_encoder(state) + x = state_embeddings + else: + obs_embeddings = 
self.obs_encoder(obs) + x = obs_embeddings + + rep = self.blocks(self.ln(x)) + v_loc = self.head(rep) + logit = self.act_head(rep) + + return v_loc, rep, logit + + +class MultiAgentEncoder(nn.Module): + + def __init__(self, state_dim, obs_dim, action_dim, n_agent, + n_block, n_embd, n_head, encode_state=False, device=torch.device("cpu"), + action_type='Discrete', dec_actor=False, share_actor=False): + super(MultiAgentEncoder, self).__init__() + + self.n_agent = n_agent + self.action_dim = action_dim + self.tpdv = dict(dtype=torch.float32, device=device) + self.action_type = action_type + self.device = device + + # state unused + state_dim = 37 + + self.encoder = Encoder(state_dim, obs_dim, action_dim, n_block, n_embd, n_head, n_agent, encode_state, + action_type=self.action_type) + self.to(device) + + def zero_std(self): + if self.action_type != 'Discrete': + self.encoder.zero_std(self.device) + + def forward(self, state, obs, action, available_actions=None): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + # action: (batch, n_agent, 1) + # available_actions: (batch, n_agent, act_dim) + + # state unused + ori_shape = np.shape(state) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + action = check(action).to(**self.tpdv) + + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + batch_size = np.shape(state)[0] + v_loc, obs_rep, logit = self.encoder(state, obs) + if self.action_type == 'Discrete': + action = action.long() + if available_actions is not None: + logit[available_actions == 0] = -1e10 + + distri = Categorical(logits=logit) + action_log = distri.log_prob(action.squeeze(-1)).unsqueeze(-1) + entropy = distri.entropy().unsqueeze(-1) + else: + act_mean = logit + action_std = torch.sigmoid(self.encoder.log_std) * 0.5 + distri = Normal(act_mean, action_std) + action_log = distri.log_prob(action) + 
entropy = distri.entropy() + + return action_log, v_loc, entropy + + def get_actions(self, state, obs, available_actions=None, deterministic=False): + # state unused + ori_shape = np.shape(obs) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + if available_actions is not None: + available_actions = check(available_actions).to(**self.tpdv) + + v_loc, obs_rep, logit = self.encoder(state, obs) + if self.action_type == "Discrete": + if available_actions is not None: + logit[available_actions == 0] = -1e10 + + distri = Categorical(logits=logit) + output_action = distri.probs.argmax(dim=-1) if deterministic else distri.sample() + output_action_log = distri.log_prob(output_action) + output_action = output_action.unsqueeze(-1) + output_action_log = output_action_log.unsqueeze(-1) + else: + act_mean = logit + action_std = torch.sigmoid(self.encoder.log_std) * 0.5 + distri = Normal(act_mean, action_std) + output_action = act_mean if deterministic else distri.sample() + output_action_log = distri.log_prob(output_action) + + return output_action, output_action_log, v_loc + + def get_values(self, state, obs, available_actions=None): + # state unused + ori_shape = np.shape(state) + state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32) + + state = check(state).to(**self.tpdv) + obs = check(obs).to(**self.tpdv) + v_tot, _, _ = self.encoder(state, obs) + return v_tot + + + diff --git a/controllers/mat_rapid/algorithms/mat/algorithm/mat_gru.py b/controllers/mat_rapid/algorithms/mat/algorithm/mat_gru.py new file mode 100644 index 0000000..141a671 --- /dev/null +++ b/controllers/mat_rapid/algorithms/mat/algorithm/mat_gru.py @@ -0,0 +1,188 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +import math +import numpy as np +from torch.distributions import Categorical +from algorithms.utils.util import check, init +from algorithms.utils.transformer_act import 
discrete_autoregreesive_act +from algorithms.utils.transformer_act import discrete_parallel_act +from algorithms.utils.transformer_act import continuous_autoregreesive_act +from algorithms.utils.transformer_act import continuous_parallel_act + + +def init_(m, gain=0.01, activate=False): + if activate: + gain = nn.init.calculate_gain('relu') + return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), gain=gain) + + +class Encoder(nn.Module): + + def __init__(self, state_dim, obs_dim, n_block, n_embd, n_head, n_agent, encode_state): + super(Encoder, self).__init__() + + self.state_dim = state_dim + self.obs_dim = obs_dim + self.n_embd = n_embd + self.n_agent = n_agent + self.encode_state = encode_state + + self.state_encoder = nn.Sequential(nn.LayerNorm(state_dim), + init_(nn.Linear(state_dim, n_embd), activate=True), nn.GELU()) + self.obs_encoder = nn.Sequential(nn.LayerNorm(obs_dim), + init_(nn.Linear(obs_dim, n_embd), activate=True), nn.GELU()) + + self.ln = nn.LayerNorm(n_embd) + # self.blocks = nn.Sequential(*[EncodeBlock(n_embd, n_head, n_agent) for _ in range(n_block)]) + self.gru = nn.GRU(n_embd, n_embd, num_layers=2, batch_first=True) + self.head = nn.Sequential(init_(nn.Linear(n_embd, n_embd), activate=True), nn.GELU(), nn.LayerNorm(n_embd), + init_(nn.Linear(n_embd, 1))) + + def forward(self, state, obs): + # state: (batch, n_agent, state_dim) + # obs: (batch, n_agent, obs_dim) + obs_embeddings = self.obs_encoder(obs) + x = obs_embeddings + + rep, _ = self.gru(self.ln(x)) + v_loc = self.head(rep) + + return v_loc, rep + + +class Decoder(nn.Module): + + def __init__(self, obs_dim, action_dim, n_block, n_embd, n_head, n_agent, + action_type='Discrete', dec_actor=False, share_actor=False): + super(Decoder, self).__init__() + + self.action_dim = action_dim + self.n_embd = n_embd + self.dec_actor = dec_actor + self.share_actor = share_actor + self.action_type = action_type + + if action_type == 'Discrete': + self.action_encoder = 
    # state, action, and return
    def forward(self, action, obs_rep, obs):
        """Score shifted actions against per-agent observation features.

        :param action: (batch, n_agent, action_dim or action_dim+1) shifted
            previous-action encoding (one-hot plus start-token channel in the
            discrete case — see the action_encoder built in __init__).
        :param obs_rep: (batch, n_agent, n_embd) encoder output features.
        :param obs: (batch, n_agent, obs_dim) raw observations — not used in
            this method.
        :return logit: (batch, n_agent, action_dim) action logits (discrete)
            or means (continuous).
        """
        action_embeddings = self.action_encoder(action)
        x = action_embeddings
        # In-place add is safe: action_embeddings is freshly allocated above.
        x += obs_rep
        x, _ = self.gru(self.ln(x))
        logit = self.head(x)

        return logit
    def get_values(self, state, obs, available_actions=None):
        """Return value estimates only (no action sampling).

        `state` is ignored: it is replaced by a zero tensor with feature
        width 37 (matching the hard-coded state_dim in MultiAgentGRU's
        __init__). `available_actions` is accepted for interface parity but
        is unused here.
        :return v_tot: (batch, n_agent, 1) per-agent value predictions.
        """
        # state unused
        ori_shape = np.shape(state)
        state = np.zeros((*ori_shape[:-1], 37), dtype=np.float32)

        state = check(state).to(**self.tpdv)
        obs = check(obs).to(**self.tpdv)
        v_tot, obs_rep = self.encoder(state, obs)
        return v_tot
+ + :param args: (argparse.Namespace) arguments containing relevant model and policy information. + :param obs_space: (gym.Space) observation space. + :param cent_obs_space: (gym.Space) value function input space (centralized input for MAPPO, decentralized for IPPO). + :param action_space: (gym.Space) action space. + :param device: (torch.device) specifies the device to run on (cpu/gpu). + """ + + def __init__(self, args, obs_space, cent_obs_space, act_space, num_agents, device=torch.device("cpu")): + self.device = device + self.algorithm_name = args.algorithm_name + self.lr = args.lr + self.opti_eps = args.opti_eps + self.weight_decay = args.weight_decay + self._use_policy_active_masks = args.use_policy_active_masks + if act_space.__class__.__name__ == 'Box': + self.action_type = 'Continuous' + else: + self.action_type = 'Discrete' + + self.obs_dim = get_shape_from_obs_space(obs_space)[0] + self.share_obs_dim = get_shape_from_obs_space(cent_obs_space)[0] + if self.action_type == 'Discrete': + self.act_dim = act_space.n + self.act_num = 1 + else: + print("act high: ", act_space.high) + self.act_dim = act_space.shape[0] + self.act_num = self.act_dim + + print("obs_dim: ", self.obs_dim) + print("share_obs_dim: ", self.share_obs_dim) + print("act_dim: ", self.act_dim) + + self.num_agents = num_agents + self.tpdv = dict(dtype=torch.float32, device=device) + + if self.algorithm_name in ["mat", "mat_dec"]: + from algorithms.mat.algorithm.ma_transformer import MultiAgentTransformer as MAT + elif self.algorithm_name == "mat_gru": + from mat.algorithms.mat.algorithm.mat_gru import MultiAgentGRU as MAT + elif self.algorithm_name == "mat_decoder": + from mat.algorithms.mat.algorithm.mat_decoder import MultiAgentDecoder as MAT + elif self.algorithm_name == "mat_encoder": + from mat.algorithms.mat.algorithm.mat_encoder import MultiAgentEncoder as MAT + else: + raise NotImplementedError + + self.transformer = MAT(self.share_obs_dim, self.obs_dim, self.act_dim, num_agents, + 
n_block=args.n_block, n_embd=args.n_embd, n_head=args.n_head, + encode_state=args.encode_state, device=device, + action_type=self.action_type, dec_actor=args.dec_actor, + share_actor=args.share_actor) + if args.env_name == "hands": + self.transformer.zero_std() + + # count the volume of parameters of model + # Total_params = 0 + # Trainable_params = 0 + # NonTrainable_params = 0 + # for param in self.transformer.parameters(): + # mulValue = np.prod(param.size()) + # Total_params += mulValue + # if param.requires_grad: + # Trainable_params += mulValue + # else: + # NonTrainable_params += mulValue + # print(f'Total params: {Total_params}') + # print(f'Trainable params: {Trainable_params}') + # print(f'Non-trainable params: {NonTrainable_params}') + + self.optimizer = torch.optim.Adam(self.transformer.parameters(), + lr=self.lr, eps=self.opti_eps, + weight_decay=self.weight_decay) + + def lr_decay(self, episode, episodes): + """ + Decay the actor and critic learning rates. + :param episode: (int) current training episode. + :param episodes: (int) total number of training episodes. + """ + update_linear_schedule(self.optimizer, episode, episodes, self.lr) + + def get_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, masks, available_actions=None, + deterministic=False): + """ + Compute actions and value function predictions for the given inputs. + :param cent_obs (np.ndarray): centralized input to the critic. + :param obs (np.ndarray): local agent inputs to the actor. + :param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor. + :param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic. + :param masks: (np.ndarray) denotes points at which RNN states should be reset. + :param available_actions: (np.ndarray) denotes which actions are available to agent + (if None, all actions available) + :param deterministic: (bool) whether the action should be mode of distribution or should be sampled. 
+ + :return values: (torch.Tensor) value function predictions. + :return actions: (torch.Tensor) actions to take. + :return action_log_probs: (torch.Tensor) log probabilities of chosen actions. + :return rnn_states_actor: (torch.Tensor) updated actor network RNN states. + :return rnn_states_critic: (torch.Tensor) updated critic network RNN states. + """ + + cent_obs = cent_obs.reshape(-1, self.num_agents, self.share_obs_dim) + obs = obs.reshape(-1, self.num_agents, self.obs_dim) + if available_actions is not None: + available_actions = available_actions.reshape(-1, self.num_agents, self.act_dim) + + actions, action_log_probs, values = self.transformer.get_actions(cent_obs, + obs, + available_actions, + deterministic) + + actions = actions.view(-1, self.act_num) + action_log_probs = action_log_probs.view(-1, self.act_num) + values = values.view(-1, 1) + + # unused, just for compatibility + rnn_states_actor = check(rnn_states_actor).to(**self.tpdv) + rnn_states_critic = check(rnn_states_critic).to(**self.tpdv) + return values, actions, action_log_probs, rnn_states_actor, rnn_states_critic + + def get_values(self, cent_obs, obs, rnn_states_critic, masks, available_actions=None): + """ + Get value function predictions. + :param cent_obs (np.ndarray): centralized input to the critic. + :param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic. + :param masks: (np.ndarray) denotes points at which RNN states should be reset. + + :return values: (torch.Tensor) value function predictions. 
+ """ + + cent_obs = cent_obs.reshape(-1, self.num_agents, self.share_obs_dim) + obs = obs.reshape(-1, self.num_agents, self.obs_dim) + if available_actions is not None: + available_actions = available_actions.reshape(-1, self.num_agents, self.act_dim) + + values = self.transformer.get_values(cent_obs, obs, available_actions) + + values = values.view(-1, 1) + + return values + + def evaluate_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, actions, masks, + available_actions=None, active_masks=None): + """ + Get action logprobs / entropy and value function predictions for actor update. + :param cent_obs (np.ndarray): centralized input to the critic. + :param obs (np.ndarray): local agent inputs to the actor. + :param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor. + :param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic. + :param actions: (np.ndarray) actions whose log probabilites and entropy to compute. + :param masks: (np.ndarray) denotes points at which RNN states should be reset. + :param available_actions: (np.ndarray) denotes which actions are available to agent + (if None, all actions available) + :param active_masks: (torch.Tensor) denotes whether an agent is active or dead. + + :return values: (torch.Tensor) value function predictions. + :return action_log_probs: (torch.Tensor) log probabilities of the input actions. + :return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs. 
+ """ + cent_obs = cent_obs.reshape(-1, self.num_agents, self.share_obs_dim) + obs = obs.reshape(-1, self.num_agents, self.obs_dim) + actions = actions.reshape(-1, self.num_agents, self.act_num) + if available_actions is not None: + available_actions = available_actions.reshape(-1, self.num_agents, self.act_dim) + + action_log_probs, values, entropy = self.transformer(cent_obs, obs, actions, available_actions) + + action_log_probs = action_log_probs.view(-1, self.act_num) + values = values.view(-1, 1) + entropy = entropy.view(-1, self.act_num) + + if self._use_policy_active_masks and active_masks is not None: + entropy = (entropy*active_masks).sum()/active_masks.sum() + else: + entropy = entropy.mean() + + return values, action_log_probs, entropy + + def act(self, cent_obs, obs, rnn_states_actor, masks, available_actions=None, deterministic=True): + """ + Compute actions using the given inputs. + :param obs (np.ndarray): local agent inputs to the actor. + :param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor. + :param masks: (np.ndarray) denotes points at which RNN states should be reset. + :param available_actions: (np.ndarray) denotes which actions are available to agent + (if None, all actions available) + :param deterministic: (bool) whether the action should be mode of distribution or should be sampled. 
+ """ + + # this function is just a wrapper for compatibility + rnn_states_critic = np.zeros_like(rnn_states_actor) + _, actions, _, rnn_states_actor, _ = self.get_actions(cent_obs, + obs, + rnn_states_actor, + rnn_states_critic, + masks, + available_actions, + deterministic) + + return actions, rnn_states_actor + + def save(self, save_dir, episode): + torch.save(self.transformer.state_dict(), str(save_dir) + "/transformer_" + str(episode) + ".pt") + + def restore(self, model_dir): + transformer_state_dict = torch.load(model_dir) + self.transformer.load_state_dict(transformer_state_dict) + # self.transformer.reset_std() + + def train(self): + self.transformer.train() + + def eval(self): + self.transformer.eval() + diff --git a/controllers/mat_rapid/algorithms/mat/mat_trainer.py b/controllers/mat_rapid/algorithms/mat/mat_trainer.py new file mode 100644 index 0000000..a6979e4 --- /dev/null +++ b/controllers/mat_rapid/algorithms/mat/mat_trainer.py @@ -0,0 +1,205 @@ +import numpy as np +import torch +import torch.nn as nn +from utils.util import get_gard_norm, huber_loss, mse_loss +from utils.valuenorm import ValueNorm +from algorithms.utils.util import check + + +class MATTrainer: + """ + Trainer class for MAT to update policies. + :param args: (argparse.Namespace) arguments containing relevant model, policy, and env information. + :param policy: (R_MAPPO_Policy) policy to update. + :param device: (torch.device) specifies the device to run on (cpu/gpu). 
+ """ + def __init__(self, + args, + policy, + num_agents, + device=torch.device("cpu")): + + self.device = device + self.tpdv = dict(dtype=torch.float32, device=device) + self.policy = policy + self.num_agents = num_agents + + self.clip_param = args.clip_param + self.ppo_epoch = args.ppo_epoch + self.num_mini_batch = args.num_mini_batch + self.data_chunk_length = args.data_chunk_length + self.value_loss_coef = args.value_loss_coef + self.entropy_coef = args.entropy_coef + self.max_grad_norm = args.max_grad_norm + self.huber_delta = args.huber_delta + + self._use_recurrent_policy = args.use_recurrent_policy + self._use_naive_recurrent = args.use_naive_recurrent_policy + self._use_max_grad_norm = args.use_max_grad_norm + self._use_clipped_value_loss = args.use_clipped_value_loss + self._use_huber_loss = args.use_huber_loss + self._use_valuenorm = args.use_valuenorm + self._use_value_active_masks = args.use_value_active_masks + self._use_policy_active_masks = args.use_policy_active_masks + self.dec_actor = args.dec_actor + + if self._use_valuenorm: + self.value_normalizer = ValueNorm(1, device=self.device) + else: + self.value_normalizer = None + + def cal_value_loss(self, values, value_preds_batch, return_batch, active_masks_batch): + """ + Calculate value function loss. + :param values: (torch.Tensor) value function predictions. + :param value_preds_batch: (torch.Tensor) "old" value predictions from data batch (used for value clip loss) + :param return_batch: (torch.Tensor) reward to go returns. + :param active_masks_batch: (torch.Tensor) denotes if agent is active or dead at a given timesep. + + :return value_loss: (torch.Tensor) value function loss. 
+ """ + + value_pred_clipped = value_preds_batch + (values - value_preds_batch).clamp(-self.clip_param, + self.clip_param) + + if self._use_valuenorm: + self.value_normalizer.update(return_batch) + error_clipped = self.value_normalizer.normalize(return_batch) - value_pred_clipped + error_original = self.value_normalizer.normalize(return_batch) - values + else: + error_clipped = return_batch - value_pred_clipped + error_original = return_batch - values + + if self._use_huber_loss: + value_loss_clipped = huber_loss(error_clipped, self.huber_delta) + value_loss_original = huber_loss(error_original, self.huber_delta) + else: + value_loss_clipped = mse_loss(error_clipped) + value_loss_original = mse_loss(error_original) + + if self._use_clipped_value_loss: + value_loss = torch.max(value_loss_original, value_loss_clipped) + else: + value_loss = value_loss_original + + # if self._use_value_active_masks and not self.dec_actor: + if self._use_value_active_masks: + value_loss = (value_loss * active_masks_batch).sum() / active_masks_batch.sum() + else: + value_loss = value_loss.mean() + + return value_loss + + def ppo_update(self, sample): + """ + Update actor and critic networks. + :param sample: (Tuple) contains data batch with which to update networks. + :update_actor: (bool) whether to update actor network. + + :return value_loss: (torch.Tensor) value function loss. + :return critic_grad_norm: (torch.Tensor) gradient norm from critic up9date. + ;return policy_loss: (torch.Tensor) actor(policy) loss value. + :return dist_entropy: (torch.Tensor) action entropies. + :return actor_grad_norm: (torch.Tensor) gradient norm from actor update. + :return imp_weights: (torch.Tensor) importance sampling weights. 
+ """ + share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, \ + value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, \ + adv_targ, available_actions_batch = sample + + old_action_log_probs_batch = check(old_action_log_probs_batch).to(**self.tpdv) + adv_targ = check(adv_targ).to(**self.tpdv) + value_preds_batch = check(value_preds_batch).to(**self.tpdv) + return_batch = check(return_batch).to(**self.tpdv) + active_masks_batch = check(active_masks_batch).to(**self.tpdv) + + # Reshape to do in a single forward pass for all steps + values, action_log_probs, dist_entropy = self.policy.evaluate_actions(share_obs_batch, + obs_batch, + rnn_states_batch, + rnn_states_critic_batch, + actions_batch, + masks_batch, + available_actions_batch, + active_masks_batch) + # actor update + imp_weights = torch.exp(action_log_probs - old_action_log_probs_batch) + + surr1 = imp_weights * adv_targ + surr2 = torch.clamp(imp_weights, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ + + if self._use_policy_active_masks: + policy_loss = (-torch.sum(torch.min(surr1, surr2), + dim=-1, + keepdim=True) * active_masks_batch).sum() / active_masks_batch.sum() + else: + policy_loss = -torch.sum(torch.min(surr1, surr2), dim=-1, keepdim=True).mean() + + # critic update + value_loss = self.cal_value_loss(values, value_preds_batch, return_batch, active_masks_batch) + + loss = policy_loss - dist_entropy * self.entropy_coef + value_loss * self.value_loss_coef + + self.policy.optimizer.zero_grad() + loss.backward() + + if self._use_max_grad_norm: + grad_norm = nn.utils.clip_grad_norm_(self.policy.transformer.parameters(), self.max_grad_norm) + else: + grad_norm = get_gard_norm(self.policy.transformer.parameters()) + + self.policy.optimizer.step() + + return value_loss, grad_norm, policy_loss, dist_entropy, grad_norm, imp_weights + + def train(self, buffer): + """ + Perform a training update using minibatch GD. 
+ :param buffer: (SharedReplayBuffer) buffer containing training data. + :param update_actor: (bool) whether to update actor network. + + :return train_info: (dict) contains information regarding training update (e.g. loss, grad norms, etc). + """ + advantages_copy = buffer.advantages.copy() + advantages_copy[buffer.active_masks[:-1] == 0.0] = np.nan + mean_advantages = np.nanmean(advantages_copy) + std_advantages = np.nanstd(advantages_copy) + advantages = (buffer.advantages - mean_advantages) / (std_advantages + 1e-5) + + + train_info = {} + + train_info['value_loss'] = 0 + train_info['policy_loss'] = 0 + train_info['dist_entropy'] = 0 + train_info['actor_grad_norm'] = 0 + train_info['critic_grad_norm'] = 0 + train_info['ratio'] = 0 + + for _ in range(self.ppo_epoch): + data_generator = buffer.feed_forward_generator_transformer(advantages, self.num_mini_batch) + + for sample in data_generator: + + value_loss, critic_grad_norm, policy_loss, dist_entropy, actor_grad_norm, imp_weights \ + = self.ppo_update(sample) + + train_info['value_loss'] += value_loss.item() + train_info['policy_loss'] += policy_loss.item() + train_info['dist_entropy'] += dist_entropy.item() + train_info['actor_grad_norm'] += actor_grad_norm + train_info['critic_grad_norm'] += critic_grad_norm + train_info['ratio'] += imp_weights.mean() + + num_updates = self.ppo_epoch * self.num_mini_batch + + for k in train_info.keys(): + train_info[k] /= num_updates + + return train_info + + def prep_training(self): + self.policy.train() + + def prep_rollout(self): + self.policy.eval() diff --git a/controllers/mat_rapid/algorithms/utils/transformer_act.py b/controllers/mat_rapid/algorithms/utils/transformer_act.py new file mode 100644 index 0000000..9364b69 --- /dev/null +++ b/controllers/mat_rapid/algorithms/utils/transformer_act.py @@ -0,0 +1,85 @@ +import torch +from torch.distributions import Categorical, Normal +from torch.nn import functional as F + + +def discrete_autoregreesive_act(decoder, obs_rep, 
def discrete_parallel_act(decoder, obs_rep, obs, action, batch_size, n_agent, action_dim, tpdv,
                          available_actions=None):
    """Teacher-forced evaluation of a batch of discrete joint actions.

    Builds the decoder input by placing a start token for agent 0 and giving
    each later agent the one-hot action of its predecessor, then scores all
    agents' actions in one decoder pass.

    Returns (action_log, entropy), each shaped (batch, n_agent, 1).
    """
    acts = action.squeeze(-1)
    prev_onehot = F.one_hot(acts, num_classes=action_dim)  # (batch, n_agent, action_dim)

    # Channel 0 is the start-token slot; channels 1: carry the previous
    # agent's one-hot action (agent i conditions on agent i-1's choice).
    dec_input = torch.zeros((batch_size, n_agent, action_dim + 1)).to(**tpdv)
    dec_input[:, 0, 0] = 1
    dec_input[:, 1:, 1:] = prev_onehot[:, :-1, :]

    logits = decoder(dec_input, obs_rep, obs)
    if available_actions is not None:
        # Push unavailable actions' logits far down so their probability
        # is effectively zero.
        logits[available_actions == 0] = -1e10

    dist = Categorical(logits=logits)
    return dist.log_prob(acts).unsqueeze(-1), dist.entropy().unsqueeze(-1)
def check(input):
    """Return *input* as a torch tensor.

    numpy arrays are wrapped with ``torch.from_numpy`` (shares memory, no
    copy); anything else — typically an existing tensor — passes through
    unchanged.
    """
    # isinstance instead of `type(input) == np.ndarray`: the idiomatic type
    # test, and it also accepts ndarray subclasses.
    output = torch.from_numpy(input) if isinstance(input, np.ndarray) else input
    return output
mode 100644 index 0000000..6c77953 --- /dev/null +++ b/controllers/mat_rapid/config.py @@ -0,0 +1,300 @@ +import argparse + + +def get_config(): + """ + The configuration parser for hyper-parameters of all environment. + Please reach each `scripts/train/_runner.py` file to find private hyper-parameters + only used in . + + Prepare parameters: + --algorithm_name + specifiy the algorithm, including `["mat", "mat_dec"]` + --experiment_name + an identifier to distinguish different experiment. + --seed + set seed for numpy and torch + --cuda + by default True, will use GPU to train; or else will use CPU; + --cuda_deterministic + by default, make sure random seed effective. if set, bypass such function. + --n_training_threads + number of training threads working in parallel. by default 1 + --n_rollout_threads + number of parallel envs for training rollout. by default 32 + --n_eval_rollout_threads + number of parallel envs for evaluating rollout. by default 1 + --n_render_rollout_threads + number of parallel envs for rendering, could only be set as 1 for some environments. + --num_env_steps + number of env steps to train (default: 10e6) + --user_name + [for wandb usage], to specify user's name for simply collecting training data. + --use_wandb + [for wandb usage], by default True, will log date to wandb server. or else will use tensorboard to log data. + + Env parameters: + --env_name + specify the name of environment + --use_obs_instead_of_state + [only for some env] by default False, will use global state; or else will use concatenated local obs. + + Replay Buffer parameters: + --episode_length + the max length of episode in the buffer. + + Network parameters: + --share_policy + by default True, all agents will share the same network; set to make training agents use different policies. + --use_centralized_V + by default True, use centralized training mode; or else will decentralized training mode. 
+ --stacked_frames + Number of input frames which should be stack together. + --hidden_size + Dimension of hidden layers for actor/critic networks + --layer_N + Number of layers for actor/critic networks + --use_ReLU + by default True, will use ReLU. or else will use Tanh. + --use_popart + by default True, use PopArt to normalize rewards. + --use_valuenorm + by default True, use running mean and std to normalize rewards. + --use_feature_normalization + by default True, apply layernorm to normalize inputs. + --use_orthogonal + by default True, use Orthogonal initialization for weights and 0 initialization for biases. or else, will use xavier uniform inilialization. + --gain + by default 0.01, use the gain # of last action layer + --use_naive_recurrent_policy + by default False, use the whole trajectory to calculate hidden states. + --use_recurrent_policy + by default, use Recurrent Policy. If set, do not use. + --recurrent_N + The number of recurrent layers ( default 1). + --data_chunk_length + Time length of chunks used to train a recurrent_policy, default 10. + + Optimizer parameters: + --lr + learning rate parameter, (default: 5e-4, fixed). + --critic_lr + learning rate of critic (default: 5e-4, fixed) + --opti_eps + RMSprop optimizer epsilon (default: 1e-5) + --weight_decay + coefficience of weight decay (default: 0) + + PPO parameters: + --ppo_epoch + number of ppo epochs (default: 15) + --use_clipped_value_loss + by default, clip loss value. If set, do not clip loss value. + --clip_param + ppo clip parameter (default: 0.2) + --num_mini_batch + number of batches for ppo (default: 1) + --entropy_coef + entropy term coefficient (default: 0.01) + --use_max_grad_norm + by default, use max norm of gradients. If set, do not use. + --max_grad_norm + max norm of gradients (default: 0.5) + --use_gae + by default, use generalized advantage estimation. If set, do not use gae. 
+ --gamma + discount factor for rewards (default: 0.99) + --gae_lambda + gae lambda parameter (default: 0.95) + --use_proper_time_limits + by default, the return value does consider limits of time. If set, compute returns with considering time limits factor. + --use_huber_loss + by default, use huber loss. If set, do not use huber loss. + --use_value_active_masks + by default True, whether to mask useless data in value loss. + --huber_delta + coefficient of huber loss. + + PPG parameters: + --aux_epoch + number of auxiliary epochs. (default: 4) + --clone_coef + clone term coefficient (default: 0.01) + + Run parameters: + --use_linear_lr_decay + by default, do not apply linear decay to learning rate. If set, use a linear schedule on the learning rate + + Save & Log parameters: + --save_interval + time duration between contiunous twice models saving. + --log_interval + time duration between contiunous twice log printing. + + Eval parameters: + --use_eval + by default, do not start evaluation. If set`, start evaluation alongside with training. + --eval_interval + time duration between contiunous twice evaluation progress. + --eval_episodes + number of episodes of a single evaluation. + + Render parameters: + --save_gifs + by default, do not save render video. If set, save video. + --use_render + by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam. + --render_episodes + the number of episodes to render a given env + --ifi + the play interval of each rendered image in saved video. + + Pretrained parameters: + --model_dir + by default None. set the path to pretrained model. 
+ """ + parser = argparse.ArgumentParser( + description='onpolicy', formatter_class=argparse.RawDescriptionHelpFormatter) + + # prepare parameters + parser.add_argument("--algorithm_name", type=str, + default='mat', choices=["mat", "mat_dec", "mat_encoder", "mat_decoder", "mat_gru"]) + + parser.add_argument("--experiment_name", type=str, default="check", help="an identifier to distinguish different experiment.") + parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch") + parser.add_argument("--cuda", action='store_false', default=True, help="by default True, will use GPU to train; or else will use CPU;") + parser.add_argument("--cuda_deterministic", + action='store_false', default=True, help="by default, make sure random seed effective. if set, bypass such function.") + parser.add_argument("--n_training_threads", type=int, + default=1, help="Number of torch threads for training") + parser.add_argument("--n_rollout_threads", type=int, default=32, + help="Number of parallel envs for training rollouts") + parser.add_argument("--n_eval_rollout_threads", type=int, default=1, + help="Number of parallel envs for evaluating rollouts") + parser.add_argument("--n_render_rollout_threads", type=int, default=1, + help="Number of parallel envs for rendering rollouts") + parser.add_argument("--num_env_steps", type=int, default=10e6, + help='Number of environment steps to train (default: 10e6)') + parser.add_argument("--user_name", type=str, default='xxx',help="[for wandb usage], to specify user's name for simply collecting training data.") + parser.add_argument("--use_wandb", action='store_false', default=False, help="[for wandb usage], by default True, will log date to wandb server. 
or else will use tensorboard to log data.") + + # env parameters + parser.add_argument("--env_name", type=str, default='StarCraft2', help="specify the name of environment") + parser.add_argument("--use_obs_instead_of_state", action='store_true', + default=False, help="Whether to use global state or concatenated obs") + + # replay buffer parameters + parser.add_argument("--episode_length", type=int, + default=200, help="Max length for any episode") + + # network parameters + parser.add_argument("--share_policy", action='store_false', + default=True, help='Whether agent share the same policy') + parser.add_argument("--use_centralized_V", action='store_false', + default=True, help="Whether to use centralized V function") + parser.add_argument("--stacked_frames", type=int, default=1, + help="Dimension of hidden layers for actor/critic networks") + parser.add_argument("--use_stacked_frames", action='store_true', + default=False, help="Whether to use stacked_frames") + parser.add_argument("--hidden_size", type=int, default=64, + help="Dimension of hidden layers for actor/critic networks") + parser.add_argument("--layer_N", type=int, default=2, + help="Number of layers for actor/critic networks") + parser.add_argument("--use_ReLU", action='store_false', + default=True, help="Whether to use ReLU") + parser.add_argument("--use_popart", action='store_true', default=False, help="by default False, use PopArt to normalize rewards.") + parser.add_argument("--use_valuenorm", action='store_false', default=True, help="by default True, use running mean and std to normalize rewards.") + parser.add_argument("--use_feature_normalization", action='store_false', + default=True, help="Whether to apply layernorm to the inputs") + parser.add_argument("--use_orthogonal", action='store_false', default=True, + help="Whether to use Orthogonal initialization for weights and 0 initialization for biases") + parser.add_argument("--gain", type=float, default=0.01, + help="The gain # of last action 
layer") + + # recurrent parameters + parser.add_argument("--use_naive_recurrent_policy", action='store_true', + default=False, help='Whether to use a naive recurrent policy') + parser.add_argument("--use_recurrent_policy", action='store_true', + default=False, help='use a recurrent policy') + parser.add_argument("--recurrent_N", type=int, default=1, help="The number of recurrent layers.") + parser.add_argument("--data_chunk_length", type=int, default=10, + help="Time length of chunks used to train a recurrent_policy") + + # optimizer parameters + parser.add_argument("--lr", type=float, default=5e-4, + help='learning rate (default: 5e-4)') + parser.add_argument("--critic_lr", type=float, default=5e-4, + help='critic learning rate (default: 5e-4)') + parser.add_argument("--opti_eps", type=float, default=1e-5, + help='RMSprop optimizer epsilon (default: 1e-5)') + parser.add_argument("--weight_decay", type=float, default=0) + + # ppo parameters + parser.add_argument("--ppo_epoch", type=int, default=15, + help='number of ppo epochs (default: 15)') + parser.add_argument("--use_clipped_value_loss", + action='store_false', default=True, help="by default, clip loss value. If set, do not clip loss value.") + parser.add_argument("--clip_param", type=float, default=0.2, + help='ppo clip parameter (default: 0.2)') + parser.add_argument("--num_mini_batch", type=int, default=1, + help='number of batches for ppo (default: 1)') + parser.add_argument("--entropy_coef", type=float, default=0.01, + help='entropy term coefficient (default: 0.01)') + parser.add_argument("--value_loss_coef", type=float, + default=1, help='value loss coefficient (default: 0.5)') + parser.add_argument("--use_max_grad_norm", + action='store_false', default=True, help="by default, use max norm of gradients. 
If set, do not use.") + parser.add_argument("--max_grad_norm", type=float, default=10.0, + help='max norm of gradients (default: 0.5)') + parser.add_argument("--use_gae", action='store_false', + default=True, help='use generalized advantage estimation') + parser.add_argument("--gamma", type=float, default=0.99, + help='discount factor for rewards (default: 0.99)') + parser.add_argument("--gae_lambda", type=float, default=0.95, + help='gae lambda parameter (default: 0.95)') + parser.add_argument("--use_proper_time_limits", action='store_true', + default=False, help='compute returns taking into account time limits') + parser.add_argument("--use_huber_loss", action='store_false', default=True, help="by default, use huber loss. If set, do not use huber loss.") + parser.add_argument("--use_value_active_masks", + action='store_false', default=True, help="by default True, whether to mask useless data in value loss.") + parser.add_argument("--use_policy_active_masks", + action='store_false', default=True, help="by default True, whether to mask useless data in policy loss.") + parser.add_argument("--huber_delta", type=float, default=10.0, help=" coefficience of huber loss.") + + # run parameters + parser.add_argument("--use_linear_lr_decay", action='store_true', + default=False, help='use a linear schedule on the learning rate') + # save parameters + parser.add_argument("--save_interval", type=int, default=100, help="time duration between contiunous twice models saving.") + + # log parameters + parser.add_argument("--log_interval", type=int, default=5, help="time duration between contiunous twice log printing.") + + # eval parameters + parser.add_argument("--use_eval", action='store_true', default=False, help="by default, do not start evaluation. 
If set`, start evaluation alongside with training.") + parser.add_argument("--eval_interval", type=int, default=25, help="time duration between contiunous twice evaluation progress.") + parser.add_argument("--eval_episodes", type=int, default=32, help="number of episodes of a single evaluation.") + + # render parameters + parser.add_argument("--save_gifs", action='store_true', default=False, help="by default, do not save render video. If set, save video.") + parser.add_argument("--use_render", action='store_true', default=False, help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.") + parser.add_argument("--render_episodes", type=int, default=5, help="the number of episodes to render a given env") + parser.add_argument("--ifi", type=float, default=0.1, help="the play interval of each rendered image in saved video.") + + # pretrained parameters + parser.add_argument("--model_dir", type=str, default=None, help="by default None. 
set the path to pretrained model.") + + + # add for transformer + parser.add_argument("--encode_state", action='store_true', default=False) + parser.add_argument("--n_block", type=int, default=1) + parser.add_argument("--n_embd", type=int, default=64) + parser.add_argument("--n_head", type=int, default=1) + parser.add_argument("--dec_actor", action='store_true', default=False) + parser.add_argument("--share_actor", action='store_true', default=False) + + # add for online multi-task + parser.add_argument("--train_maps", type=str, nargs='+', default=None) + parser.add_argument("--eval_maps", type=str, nargs='+', default=None) + + return parser diff --git a/controllers/mat_rapid/mat_rapid.py b/controllers/mat_rapid/mat_rapid.py new file mode 100644 index 0000000..42ec25e --- /dev/null +++ b/controllers/mat_rapid/mat_rapid.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +import sys +import os +import wandb +import socket +import setproctitle +import numpy as np +from pathlib import Path +import torch +#sys.path.append("../../") +from config import get_config +#from mat.envs.mpe.MPE_env import MPEEnv +from soccer.soccer_env import SoccerEnv +from runner.soccer_runner import SoccerRunner as Runner +from soccer.env_wrappers import SubprocVecEnv, DummyVecEnv + +"""Train script for MPEs.""" + +def make_train_env(all_args): + def get_env_fn(rank): + def init_env(): + if all_args.env_name == "soccer": + #env_args = {"scenario": all_args.scenario, + # "n_agent": all_args.n_agent} + #env = FootballEnv(env_args=env_args) + + env_args = {"scenario": all_args.scenario_name, + "episode_length": all_args.episode_length} + env = SoccerEnv(env_args=env_args) + else: + print("Can not support the " + + all_args.env_name + "environment.") + raise NotImplementedError + env.seed(all_args.seed + rank * 1000) + return env + return init_env + if all_args.n_rollout_threads == 1: + return DummyVecEnv([get_env_fn(0)]) + else: + return SubprocVecEnv([get_env_fn(i) for i in 
range(all_args.n_rollout_threads)]) + + +def make_eval_env(all_args): + def get_env_fn(rank): + def init_env(): + if all_args.env_name == "soccer": + env = SoccerEnv() + else: + print("Can not support the " + + all_args.env_name + "environment.") + raise NotImplementedError + env.seed(all_args.seed * 50000 + rank * 10000) + return env + return init_env + if all_args.n_eval_rollout_threads == 1: + return DummyVecEnv([get_env_fn(0)]) + else: + return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) + + +def parse_args(args, parser): + parser.add_argument('--scenario_name', type=str, + default='soccer', help="Which scenario to run on") + #parser.add_argument("--num_landmarks", type=int, default=3) + parser.add_argument('--num_agents', type=int, + default=3, help="number of players") + + all_args = parser.parse_known_args(args)[0] + + return all_args + + +def main(args): + parser = get_config() + all_args = parse_args(args, parser) + + if all_args.algorithm_name == "rmappo": + all_args.use_recurrent_policy = True + assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") + elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mat" or all_args.algorithm_name == "mat_dec": + assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), ( + "check recurrent policy!") + else: + raise NotImplementedError + + if all_args.algorithm_name == "mat_dec": + all_args.dec_actor = True + all_args.share_actor = True + + # cuda + if all_args.cuda and torch.cuda.is_available(): + print("choose to use gpu...") + device = torch.device("cuda:0") + torch.set_num_threads(all_args.n_training_threads) + if all_args.cuda_deterministic: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + else: + print("choose to use cpu...") + device = torch.device("cpu") + torch.set_num_threads(all_args.n_training_threads) + + # run dir + print( 
Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/mat_rapid" +"/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name) + #run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name + run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/mat_rapid" + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name + if not run_dir.exists(): + os.makedirs(str(run_dir)) + + # wandb + if all_args.use_wandb: + run = wandb.init(config=all_args, + project=all_args.env_name, + entity=all_args.user_name, + notes=socket.gethostname(), + name=str(all_args.algorithm_name) + "_" + + str(all_args.experiment_name) + + "_seed" + str(all_args.seed), + group=all_args.scenario_name, + dir=str(run_dir), + job_type="training", + reinit=True) + else: + if not run_dir.exists(): + curr_run = 'run1' + else: + exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] + if len(exst_run_nums) == 0: + curr_run = 'run1' + else: + curr_run = 'run%i' % (max(exst_run_nums) + 1) + run_dir = run_dir / curr_run + if not run_dir.exists(): + os.makedirs(str(run_dir)) + + setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \ + str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name)) + + # seed + torch.manual_seed(all_args.seed) + torch.cuda.manual_seed_all(all_args.seed) + np.random.seed(all_args.seed) + + # env init + envs = make_train_env(all_args) + eval_envs = make_eval_env(all_args) if all_args.use_eval else None + num_agents = all_args.num_agents + + config = { + "all_args": all_args, + "envs": envs, + "eval_envs": eval_envs, + "num_agents": num_agents, + "device": device, + "run_dir": run_dir + } + + runner = 
Runner(config) + runner.run() + + # post process + envs.close() + if all_args.use_eval and eval_envs is not envs: + eval_envs.close() + + if all_args.use_wandb: + run.finish() + else: + runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) + runner.writter.close() + + +if __name__ == "__main__": + main(['--seed', '1', '--env_name', 'soccer', '--algorithm_name', 'mat_dec', '--experiment_name', 'single', '--scenario_name', 'soccer', '--num_agents', '3', '--lr', '5e-4', '--entropy_coef', '0.01', '--max_grad_norm', '0.5', '--n_training_threads', '16', '--n_rollout_threads', '1', '--num_mini_batch', '1', '--episode_length', '1000', '--num_env_steps', '10000000', '--ppo_epoch', '10', '--clip_param', '0.05', '--use_value_active_masks', '--use_policy_active_masks']) +# main(['--seed', '1', '--env_name', 'soccer', '--algorithm_name', 'mat_dec', '--experiment_name', 'single', '--scenario_name', 'soccer', '--num_agents', '3', '--lr', '5e-4', '--entropy_coef', '0.01', '--max_grad_norm', '0.5', '--n_training_threads', '16', '--n_rollout_threads', '1', '--num_mini_batch', '1', '--episode_length', '1000', '--num_env_steps', '10000000', '--ppo_epoch', '10', '--clip_param', '0.05', '--use_value_active_masks', '--use_policy_active_masks', '--model_dir', './transformer_1400.pt']) diff --git a/controllers/mat_rapid/requirements.txt b/controllers/mat_rapid/requirements.txt new file mode 100644 index 0000000..f4b63ed --- /dev/null +++ b/controllers/mat_rapid/requirements.txt @@ -0,0 +1,93 @@ +akro==0.0.8 +asynctest==0.13.0 +backcall==0.2.0 +blinker==1.4 +brotlipy==0.7.0 +certifi==2021.10.8 +cloudpickle==2.0.0 +cycler==0.11.0 +Cython==0.29.28 +decorator==5.1.1 +deepdiff==5.8.1 +docker-pycreds==0.4.0 +docopt==0.6.2 +dowel==0.0.4 +enum34==1.1.10 +fasteners==0.17.3 +fonttools==4.29.1 +future==0.18.2 +gitdb==4.0.9 +GitPython==3.1.27 +glfw==2.5.1 +google-auth-oauthlib==0.4.1 +gym==0.12.4 +gym-notices==0.0.5 +imageio==2.16.1 +importlib-metadata==4.11.2 +ipdb==0.13.9 
+ipython==7.33.0 +jedi==0.18.1 +Jinja2==3.0.3 +joblib==1.1.0 +kiwisolver==1.3.2 +MarkupSafe==2.1.0 +matplotlib==3.5.1 +matplotlib-inline==0.1.3 +mkl-fft==1.3.1 +mkl-service==2.4.0 +mock==4.0.3 +mpyq==0.2.5 +mujoco-py==2.1.2.14 +ninja==1.10.2.3 +opencv-python==4.5.5.64 +ordered-set==4.1.0 +packaging==21.3 +parso==0.8.3 +patchelf==0.14.5.0 +pathtools==0.1.2 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.0.1 +pipreqs==0.4.11 +portpicker==1.5.0 +promise==2.3 +prompt-toolkit==3.0.29 +protobuf==3.19.1 +psutil==5.9.0 +ptyprocess==0.7.0 +pyasn1-modules==0.2.8 +pygame==2.1.2 +pyglet==1.5.26 +Pygments==2.12.0 +pyparsing==3.0.7 +PySC2==3.0.0 +python-dateutil==2.8.2 +PyYAML==6.0 +requests-oauthlib==1.3.0 +s2clientprotocol==5.0.9.87702.0 +s2protocol==5.0.9.87702.0 +scikit-learn==1.0.2 +sentry-sdk==1.5.7 +setproctitle==1.2.2 +shortuuid==1.0.8 +sk-video==1.1.10 +sklearn==0.0 +smmap==5.0.0 +some-package==0.1 +tabulate==0.8.9 +tb-nightly==2.9.0a20220309 +tensorboard-plugin-wit==1.7.0 +tensorboardX==2.5 +termcolor==1.1.0 +threadpoolctl==3.1.0 +toml==0.10.2 +torch==1.10.2 +torchvision==0.11.3 +tqdm==4.64.0 +traitlets==5.1.1 +wandb==0.12.11 +wcwidth==0.2.5 +websocket-client==1.3.2 +whichcraft==0.6.1 +yarg==0.1.9 +yaspin==2.1.0 diff --git a/controllers/mat_rapid/runner/base_runner.py b/controllers/mat_rapid/runner/base_runner.py new file mode 100644 index 0000000..d867f19 --- /dev/null +++ b/controllers/mat_rapid/runner/base_runner.py @@ -0,0 +1,171 @@ +import wandb +import os +import numpy as np +import torch +from tensorboardX import SummaryWriter +from utils.shared_buffer import SharedReplayBuffer +from algorithms.mat.mat_trainer import MATTrainer as TrainAlgo +from algorithms.mat.algorithm.transformer_policy import TransformerPolicy as Policy + +def _t2n(x): + """Convert torch tensor to a numpy array.""" + return x.detach().cpu().numpy() + +class Runner(object): + """ + Base class for training recurrent policies. 
+ :param config: (dict) Config dictionary containing parameters for training. + """ + def __init__(self, config): + + self.all_args = config['all_args'] + self.envs = config['envs'] + self.eval_envs = config['eval_envs'] + self.device = config['device'] + self.num_agents = config['num_agents'] + if config.__contains__("render_envs"): + self.render_envs = config['render_envs'] + + # parameters + self.env_name = self.all_args.env_name + self.algorithm_name = self.all_args.algorithm_name + self.experiment_name = self.all_args.experiment_name + self.use_centralized_V = self.all_args.use_centralized_V + self.use_obs_instead_of_state = self.all_args.use_obs_instead_of_state + self.num_env_steps = self.all_args.num_env_steps + self.episode_length = self.all_args.episode_length + self.n_rollout_threads = self.all_args.n_rollout_threads + self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads + self.n_render_rollout_threads = self.all_args.n_render_rollout_threads + self.use_linear_lr_decay = self.all_args.use_linear_lr_decay + self.hidden_size = self.all_args.hidden_size + self.use_wandb = self.all_args.use_wandb + self.use_render = self.all_args.use_render + self.recurrent_N = self.all_args.recurrent_N + + # interval + self.save_interval = self.all_args.save_interval + self.use_eval = self.all_args.use_eval + self.eval_interval = self.all_args.eval_interval + self.log_interval = self.all_args.log_interval + + # dir + self.model_dir = self.all_args.model_dir + + if self.use_wandb: + self.save_dir = str(wandb.run.dir) + self.run_dir = str(wandb.run.dir) + else: + self.run_dir = config["run_dir"] + self.log_dir = str(self.run_dir / 'logs') + if not os.path.exists(self.log_dir): + os.makedirs(self.log_dir) + self.writter = SummaryWriter(self.log_dir) + self.save_dir = str(self.run_dir / 'models') + if not os.path.exists(self.save_dir): + os.makedirs(self.save_dir) + + share_observation_space = self.envs.share_observation_space[0] if self.use_centralized_V else 
self.envs.observation_space[0] + + print("obs_space: ", self.envs.observation_space) + print("share_obs_space: ", self.envs.share_observation_space) + print("act_space: ", self.envs.action_space) + + # policy network + self.policy = Policy(self.all_args, + self.envs.observation_space[0], + share_observation_space, + self.envs.action_space[0], + self.num_agents, + device=self.device) + + if self.model_dir is not None: + self.restore(self.model_dir) + + # algorithm + self.trainer = TrainAlgo(self.all_args, self.policy, self.num_agents, device=self.device) + + # buffer + self.buffer = SharedReplayBuffer(self.all_args, + self.num_agents, + self.envs.observation_space[0], + share_observation_space, + self.envs.action_space[0], + self.all_args.env_name) + + def run(self): + """Collect training data, perform training updates, and evaluate policy.""" + raise NotImplementedError + + def warmup(self): + """Collect warmup pre-training data.""" + raise NotImplementedError + + def collect(self, step): + """Collect rollouts for training.""" + raise NotImplementedError + + def insert(self, data): + """ + Insert data into buffer. + :param data: (Tuple) data to insert into training buffer. 
+ """ + raise NotImplementedError + + @torch.no_grad() + def compute(self): + """Calculate returns for the collected data.""" + self.trainer.prep_rollout() + if self.buffer.available_actions is None: + next_values = self.trainer.policy.get_values(np.concatenate(self.buffer.share_obs[-1]), + np.concatenate(self.buffer.obs[-1]), + np.concatenate(self.buffer.rnn_states_critic[-1]), + np.concatenate(self.buffer.masks[-1])) + else: + next_values = self.trainer.policy.get_values(np.concatenate(self.buffer.share_obs[-1]), + np.concatenate(self.buffer.obs[-1]), + np.concatenate(self.buffer.rnn_states_critic[-1]), + np.concatenate(self.buffer.masks[-1]), + np.concatenate(self.buffer.available_actions[-1])) + next_values = np.array(np.split(_t2n(next_values), self.n_rollout_threads)) + self.buffer.compute_returns(next_values, self.trainer.value_normalizer) + + def train(self): + """Train policies with data in buffer. """ + self.trainer.prep_training() + train_infos = self.trainer.train(self.buffer) + self.buffer.after_update() + return train_infos + + def save(self, episode): + """Save policy's actor and critic networks.""" + self.policy.save(self.save_dir, episode) + + def restore(self, model_dir): + """Restore policy's networks from a saved model.""" + self.policy.restore(model_dir) + + def log_train(self, train_infos, total_num_steps): + """ + Log training info. + :param train_infos: (dict) information about training update. + :param total_num_steps: (int) total number of training env steps. + """ + for k, v in train_infos.items(): + if self.use_wandb: + wandb.log({k: v}, step=total_num_steps) + else: + self.writter.add_scalars(k, {k: v}, total_num_steps) + + def log_env(self, env_infos, total_num_steps): + """ + Log env info. + :param env_infos: (dict) information about env state. + :param total_num_steps: (int) total number of training env steps. 
+ """ + for k, v in env_infos.items(): + if len(v)>0: + if self.use_wandb: + wandb.log({k: np.mean(v)}, step=total_num_steps) + else: + self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps) diff --git a/controllers/mat_rapid/runner/soccer_runner.py b/controllers/mat_rapid/runner/soccer_runner.py new file mode 100644 index 0000000..def5d50 --- /dev/null +++ b/controllers/mat_rapid/runner/soccer_runner.py @@ -0,0 +1,265 @@ +import time +import wandb +import numpy as np +import torch +from runner.base_runner import Runner + + +def _t2n(x): + return x.detach().cpu().numpy() + +class SoccerRunner(Runner): + """Runner class to perform training, evaluation. and data collection for SMAC. See parent class for details.""" + def __init__(self, config): + super(SoccerRunner, self).__init__(config) + + def run(self): + self.warmup() + + start = time.time() + episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads + + train_episode_rewards = [0 for _ in range(self.n_rollout_threads)] + done_episodes_rewards = [] + + #train_episode_scores = [0 for _ in range(self.n_rollout_threads)] + #done_episodes_scores = [] + + train_individual_rewards = [0 for _ in range(self.num_agents)] + done_individual_rewards = [] + + for episode in range(episodes): + if self.use_linear_lr_decay: + self.trainer.policy.lr_decay(episode, episodes) + + for step in range(self.episode_length): + # Sample actions + values, actions, action_log_probs, rnn_states, rnn_states_critic = self.collect(step) + + # Obser reward and next obs + + #obs, rewards, dones, infos, available_actions = self.envs.step(actions) + obs, rewards, dones, infos = self.envs.step(actions) + + dones_env = np.all(dones, axis=1) + reward_env = np.mean(rewards, axis=1).flatten() + train_episode_rewards += reward_env + + for agent_id in range(self.num_agents): + for info in infos: + if 'individual_reward' in info[agent_id].keys(): + train_individual_rewards[agent_id] += info[agent_id]['individual_reward'] + 
+ + #score_env = [t_info[0]["score_reward"] for t_info in infos] + #train_episode_scores += np.array(score_env) + for t in range(self.n_rollout_threads): + if dones_env[t]: + done_episodes_rewards.append(train_episode_rewards[t]) + train_episode_rewards[t] = 0 + #done_episodes_scores.append(train_episode_scores[t]) + #train_episode_scores[t] = 0 + done_individual_rewards.append(train_individual_rewards) + train_individual_rewards = [0 for _ in range(self.num_agents)] + + #data = obs, rewards, dones, infos, available_actions, \ + data = obs, rewards, dones, infos, \ + values, actions, action_log_probs, \ + rnn_states, rnn_states_critic + + # insert data into buffer + self.insert(data) + + # compute return and update network + self.compute() + train_infos = self.train() + + # post process + total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads + # save model + if (episode % self.save_interval == 0 or episode == episodes - 1): + self.save(episode) + + # log information + if episode % self.log_interval == 0: + end = time.time() + print("\n Scenario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n" + .format(self.all_args.scenario_name, + self.algorithm_name, + self.experiment_name, + episode, + episodes, + total_num_steps, + self.num_env_steps, + int(total_num_steps / (end - start)))) + + self.log_train(train_infos, total_num_steps) + + if len(done_episodes_rewards) > 0: + aver_episode_rewards = np.mean(done_episodes_rewards) + self.writter.add_scalars("train_episode_rewards", {"aver_rewards": aver_episode_rewards}, total_num_steps) + done_episodes_rewards = [] + + #aver_episode_scores = np.mean(done_episodes_scores) + #self.writter.add_scalars("train_episode_scores", {"aver_scores": aver_episode_scores}, total_num_steps) + #done_episodes_scores = [] + #print("some episodes done, average rewards: {}, scores: {}" + # .format(aver_episode_rewards, aver_episode_scores)) + print("some episodes done, average rewards: 
{}".format(aver_episode_rewards)) + + env_infos = {} + last_individual_rewards = done_individual_rewards[-1] + done_individual_rewards = [] + for agent_id in range(self.num_agents): + agent_k = 'agent%i/individual_rewards' % (agent_id+1) + env_infos[agent_k] = [last_individual_rewards[agent_id]] + self.log_env(env_infos, total_num_steps) + + + # eval + if episode % self.eval_interval == 0 and self.use_eval: + self.eval(total_num_steps) + + def warmup(self): + # reset env + obs = self.envs.reset() + + # replay buffer + if self.use_centralized_V: + share_obs = obs.reshape(self.n_rollout_threads, -1) + share_obs = np.expand_dims(share_obs, 1).repeat(self.num_agents, axis=1) + else: + share_obs = obs + + self.buffer.share_obs[0] = share_obs.copy() + self.buffer.obs[0] = obs.copy() + + @torch.no_grad() + def collect(self, step): + self.trainer.prep_rollout() + #value, action, action_log_prob, rnn_state, rnn_state_critic \ + # = self.trainer.policy.get_actions(np.concatenate(self.buffer.share_obs[step]), + # np.concatenate(self.buffer.obs[step]), + # np.concatenate(self.buffer.rnn_states[step]), + # np.concatenate(self.buffer.rnn_states_critic[step]), + # np.concatenate(self.buffer.masks[step]), + # np.concatenate(self.buffer.available_actions[step])) + value, action, action_log_prob, rnn_state, rnn_state_critic \ + = self.trainer.policy.get_actions(np.concatenate(self.buffer.share_obs[step]), + np.concatenate(self.buffer.obs[step]), + np.concatenate(self.buffer.rnn_states[step]), + np.concatenate(self.buffer.rnn_states_critic[step]), + np.concatenate(self.buffer.masks[step])) + + # [self.envs, agents, dim] + values = np.array(np.split(_t2n(value), self.n_rollout_threads)) + actions = np.array(np.split(_t2n(action), self.n_rollout_threads)) + action_log_probs = np.array(np.split(_t2n(action_log_prob), self.n_rollout_threads)) + rnn_states = np.array(np.split(_t2n(rnn_state), self.n_rollout_threads)) + rnn_states_critic = np.array(np.split(_t2n(rnn_state_critic), 
self.n_rollout_threads)) + + return values, actions, action_log_probs, rnn_states, rnn_states_critic + + def insert(self, data): + #obs, rewards, dones, infos, available_actions, \ + obs, rewards, dones, infos, \ + values, actions, action_log_probs, rnn_states, rnn_states_critic = data + + dones_env = np.all(dones, axis=1) + + rnn_states[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, self.recurrent_N, self.hidden_size), dtype=np.float32) + rnn_states_critic[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, *self.buffer.rnn_states_critic.shape[3:]), dtype=np.float32) + + masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32) + masks[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, 1), dtype=np.float32) + + active_masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32) + active_masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32) + active_masks[dones_env == True] = np.ones(((dones_env == True).sum(), self.num_agents, 1), dtype=np.float32) + + # bad_masks = np.array([[[0.0] if info[agent_id]['bad_transition'] else [1.0] for agent_id in range(self.num_agents)] for info in infos]) + + if self.use_centralized_V: + share_obs = obs.reshape(self.n_rollout_threads, -1) + share_obs = np.expand_dims(share_obs, 1).repeat(self.num_agents, axis=1) + else: + share_obs = obs + + #self.buffer.insert(share_obs, obs, rnn_states, rnn_states_critic, + # actions, action_log_probs, values, rewards, masks, None, active_masks, + # available_actions) + self.buffer.insert(share_obs, obs, rnn_states, rnn_states_critic, + actions, action_log_probs, values, rewards, masks, None, active_masks) + + + + def log_train(self, train_infos, total_num_steps): + train_infos["average_step_rewards"] = np.mean(self.buffer.rewards) + print("average_step_rewards is {}.".format(train_infos["average_step_rewards"])) + for k, v in train_infos.items(): + if 
self.use_wandb: + wandb.log({k: v}, step=total_num_steps) + else: + self.writter.add_scalars(k, {k: v}, total_num_steps) + + @torch.no_grad() + def eval(self, total_num_steps): + eval_episode = 0 + eval_episode_rewards = [] + one_episode_rewards = [0 for _ in range(self.all_args.eval_episodes)] + eval_episode_scores = [] + one_episode_scores = [0 for _ in range(self.all_args.eval_episodes)] + + eval_obs, eval_share_obs, ava = self.eval_envs.reset() + eval_rnn_states = np.zeros((self.all_args.eval_episodes, self.num_agents, self.recurrent_N, + self.hidden_size), dtype=np.float32) + eval_masks = np.ones((self.all_args.eval_episodes, self.num_agents, 1), dtype=np.float32) + + while True: + self.trainer.prep_rollout() + eval_actions, eval_rnn_states = \ + self.trainer.policy.act(np.concatenate(eval_share_obs), + np.concatenate(eval_obs), + np.concatenate(eval_rnn_states), + np.concatenate(eval_masks), + np.concatenate(ava), + deterministic=True) + eval_actions = np.array(np.split(_t2n(eval_actions), self.all_args.eval_episodes)) + eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), self.all_args.eval_episodes)) + + # Obser reward and next obs + eval_obs, eval_share_obs, eval_rewards, eval_dones, eval_infos, ava = self.eval_envs.step(eval_actions) + eval_rewards = np.mean(eval_rewards, axis=1).flatten() + one_episode_rewards += eval_rewards + + eval_scores = [t_info[0]["score_reward"] for t_info in eval_infos] + one_episode_scores += np.array(eval_scores) + + eval_dones_env = np.all(eval_dones, axis=1) + eval_rnn_states[eval_dones_env == True] = np.zeros(((eval_dones_env == True).sum(), self.num_agents, + self.recurrent_N, self.hidden_size), dtype=np.float32) + eval_masks = np.ones((self.all_args.eval_episodes, self.num_agents, 1), dtype=np.float32) + eval_masks[eval_dones_env == True] = np.zeros(((eval_dones_env == True).sum(), self.num_agents, 1), + dtype=np.float32) + + for eval_i in range(self.all_args.eval_episodes): + if eval_dones_env[eval_i]: + 
eval_episode += 1 + eval_episode_rewards.append(one_episode_rewards[eval_i]) + one_episode_rewards[eval_i] = 0 + + eval_episode_scores.append(one_episode_scores[eval_i]) + one_episode_scores[eval_i] = 0 + + if eval_episode >= self.all_args.eval_episodes: + key_average = '/eval_average_episode_rewards' + key_max = '/eval_max_episode_rewards' + key_scores = '/eval_average_episode_scores' + eval_env_infos = {key_average: eval_episode_rewards, + key_max: [np.max(eval_episode_rewards)], + key_scores: eval_episode_scores} + self.log_env(eval_env_infos, total_num_steps) + + print("eval average episode rewards: {}, scores: {}." + .format(np.mean(eval_episode_rewards), np.mean(eval_episode_scores))) + break diff --git a/controllers/mat_rapid/soccer/env_wrappers.py b/controllers/mat_rapid/soccer/env_wrappers.py new file mode 100644 index 0000000..12cf941 --- /dev/null +++ b/controllers/mat_rapid/soccer/env_wrappers.py @@ -0,0 +1,831 @@ +""" +Modified from OpenAI Baselines code to work with multi-agent envs +""" +import numpy as np +import torch +from multiprocessing import Process, Pipe +from abc import ABC, abstractmethod +from utils.util import tile_images + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) + """ + + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class ShareVecEnv(ABC): + """ + An abstract asynchronous, vectorized environment. + Used to batch data from multiple copies of an environment, so that + each observation becomes an batch of observations, and expected action is a batch of actions to + be applied per-environment. 
+ """ + closed = False + viewer = None + + metadata = { + 'render.modes': ['human', 'rgb_array'] + } + + def __init__(self, num_envs, observation_space, share_observation_space, action_space): + self.num_envs = num_envs + self.observation_space = observation_space + self.share_observation_space = share_observation_space + self.action_space = action_space + + @abstractmethod + def reset(self): + """ + Reset all the environments and return an array of + observations, or a dict of observation arrays. + + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. + """ + pass + + @abstractmethod + def step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + + You should not call this if a step_async run is + already pending. + """ + pass + + @abstractmethod + def step_wait(self): + """ + Wait for the step taken with step_async(). + + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a dict of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + pass + + def close_extras(self): + """ + Clean up the extra resources, beyond what's in this base class. + Only runs when not self.closed. + """ + pass + + def close(self): + if self.closed: + return + if self.viewer is not None: + self.viewer.close() + self.close_extras() + self.closed = True + + def step(self, actions): + """ + Step the environments synchronously. + + This is available for backwards compatibility. 
+ """ + self.step_async(actions) + return self.step_wait() + + def render(self, mode='human'): + imgs = self.get_images() + bigimg = tile_images(imgs) + if mode == 'human': + self.get_viewer().imshow(bigimg) + return self.get_viewer().isopen + elif mode == 'rgb_array': + return bigimg + else: + raise NotImplementedError + + def get_images(self): + """ + Return RGB images from each environment + """ + raise NotImplementedError + + @property + def unwrapped(self): + if isinstance(self, VecEnvWrapper): + return self.venv.unwrapped + else: + return self + + def get_viewer(self): + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.SimpleImageViewer() + return self.viewer + + +def worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if 'bool' in done.__class__.__name__: + if done: + ob = env.reset() + else: + if np.all(done): + ob = env.reset() + + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send((ob)) + elif cmd == 'render': + if data == "rgb_array": + fr = env.render(mode=data) + remote.send(fr) + elif data == "human": + env.render(mode=data) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) + elif cmd == 'close': + env.close() + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.share_observation_space, env.action_space)) + else: + raise NotImplementedError + + +class GuardSubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, 
env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = False # could cause zombie process + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, share_observation_space, action_space = self.remotes[0].recv() + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + for remote in self.remotes: + remote.send(('reset', None)) + obs = [remote.recv() for remote in self.remotes] + return np.stack(obs) + + def reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + +class SubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + 
observation_space, share_observation_space, action_space = self.remotes[0].recv() + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + for remote in self.remotes: + remote.send(('reset', None)) + obs = [remote.recv() for remote in self.remotes] + return np.stack(obs) + + + def reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def render(self, mode="rgb_array"): + for remote in self.remotes: + remote.send(('render', mode)) + if mode == "rgb_array": + frame = [remote.recv() for remote in self.remotes] + return np.stack(frame) + + +def shareworker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, s_ob, reward, done, info, available_actions = env.step(data) + if 'bool' in done.__class__.__name__: + if done: + ob, s_ob, available_actions = env.reset() + else: + if np.all(done): + ob, s_ob, available_actions = env.reset() + + remote.send((ob, s_ob, reward, done, info, available_actions)) + elif cmd == 'reset': + ob, s_ob, available_actions = env.reset() + remote.send((ob, s_ob, available_actions)) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) + elif cmd == 'render': + if data == "rgb_array": + fr = 
env.render(mode=data) + remote.send(fr) + elif data == "human": + env.render(mode=data) + elif cmd == 'close': + env.close() + remote.close() + break + elif cmd == 'get_num_agents': + remote.send((env.n_agents)) + elif cmd == 'get_spaces': + remote.send( + (env.observation_space, env.share_observation_space, env.action_space)) + elif cmd == 'render_vulnerability': + fr = env.render_vulnerability(data) + remote.send((fr)) + else: + raise NotImplementedError + + +class ShareSubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=shareworker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang + p.start() + for remote in self.work_remotes: + remote.close() + self.remotes[0].send(('get_num_agents', None)) + self.n_agents = self.remotes[0].recv() + self.remotes[0].send(('get_spaces', None)) + observation_space, share_observation_space, action_space = self.remotes[0].recv( + ) + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, share_obs, rews, dones, infos, available_actions = zip(*results) + return np.stack(obs), np.stack(share_obs), np.stack(rews), np.stack(dones), infos, np.stack(available_actions) + + def reset(self): + for remote in self.remotes: + remote.send(('reset', None)) + results = [remote.recv() for remote in self.remotes] 
+ obs, share_obs, available_actions = zip(*results) + return np.stack(obs), np.stack(share_obs), np.stack(available_actions) + + def reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + +def choosesimpleworker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset(data) + remote.send((ob)) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) + elif cmd == 'close': + env.close() + remote.close() + break + elif cmd == 'render': + if data == "rgb_array": + fr = env.render(mode=data) + remote.send(fr) + elif data == "human": + env.render(mode=data) + elif cmd == 'get_spaces': + remote.send( + (env.observation_space, env.share_observation_space, env.action_space)) + else: + raise NotImplementedError + + +class ChooseSimpleSubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=choosesimpleworker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang + p.start() + for remote in self.work_remotes: + remote.close() + self.remotes[0].send(('get_spaces', None)) + observation_space, 
share_observation_space, action_space = self.remotes[0].recv() + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self, reset_choose): + for remote, choose in zip(self.remotes, reset_choose): + remote.send(('reset', choose)) + obs = [remote.recv() for remote in self.remotes] + return np.stack(obs) + + def render(self, mode="rgb_array"): + for remote in self.remotes: + remote.send(('render', mode)) + if mode == "rgb_array": + frame = [remote.recv() for remote in self.remotes] + return np.stack(frame) + + def reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + +def chooseworker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, s_ob, reward, done, info, available_actions = env.step(data) + remote.send((ob, s_ob, reward, done, info, available_actions)) + elif cmd == 'reset': + ob, s_ob, available_actions = env.reset(data) + remote.send((ob, s_ob, available_actions)) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) + elif cmd == 'close': + env.close() + remote.close() + break + elif cmd == 'render': + remote.send(env.render(mode='rgb_array')) + elif cmd == 'get_spaces': + remote.send( + (env.observation_space, 
env.share_observation_space, env.action_space)) + else: + raise NotImplementedError + + +class ChooseSubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=chooseworker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = True # if the main process crashes, we should not cause things to hang + p.start() + for remote in self.work_remotes: + remote.close() + self.remotes[0].send(('get_spaces', None)) + observation_space, share_observation_space, action_space = self.remotes[0].recv( + ) + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, share_obs, rews, dones, infos, available_actions = zip(*results) + return np.stack(obs), np.stack(share_obs), np.stack(rews), np.stack(dones), infos, np.stack(available_actions) + + def reset(self, reset_choose): + for remote, choose in zip(self.remotes, reset_choose): + remote.send(('reset', choose)) + results = [remote.recv() for remote in self.remotes] + obs, share_obs, available_actions = zip(*results) + return np.stack(obs), np.stack(share_obs), np.stack(available_actions) + + def reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: 
+ remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + +def chooseguardworker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset(data) + remote.send((ob)) + elif cmd == 'reset_task': + ob = env.reset_task() + remote.send(ob) + elif cmd == 'close': + env.close() + remote.close() + break + elif cmd == 'get_spaces': + remote.send( + (env.observation_space, env.share_observation_space, env.action_space)) + else: + raise NotImplementedError + + +class ChooseGuardSubprocVecEnv(ShareVecEnv): + def __init__(self, env_fns, spaces=None): + """ + envs: list of gym environments to run in subprocesses + """ + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + self.ps = [Process(target=chooseguardworker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for p in self.ps: + p.daemon = False # if the main process crashes, we should not cause things to hang + p.start() + for remote in self.work_remotes: + remote.close() + self.remotes[0].send(('get_spaces', None)) + observation_space, share_observation_space, action_space = self.remotes[0].recv( + ) + ShareVecEnv.__init__(self, len(env_fns), observation_space, + share_observation_space, action_space) + + def step_async(self, actions): + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def step_wait(self): + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self, reset_choose): + for remote, choose in 
zip(self.remotes, reset_choose):
            remote.send(('reset', choose))
        obs = [remote.recv() for remote in self.remotes]
        return np.stack(obs)

    def reset_task(self):
        for remote in self.remotes:
            remote.send(('reset_task', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        if self.closed:
            return
        # Drain any pending step results before asking workers to exit.
        if self.waiting:
            for remote in self.remotes:
                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        self.closed = True


# single env
class DummyVecEnv(ShareVecEnv):
    # Serial (in-process) vec env: same interface as the subprocess versions
    # but steps each env in a plain Python loop — useful for debugging.
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        env = self.envs[0]
        ShareVecEnv.__init__(self, len(
            env_fns), env.observation_space, env.share_observation_space, env.action_space)
        # Actions are stashed by step_async and consumed by step_wait.
        self.actions = None

    def step_async(self, actions):
        self.actions = actions

    def step_wait(self):
        results = [env.step(a) for (a, env) in zip(self.actions, self.envs)]
        obs, rews, dones, infos = map(np.array, zip(*results))

        # Auto-reset finished envs; the class-name check distinguishes a scalar
        # done (bool / np.bool_) from a per-agent done array.
        for (i, done) in enumerate(dones):
            if 'bool' in done.__class__.__name__:
                if done:
                    obs[i] = self.envs[i].reset()
            else:
                if np.all(done):
                    obs[i] = self.envs[i].reset()

        self.actions = None
        return obs, rews, dones, infos

    def reset(self):
        obs = [env.reset() for env in self.envs]
        return np.array(obs)

    def close(self):
        for env in self.envs:
            env.close()

    def render(self, mode="human"):
        if mode == "rgb_array":
            return np.array([env.render(mode=mode) for env in self.envs])
        elif mode == "human":
            for env in self.envs:
                env.render(mode=mode)
        else:
            raise NotImplementedError



class ShareDummyVecEnv(ShareVecEnv):
    # Serial vec env variant that also carries shared observations and
    # available-action masks through step/reset.
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]
        env = self.envs[0]
        self.n_agents = env.n_agents
        ShareVecEnv.__init__(self, len(
            env_fns), env.observation_space, env.share_observation_space, env.action_space)
        self.actions = None

    def step_async(self, actions):
        self.actions = actions

def step_wait(self): + results = [env.step(a) for (a, env) in zip(self.actions, self.envs)] + obs, share_obs, rews, dones, infos, available_actions = map( + np.array, zip(*results)) + + for (i, done) in enumerate(dones): + if 'bool' in done.__class__.__name__: + if done: + obs[i], share_obs[i], available_actions[i] = self.envs[i].reset() + else: + if np.all(done): + obs[i], share_obs[i], available_actions[i] = self.envs[i].reset() + self.actions = None + + return obs, share_obs, rews, dones, infos, available_actions + + def reset(self): + results = [env.reset() for env in self.envs] + obs, share_obs, available_actions = map(np.array, zip(*results)) + return obs, share_obs, available_actions + + def close(self): + for env in self.envs: + env.close() + + def save_replay(self): + for env in self.envs: + env.save_replay() + + def render(self, mode="human"): + if mode == "rgb_array": + return np.array([env.render(mode=mode) for env in self.envs]) + elif mode == "human": + for env in self.envs: + env.render(mode=mode) + else: + raise NotImplementedError + + +class ChooseDummyVecEnv(ShareVecEnv): + def __init__(self, env_fns): + self.envs = [fn() for fn in env_fns] + env = self.envs[0] + ShareVecEnv.__init__(self, len( + env_fns), env.observation_space, env.share_observation_space, env.action_space) + self.actions = None + + def step_async(self, actions): + self.actions = actions + + def step_wait(self): + results = [env.step(a) for (a, env) in zip(self.actions, self.envs)] + obs, share_obs, rews, dones, infos, available_actions = map( + np.array, zip(*results)) + self.actions = None + return obs, share_obs, rews, dones, infos, available_actions + + def reset(self, reset_choose): + results = [env.reset(choose) + for (env, choose) in zip(self.envs, reset_choose)] + obs, share_obs, available_actions = map(np.array, zip(*results)) + return obs, share_obs, available_actions + + def close(self): + for env in self.envs: + env.close() + + def render(self, mode="human"): + if 
mode == "rgb_array": + return np.array([env.render(mode=mode) for env in self.envs]) + elif mode == "human": + for env in self.envs: + env.render(mode=mode) + else: + raise NotImplementedError + +class ChooseSimpleDummyVecEnv(ShareVecEnv): + def __init__(self, env_fns): + self.envs = [fn() for fn in env_fns] + env = self.envs[0] + ShareVecEnv.__init__(self, len( + env_fns), env.observation_space, env.share_observation_space, env.action_space) + self.actions = None + + def step_async(self, actions): + self.actions = actions + + def step_wait(self): + results = [env.step(a) for (a, env) in zip(self.actions, self.envs)] + obs, rews, dones, infos = map(np.array, zip(*results)) + self.actions = None + return obs, rews, dones, infos + + def reset(self, reset_choose): + obs = [env.reset(choose) + for (env, choose) in zip(self.envs, reset_choose)] + return np.array(obs) + + def close(self): + for env in self.envs: + env.close() + + def render(self, mode="human"): + if mode == "rgb_array": + return np.array([env.render(mode=mode) for env in self.envs]) + elif mode == "human": + for env in self.envs: + env.render(mode=mode) + else: + raise NotImplementedError diff --git a/controllers/mat_rapid/soccer/multiagentenv.py b/controllers/mat_rapid/soccer/multiagentenv.py new file mode 100644 index 0000000..2a1f4dd --- /dev/null +++ b/controllers/mat_rapid/soccer/multiagentenv.py @@ -0,0 +1,82 @@ +from collections import namedtuple +import numpy as np + + +def convert(dictionary): + return namedtuple('GenericDict', dictionary.keys())(**dictionary) + +class MultiAgentEnv(object): + + def __init__(self, **kwargs): + # Unpack arguments from sacred + args = kwargs["env_args"] + if isinstance(args, dict): + args = convert(args) + self.args = args + + if getattr(args, "seed", None) is not None: + self.seed = args.seed + self.rs = np.random.RandomState(self.seed) # initialise numpy random state + + def step(self, actions): + """ Returns reward, terminated, info """ + raise 
NotImplementedError + + def get_obs(self): + """ Returns all agent observations in a list """ + raise NotImplementedError + + def get_obs_agent(self, agent_id): + """ Returns observation for agent_id """ + raise NotImplementedError + + def get_obs_size(self): + """ Returns the shape of the observation """ + raise NotImplementedError + + def get_state(self): + raise NotImplementedError + + def get_state_size(self): + """ Returns the shape of the state""" + raise NotImplementedError + + def get_avail_actions(self): + raise NotImplementedError + + def get_avail_agent_actions(self, agent_id): + """ Returns the available actions for agent_id """ + raise NotImplementedError + + def get_total_actions(self): + """ Returns the total number of actions an agent could ever take """ + # TODO: This is only suitable for a discrete 1 dimensional action space for each agent + raise NotImplementedError + + def get_stats(self): + raise NotImplementedError + + # TODO: Temp hack + def get_agg_stats(self, stats): + return {} + + def reset(self): + """ Returns initial observations and states""" + raise NotImplementedError + + def render(self): + raise NotImplementedError + + def close(self): + raise NotImplementedError + + def seed(self, seed): + raise NotImplementedError + + def get_env_info(self): + env_info = {"state_shape": self.get_state_size(), + "obs_shape": self.get_obs_size(), + "n_actions": self.get_total_actions(), + "n_agents": self.n_agents, + "episode_limit": self.episode_limit} + return env_info diff --git a/controllers/mat_rapid/soccer/player.py b/controllers/mat_rapid/soccer/player.py new file mode 100644 index 0000000..e261c87 --- /dev/null +++ b/controllers/mat_rapid/soccer/player.py @@ -0,0 +1,92 @@ +import numpy as np +from controller import Supervisor +import math + +class Player(): + kick_vel = 0.0 + + def __init__(self, name = None, supervisor = None): + super().__init__() + self.name = name + self.supervisor = supervisor + self.player = None + self.emitter = None 
+ self.waiting_time = 0 + self.dx = 0 + self.dy = 0 + self.dthe = 0 + + def reset(self, pos = [0.0, 0.0, 0.0]): + children = self.supervisor.getRoot().getField('children') + if self.player != None: + self.player.remove() + if "blue" in self.name: + ch = int(self.name[-1]) + children.importMFNodeFromString(-1, f'DEF {self.name} GankenKun_box {{translation {pos[0]} {pos[1]} 0.300 rotation 0 0 1 {pos[2]} jerseyTexture "textures/GankenKun_{self.name}.png" jerseyColor 0, 0, 1 channel {ch} controller "void"}}') + else: + ch = int(self.name[-1])+3 + children.importMFNodeFromString(-1, f'DEF {self.name} GankenKun_box {{translation {pos[0]} {pos[1]} 0.300 rotation 0 0 1 {pos[2]} jerseyTexture "textures/GankenKun_{self.name}.png" jerseyColor 1, 0, 0 channel {ch} controller "void"}}') + self.pos = pos + self.emitter = self.supervisor.getDevice(f'{self.name}_emitter') + self.player = self.supervisor.getFromDef(f'{self.name}') + self.player_pos = self.player.getField('translation') + self.player_rot = self.player.getField('rotation') + + self.alive = True + self.score = 0 + self.action = 0 + self.is_fall = False + self.is_replace = False + self.waiting_time = 1 + + def move(self, pos = [0.0, 0.0, 0.0]): + self.player.resetPhysics() + self.player_pos.setSFVec3f([pos[0], pos[1], 0.450]) + self.player_rot.setSFRotation([0, 0, 1, pos[2]]) + + def send(self, message): + if self.waiting_time > 0: + self.waiting_time -= 1 + return + if "kick" in message.decode('utf-8'): + self.waiting_time = 4 + message_parts = message.decode('utf-8').split(',') + if message_parts[0] == "walk": + self.dx, self.dy, self.dthe = float(message_parts[1])*0.01, float(message_parts[2])*0.01, float(message_parts[3])*0.01 + self.kick_vel = 0.0 + if "kick" in message.decode('utf-8'): + self.dx, self.dy, self.dthe = 0.0, 0.0, 0.0 + self.kick_vel = 2.0 + + def update(self): + #self.action = action + x, y, z = self.player_pos.getSFVec3f() + yaw, pitch, roll = self.rotation_to_euler(self.player_rot.getSFRotation()) 
+ self.pos = [x, y, yaw] + if abs(pitch) > 1.0 or abs(roll) > 1.0: + self.is_fall = True + else: + self.is_fall = False + x += self.dx * math.cos(yaw) - self.dy * math.sin(yaw) + y += self.dx * math.sin(yaw) + self.dy * math.cos(yaw) + self.player_pos.setSFVec3f([x, y, z]) + self.player_rot.setSFRotation([0, 0, 1, yaw + self.dthe]) + self.player.setVelocity([0, 0, 0, 0, 0, 0]) + + def is_done(self): + return not self.alive + + def rotation_to_euler(self, rotation): + x, y, z, angle = rotation + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + R = np.array([ + [t*x*x + c, t*x*y - z*s, t*x*z + y*s], + [t*x*y + z*s, t*y*y + c, t*y*z - x*s], + [t*x*z - y*s, t*y*z + x*s, t*z*z + c] + ]) + yaw = np.arctan2(R[1, 0], R[0, 0]) + pitch = np.arctan2(-R[2, 0], np.sqrt(R[2, 1]**2 + R[2, 2]**2)) + roll = np.arctan2(R[2, 1], R[2, 2]) + return yaw, pitch, roll diff --git a/controllers/mat_rapid/soccer/soccer.py b/controllers/mat_rapid/soccer/soccer.py new file mode 100644 index 0000000..cf3d1de --- /dev/null +++ b/controllers/mat_rapid/soccer/soccer.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 + +import numpy as np +import math +import copy +import random + +from controller import Supervisor + +from gymnasium.spaces import Box, Discrete, Sequence +from gymnasium.utils import EzPickle, seeding + +from pettingzoo import AECEnv +from pettingzoo.utils import wrappers +from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.conversions import parallel_wrapper_fn + +from soccer.player import Player + +__all__ = ["env", "parallel_env", "raw_env"] + +def env(**kwargs): + env = raw_env(**kwargs) + env = wrappers.AssertOutOfBoundsWrapper(env) + env = wrappers.OrderEnforcingWrapper(env) + return env + +parallel_env = parallel_wrapper_fn(env) + +def normalize_angle_rad(angle): + while angle > math.pi: + angle -= 2.0 * math.pi + while angle <= -math.pi: + angle += 2.0 * math.pi + return angle + +class raw_env(AECEnv, EzPickle): + metadata = { + "render_modes": 
["human", "rgb_array"], + "name": "soccer_v0", + "is_parallelizable": True, + } + + supervisor = None + + def __init__(self, max_cycles=300, render_mode=None): + EzPickle.__init__(self, max_cycles=max_cycles, render_mode=render_mode) + if self.supervisor == None: + self.supervisor = Supervisor() + self.time_step = int(self.supervisor.getBasicTimeStep()) + + self.frames = 0 + self.render_mode = render_mode + self._seed() + self.max_cycles = max_cycles + self.out_agent = [] + self.agent_name_mapping = {} + self.agent_dict = {} + self.kill_list = [] + self.agent_list = [] + #self.agents = ["blue1", "blue2", "blue3", "red1", "red2", "red3"] + self.agents = ["blue1", "blue2", "blue3"] + self.dead_agents = [] + for i in range(len(self.agents)): + self.agent_name_mapping[self.agents[i]] = i + self.agent_list.append(Player(self.agents[i], self.supervisor)) + #obs_space = Box(low=-5, high=5, shape = ([15]), dtype=np.float16) + obs_space = Box(low=-5, high=5, shape = ([9]), dtype=np.float32) + self.observation_spaces = dict(zip(self.agents, [obs_space for _ in enumerate(self.agents)])) + self.action_spaces = dict(zip(self.agents, [Discrete(9) for _ in enumerate(self.agents)])) + self.actions = ["walk,1,0,0", "walk,-1,0,0", "walk,0,1,0", "walk,0,-1,0", "walk,0,0,1", "walk,0,0,-1", "motion,left_kick", "motion,right_kick", "walk,0,0,0"] + self.state_space = Box(low=-5, high=5, shape = ([21]), dtype=np.float32) + + self.possible_agents = copy.deepcopy(self.agents) + self._agent_selector = agent_selector(self.agents) + + self.reinit() + + def __del__(self): + print("DELETE") + + def observation_space(self, agent): + return self.observation_spaces[agent] + + def action_space(self, agent): + return self.action_spaces[agent] + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + + def observe(self, agent): + i = self.agent_name_mapping[agent] + state = self.state() + ball_x, ball_y = [state[0], state[1]] + bx, by, bthe = state[i*3+3], state[i*3+4], 
state[i*3+5] + s, c = math.sin(bthe), math.cos(bthe) + blx, bly = ball_x - bx, ball_y - by + x, y = blx * c + bly * s, - blx * s + bly * c + angle = math.degrees(math.atan2(y, x)) + obs = [x, y] if (abs(angle) < 60) else [-100, -100] + obs += [bx, by, bthe] + no_agent = len(self.possible_agents) + base_index = list(range(no_agent)) + if agent.startswith("red"): + index = base_index[int(no_agent/2):] + base_index[:int(no_agent/2)] + else: + index = base_index + index.remove(i) + for j in index: + rx, ry = state[j*3+3], state[j*3+4] + lx, ly = rx - bx, ry - by + x, y = lx * c + ly * s, - lx * s + ly * c + obs += [x, y] + if agent.startswith("red"): + obs[2] = -obs[2] + obs[3] = -obs[3] + obs[4] = normalize_angle_rad(obs[4]+math.pi) + return obs + + def state(self): + ball_x, ball_y, _ = self.ball_pos.getSFVec3f() + #for agent in self.agent_list: + # agent.update() + player = [] + for i in range(len(self.agent_list)): + player.append(self.agent_list[i].pos) + #state = [ball_x, ball_y, 0, player[0][0], player[0][1], player[0][2], player[1][0], player[1][1], player[1][2], player[2][0], player[2][1], player[2][2], player[3][0], player[3][1], player[3][2], player[4][0], player[4][1], player[4][2], player[5][0], player[5][1], player[5][2]] + state = [ball_x, ball_y, 0, player[0][0], player[0][1], player[0][2], player[1][0], player[1][1], player[1][2], player[2][0], player[2][1], player[2][2]] + return state + + def step(self, action): + if self.terminations[self.agent_selection] or self.truncations[self.agent_selection]: + self._was_dead_step(action) + return + self._cumulative_rewards[self.agent_selection] = 0 + agent = self.agent_list[self.agent_name_mapping[self.agent_selection]] + agent.score = 0 + + terminate = False + truncate = False + goal = False + + #print("frames: "+str(self.frames)) + + i = self.agent_name_mapping[self.agent_selection] + if self.agent_list[i].is_fall: + while True: + if self.agents[i].startswith("blue"): + x, y = random.uniform(-4.0, -3.0), 
random.uniform(-2.5, 2.5) + elif self.agents[i].startswith("red"): + x, y = random.uniform(4.0, 3.0), random.uniform(-2.5, 2.5) + near_robot = False + for j in range(i): + robot_x, robot_y, _ = self.agent_list[j].pos + length = math.sqrt((x-robot_x)**2+(y-robot_y)**2) + if length < 1: + near_robot = True + break + if near_robot == False: + break + self.init_pos[i][0], self.init_pos[i][1] = x, y + self.agent_list[i].move(self.init_pos[i]) + self.agent_list[i].is_replace = True + else: + message = self.actions[action].encode('utf-8') + agent.send(message) + if "kick" in message.decode('utf-8'): + ball_x, ball_y, _ = self.ball_pos.getSFVec3f() + bx, by, bthe = agent.pos + s, c = math.sin(bthe), math.cos(bthe) + blx, bly = ball_x - bx, ball_y - by + x, y = blx * c + bly * s, - blx * s + bly * c + if 0.1 < x < 0.25: + if ("left" in message.decode('utf-8') and 0.0 < y < 0.1) or ("right" in message.decode('utf-8') and -0.1 < y < 0.0): + vel = agent.kick_vel + self.ball.setVelocity([vel*c, vel*s, 0, 0, 0, 0]) + + if self._agent_selector.is_last(): + self.frames += 1 + # status update and calculate reward + self._clear_rewards() + for i in range(4): + self.supervisor.step(self.time_step) + ball_x, ball_y, _ = self.ball_pos.getSFVec3f() + ball_vel_x, ball_vel_y = self.ball.getVelocity()[:2] + for agent in self.agents: + self.agent_list[self.agent_name_mapping[agent]].update() + x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos + length = math.sqrt((x-ball_x)**2+(y-ball_y)**2) + #self.rewards[agent] += 0.2/length/40 + if length < 1.0: + if agent.startswith("blue"): + ball_dx, ball_dy = 4.5 - ball_x, 0 - ball_y + ball_len = math.sqrt(ball_dx**2+ball_dy**2) + ball_dx, ball_dy = ball_dx / ball_len, ball_dy / ball_len + reward = ball_vel_x * ball_dx + ball_vel_y * ball_dy + self.rewards[agent] += max(reward, 0) * 10 + elif agent.startswith("red"): + ball_dx, ball_dy = 4.5 - ( -ball_x), 0 - (-ball_y) + ball_len = math.sqrt(ball_dx**2+ball_dy**2) + ball_dx, ball_dy = 
ball_dx / ball_len, ball_dy / ball_len + reward = (-ball_vel_x) * ball_dx + (-ball_vel_y) * ball_dy + self.rewards[agent] += max(reward, 0) * 10 + for agent in self.agents: + # out of field penalty + x, y, the = self.agent_list[self.agent_name_mapping[agent]].pos + if abs(x) > 5.0 or abs(y) > 3.5: + self.rewards[agent] += -0.4 + # hit other robot penalty + for other_agent in self.agents: + if other_agent == agent: + continue + xo, yo, _ = self.agent_list[self.agent_name_mapping[other_agent]].pos + if math.sqrt((x-xo)**2+(y-yo)**2) < 0.3: + self.rewards[agent] = -1.0 + # fall penalty + if self.agent_list[self.agent_name_mapping[agent]].is_replace: + self.rewards[agent] += -1 + self.agent_list[self.agent_name_mapping[agent]].is_replace = False + print("reward(fall): "+str(agent)+" "+str(self.rewards[agent])) + + # global rewards + if ball_x > 4.5 and abs(ball_y) < 1.3: + goal = True + truncate = True + if agent.startswith("blue"): + self.rewards[agent] += 1000 + elif agent.startswith("red"): + self.rewards[agent] += -1000 + print("Team blue Goal, reward: "+str(agent)+" "+str(self.rewards[agent])) + elif ball_x < -4.5 and abs(ball_y) < 1.3: + goal = True + truncate = True + if agent.startswith("blue"): + self.rewards[agent] += -1000 + elif agent.startswith("red"): + self.rewards[agent] += 1000 + print("Team red Goal, reward: "+str(agent)+" "+str(self.rewards[agent])) + + for agent in self.agents: + self.total_rewards[agent] += self.rewards[agent] + + if not goal: + if abs(ball_x) > 4.5 or abs(ball_y) > 3.0: + print("The ball out of the field") + y = random.uniform(-2.5, 2.5) + self.ball.resetPhysics() + self.ball_pos.setSFVec3f([0, y, 0]) + + if self.frames >= self.max_cycles: + truncate = True + self.terminations = {a: terminate for a in self.agents} + self.truncations = {a: truncate for a in self.agents} + if truncate: + for agent in self.agents: + self.infos[agent]["episode"] = {"r": self.total_rewards[agent], "l": self.max_cycles} + + if 
self._agent_selector.is_last(): + _live_agents = self.agents[:] + for k in self.kill_list: + _live_agents.remove(k) + self.terminations[k] = True + self.dead_agents.append(k) + self.kill_list = [] + self._agent_selector.reinit(_live_agents) + + if len(self._agent_selector.agent_order): + self.agent_selection = self._agent_selector.next() + + self._accumulate_rewards() + self._deads_step_first() + + def render(): + pass + + def reinit(self): + self.score = 0 + self.run = True + children = self.supervisor.getRoot().getField('children') + + try: + self.ball + except: + pass + else: + self.ball.remove() + + y = random.uniform(-2.5, 2.5) + children.importMFNodeFromString(-1, f'DEF BALL RobocupSoccerBall {{ translation 0 {y} 0.1 size 1 }}') + self.ball = self.supervisor.getFromDef('BALL') + self.ball_pos = self.ball.getField('translation') + self.init_pos = [[-0.3, 0, 0], [-2, -1, 0], [-2, 1, 0], [1, 0, 3.14], [2, -1, 3.14], [2, 1, 3.14]] + for i in range(len(self.agent_list)): + while True: + if self.agents[i].startswith("blue"): + x, y = random.uniform(-4.0, -1.0), random.uniform(-2.5, 2.5) + elif self.agents[i].startswith("red"): + x, y = random.uniform(4.0, 1.0), random.uniform(-2.5, 2.5) + near_robot = False + for j in range(i): + length = math.sqrt((x-self.init_pos[j][0])**2+(y-self.init_pos[j][1])**2) + if length < 1: + near_robot = True + break + if near_robot == False: + break + self.init_pos[i][0], self.init_pos[i][1] = x, y + self.agent_list[i].reset(self.init_pos[i]) + self.frames = 0 + + def reset(self, seed = None, options = None): + if seed is not None: + self._seed(seed=seed) + self.agents = copy.deepcopy(self.possible_agents) + self._agent_selector.reinit(self.agents) + self.agent_selection = self._agent_selector.next() + self.rewards = dict(zip(self.agents, [0 for _ in self.agents])) + self.total_rewards = dict(zip(self.agents, [0 for _ in self.agents])) + self._cumulative_rewards = {a: 0 for a in self.agents} + self.terminations = dict(zip(self.agents, 
[False for _ in self.agents])) + self.truncations = dict(zip(self.agents, [False for _ in self.agents])) + self.infos = dict(zip(self.agents, [{} for _ in self.agents])) + self.reinit() diff --git a/controllers/mat_rapid/soccer/soccer_env.py b/controllers/mat_rapid/soccer/soccer_env.py new file mode 100644 index 0000000..070c1b9 --- /dev/null +++ b/controllers/mat_rapid/soccer/soccer_env.py @@ -0,0 +1,116 @@ +from functools import partial +import gym +from gym.spaces import Box +from gym.wrappers import TimeLimit +import numpy as np +#import gfootball.env as football_env +import soccer_v0 +#from .encode.obs_encode import FeatureEncoder +#from .encode.rew_encode import Rewarder + +from soccer.multiagentenv import MultiAgentEnv + + +class SoccerEnv(MultiAgentEnv): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.env = soccer_v0.parallel_env(max_cycles=kwargs["env_args"]["episode_length"]) + #self.scenario = kwargs["env_args"]["scenario"] + self.env.reset() + self.n_agents = self.env.num_agents + self.agents = self.env.agents + #self.reward_type = kwargs["env_args"]["reward"] + + #self.feature_encoder = FeatureEncoder() + #self.reward_encoder = Rewarder() + self.action_space = [gym.spaces.Discrete(self.env.action_space(agent).n) for agent in self.agents] + + tmp_obs_dicts, _ = self.env.reset() + #tmp_obs = [self._encode_obs(obs_dict)[0] for obs_dict in tmp_obs_dicts] + tmp_obs = np.hstack([np.array(tmp_obs_dicts[k], dtype=np.float32).flatten() for k in sorted(tmp_obs_dicts)]) + #self.observation_space = [Box(low=float("-inf"), high=float("inf"), shape=tmp_obs[n].shape, dtype=np.float32) + # for n in range(self.n_agents)] + self.observation_space = [self.env.observation_space(agent) for agent in self.agents] + #self.share_observation_space = self.observation_space.copy() + self.share_observation_space = [Box(low=float("-inf"), high=float("inf"), shape=tmp_obs.shape, dtype=np.float32) for n in range(self.n_agents)] + + self.pre_obs = None + + def 
_encode_obs(self, raw_obs): + #obs = self.feature_encoder.encode(raw_obs.copy()) + obs = raw_obs + obs_cat = np.hstack( + [np.array(obs[k], dtype=np.float32).flatten() for k in sorted(obs)] + ) + return obs_cat + #ava = obs["avail"] + #return obs_cat, ava + + def reset(self, **kwargs): + """ Returns initial observations and states""" + obs_dicts, _ = self.env.reset() + self.pre_obs = obs_dicts + #obs = [] + #ava = [] + obs = [np.array(obs_dicts[k], dtype=np.float32) for k in sorted(obs_dicts)] + #for obs_dict in obs_dicts: + # obs_i, ava_i = self._encode_obs(obs_dict) + # obs.append(obs_i) + # ava.append(ava_i) + return obs + #state = obs.copy() + #return obs, state, ava + + def step(self, actions): + #actions_int = [int(a) for a in actions] + #o, r, d, i = self.env.step(actions_int) + actions_dict = {} + for i, agent in enumerate(self.agents): + actions_dict[agent] = int(actions[i]) + observations, rewards, terminations, truncations, infos = self.env.step(actions_dict) + #obs = [] + obs = [np.array(observations[k], dtype=np.float32) for k in sorted(observations)] + #ava = [] + #for obs_dict in observations: + # obs_i, ava_i = self._encode_obs(obs_dict) + # obs.append(obs_i) + # ava.append(ava_i) + #state = obs.copy() + + #rewards = [[self.reward_encoder.calc_reward(_r, _prev_obs, _obs)] + # for _r, _prev_obs, _obs in zip(r, self.pre_obs, o)] + rewards = [[rewards[k]] for k in sorted(rewards)] + + self.pre_obs = observations + + d = [truncations[k] for k in sorted(truncations)] + dones = np.ones((self.n_agents), dtype=bool) * d + #infos = [i for n in range(self.n_agents)] + #infos = [infos[k] for k in sorted(infos)] + info_n = [] + for i, agent in enumerate(self.agents): + info = {'individual_reward': rewards[i][0]} + info_n.append(info) + #return obs, state, rewards, dones, infos, ava + return obs, rewards, dones, info_n + + def render(self, **kwargs): + # self.env.render(**kwargs) + pass + + def close(self): + pass + + def seed(self, args): + pass + + def 
get_env_info(self): + + env_info = {"state_shape": self.observation_space[0].shape, + "obs_shape": self.observation_space[0].shape, + "n_actions": self.action_space[0].n, + "n_agents": self.n_agents, + "action_spaces": self.action_space + } + return env_info diff --git a/controllers/mat_rapid/soccer_v0.py b/controllers/mat_rapid/soccer_v0.py new file mode 100644 index 0000000..6bf6c03 --- /dev/null +++ b/controllers/mat_rapid/soccer_v0.py @@ -0,0 +1,7 @@ +from soccer.soccer import ( + env, + parallel_env, + raw_env, +) + +__all__ = ["env", "parallel_env", "raw_env"] diff --git a/controllers/mat_rapid/utils/__init__.py b/controllers/mat_rapid/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/controllers/mat_rapid/utils/shared_buffer.py b/controllers/mat_rapid/utils/shared_buffer.py new file mode 100644 index 0000000..8047980 --- /dev/null +++ b/controllers/mat_rapid/utils/shared_buffer.py @@ -0,0 +1,288 @@ +import torch +import numpy as np +import torch.nn.functional as F +from utils.util import get_shape_from_obs_space, get_shape_from_act_space + + +def _flatten(T, N, x): + return x.reshape(T * N, *x.shape[2:]) + + +def _cast(x): + return x.transpose(1, 2, 0, 3).reshape(-1, *x.shape[3:]) + + +def _shuffle_agent_grid(x, y): + rows = np.indices((x, y))[0] + # cols = np.stack([np.random.permutation(y) for _ in range(x)]) + cols = np.stack([np.arange(y) for _ in range(x)]) + return rows, cols + + +class SharedReplayBuffer(object): + """ + Buffer to store training data. + :param args: (argparse.Namespace) arguments containing relevant model, policy, and env information. + :param num_agents: (int) number of agents in the env. + :param obs_space: (gym.Space) observation space of agents. + :param cent_obs_space: (gym.Space) centralized observation space of agents. + :param act_space: (gym.Space) action space for agents. 
+ """ + + def __init__(self, args, num_agents, obs_space, cent_obs_space, act_space, env_name): + self.episode_length = args.episode_length + self.n_rollout_threads = args.n_rollout_threads + self.hidden_size = args.hidden_size + self.recurrent_N = args.recurrent_N + self.gamma = args.gamma + self.gae_lambda = args.gae_lambda + self._use_gae = args.use_gae + self._use_popart = args.use_popart + self._use_valuenorm = args.use_valuenorm + self._use_proper_time_limits = args.use_proper_time_limits + self.algo = args.algorithm_name + self.num_agents = num_agents + self.env_name = env_name + + obs_shape = get_shape_from_obs_space(obs_space) + share_obs_shape = get_shape_from_obs_space(cent_obs_space) + + if type(obs_shape[-1]) == list: + obs_shape = obs_shape[:1] + + if type(share_obs_shape[-1]) == list: + share_obs_shape = share_obs_shape[:1] + + self.share_obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, num_agents, *share_obs_shape), + dtype=np.float32) + self.obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, num_agents, *obs_shape), dtype=np.float32) + + self.rnn_states = np.zeros( + (self.episode_length + 1, self.n_rollout_threads, num_agents, self.recurrent_N, self.hidden_size), + dtype=np.float32) + self.rnn_states_critic = np.zeros_like(self.rnn_states) + + self.value_preds = np.zeros( + (self.episode_length + 1, self.n_rollout_threads, num_agents, 1), dtype=np.float32) + self.returns = np.zeros_like(self.value_preds) + self.advantages = np.zeros( + (self.episode_length, self.n_rollout_threads, num_agents, 1), dtype=np.float32) + + if act_space.__class__.__name__ == 'Discrete': + self.available_actions = np.ones((self.episode_length + 1, self.n_rollout_threads, num_agents, act_space.n), + dtype=np.float32) + else: + self.available_actions = None + + act_shape = get_shape_from_act_space(act_space) + + self.actions = np.zeros( + (self.episode_length, self.n_rollout_threads, num_agents, act_shape), dtype=np.float32) + 
self.action_log_probs = np.zeros( + (self.episode_length, self.n_rollout_threads, num_agents, act_shape), dtype=np.float32) + self.rewards = np.zeros( + (self.episode_length, self.n_rollout_threads, num_agents, 1), dtype=np.float32) + + self.masks = np.ones((self.episode_length + 1, self.n_rollout_threads, num_agents, 1), dtype=np.float32) + self.bad_masks = np.ones_like(self.masks) + self.active_masks = np.ones_like(self.masks) + + self.step = 0 + + def insert(self, share_obs, obs, rnn_states_actor, rnn_states_critic, actions, action_log_probs, + value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None): + """ + Insert data into the buffer. + :param share_obs: (argparse.Namespace) arguments containing relevant model, policy, and env information. + :param obs: (np.ndarray) local agent observations. + :param rnn_states_actor: (np.ndarray) RNN states for actor network. + :param rnn_states_critic: (np.ndarray) RNN states for critic network. + :param actions:(np.ndarray) actions taken by agents. + :param action_log_probs:(np.ndarray) log probs of actions taken by agents + :param value_preds: (np.ndarray) value function prediction at each step. + :param rewards: (np.ndarray) reward collected at each step. + :param masks: (np.ndarray) denotes whether the environment has terminated or not. + :param bad_masks: (np.ndarray) action space for agents. + :param active_masks: (np.ndarray) denotes whether an agent is active or dead in the env. + :param available_actions: (np.ndarray) actions available to each agent. If None, all actions are available. 
+ """ + self.share_obs[self.step + 1] = share_obs.copy() + self.obs[self.step + 1] = obs.copy() + self.rnn_states[self.step + 1] = rnn_states_actor.copy() + self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy() + self.actions[self.step] = actions.copy() + self.action_log_probs[self.step] = action_log_probs.copy() + self.value_preds[self.step] = value_preds.copy() + self.rewards[self.step] = rewards.copy() + self.masks[self.step + 1] = masks.copy() + if bad_masks is not None: + self.bad_masks[self.step + 1] = bad_masks.copy() + if active_masks is not None: + self.active_masks[self.step + 1] = active_masks.copy() + if available_actions is not None: + self.available_actions[self.step + 1] = available_actions.copy() + + self.step = (self.step + 1) % self.episode_length + + def chooseinsert(self, share_obs, obs, rnn_states, rnn_states_critic, actions, action_log_probs, + value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None): + """ + Insert data into the buffer. This insert function is used specifically for Hanabi, which is turn based. + :param share_obs: (argparse.Namespace) arguments containing relevant model, policy, and env information. + :param obs: (np.ndarray) local agent observations. + :param rnn_states_actor: (np.ndarray) RNN states for actor network. + :param rnn_states_critic: (np.ndarray) RNN states for critic network. + :param actions:(np.ndarray) actions taken by agents. + :param action_log_probs:(np.ndarray) log probs of actions taken by agents + :param value_preds: (np.ndarray) value function prediction at each step. + :param rewards: (np.ndarray) reward collected at each step. + :param masks: (np.ndarray) denotes whether the environment has terminated or not. + :param bad_masks: (np.ndarray) denotes indicate whether whether true terminal state or due to episode limit + :param active_masks: (np.ndarray) denotes whether an agent is active or dead in the env. 
+ :param available_actions: (np.ndarray) actions available to each agent. If None, all actions are available. + """ + self.share_obs[self.step] = share_obs.copy() + self.obs[self.step] = obs.copy() + self.rnn_states[self.step + 1] = rnn_states.copy() + self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy() + self.actions[self.step] = actions.copy() + self.action_log_probs[self.step] = action_log_probs.copy() + self.value_preds[self.step] = value_preds.copy() + self.rewards[self.step] = rewards.copy() + self.masks[self.step + 1] = masks.copy() + if bad_masks is not None: + self.bad_masks[self.step + 1] = bad_masks.copy() + if active_masks is not None: + self.active_masks[self.step] = active_masks.copy() + if available_actions is not None: + self.available_actions[self.step] = available_actions.copy() + + self.step = (self.step + 1) % self.episode_length + + def after_update(self): + """Copy last timestep data to first index. Called after update to model.""" + self.share_obs[0] = self.share_obs[-1].copy() + self.obs[0] = self.obs[-1].copy() + self.rnn_states[0] = self.rnn_states[-1].copy() + self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy() + self.masks[0] = self.masks[-1].copy() + self.bad_masks[0] = self.bad_masks[-1].copy() + self.active_masks[0] = self.active_masks[-1].copy() + if self.available_actions is not None: + self.available_actions[0] = self.available_actions[-1].copy() + + def chooseafter_update(self): + """Copy last timestep data to first index. This method is used for Hanabi.""" + self.rnn_states[0] = self.rnn_states[-1].copy() + self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy() + self.masks[0] = self.masks[-1].copy() + self.bad_masks[0] = self.bad_masks[-1].copy() + + def compute_returns(self, next_value, value_normalizer=None): + """ + Compute returns either as discounted sum of rewards, or using GAE. + :param next_value: (np.ndarray) value predictions for the step after the last episode step. 
+ :param value_normalizer: (PopArt) If not None, PopArt value normalizer instance. + """ + self.value_preds[-1] = next_value + gae = 0 + for step in reversed(range(self.rewards.shape[0])): + if self._use_popart or self._use_valuenorm: + delta = self.rewards[step] + self.gamma * value_normalizer.denormalize( + self.value_preds[step + 1]) * self.masks[step + 1] \ + - value_normalizer.denormalize(self.value_preds[step]) + gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae + + # here is a patch for mpe, whose last step is timeout instead of terminate + if self.env_name == "MPE" and step == self.rewards.shape[0] - 1: + gae = 0 + + self.advantages[step] = gae + self.returns[step] = gae + value_normalizer.denormalize(self.value_preds[step]) + else: + delta = self.rewards[step] + self.gamma * self.value_preds[step + 1] * \ + self.masks[step + 1] - self.value_preds[step] + gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae + + # here is a patch for mpe, whose last step is timeout instead of terminate + if self.env_name == "MPE" and step == self.rewards.shape[0] - 1: + gae = 0 + + self.advantages[step] = gae + self.returns[step] = gae + self.value_preds[step] + + def feed_forward_generator_transformer(self, advantages, num_mini_batch=None, mini_batch_size=None): + """ + Yield training data for MLP policies. + :param advantages: (np.ndarray) advantage estimates. + :param num_mini_batch: (int) number of minibatches to split the batch into. + :param mini_batch_size: (int) number of samples in each minibatch. + """ + episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3] + batch_size = n_rollout_threads * episode_length + + if mini_batch_size is None: + assert batch_size >= num_mini_batch, ( + "PPO requires the number of processes ({}) " + "* number of steps ({}) = {} " + "to be greater than or equal to the number of PPO mini batches ({})." 
+ "".format(n_rollout_threads, episode_length, + n_rollout_threads * episode_length, + num_mini_batch)) + mini_batch_size = batch_size // num_mini_batch + + rand = torch.randperm(batch_size).numpy() + sampler = [rand[i * mini_batch_size:(i + 1) * mini_batch_size] for i in range(num_mini_batch)] + rows, cols = _shuffle_agent_grid(batch_size, num_agents) + + # keep (num_agent, dim) + share_obs = self.share_obs[:-1].reshape(-1, *self.share_obs.shape[2:]) + share_obs = share_obs[rows, cols] + obs = self.obs[:-1].reshape(-1, *self.obs.shape[2:]) + obs = obs[rows, cols] + rnn_states = self.rnn_states[:-1].reshape(-1, *self.rnn_states.shape[2:]) + rnn_states = rnn_states[rows, cols] + rnn_states_critic = self.rnn_states_critic[:-1].reshape(-1, *self.rnn_states_critic.shape[2:]) + rnn_states_critic = rnn_states_critic[rows, cols] + actions = self.actions.reshape(-1, *self.actions.shape[2:]) + actions = actions[rows, cols] + if self.available_actions is not None: + available_actions = self.available_actions[:-1].reshape(-1, *self.available_actions.shape[2:]) + available_actions = available_actions[rows, cols] + value_preds = self.value_preds[:-1].reshape(-1, *self.value_preds.shape[2:]) + value_preds = value_preds[rows, cols] + returns = self.returns[:-1].reshape(-1, *self.returns.shape[2:]) + returns = returns[rows, cols] + masks = self.masks[:-1].reshape(-1, *self.masks.shape[2:]) + masks = masks[rows, cols] + active_masks = self.active_masks[:-1].reshape(-1, *self.active_masks.shape[2:]) + active_masks = active_masks[rows, cols] + action_log_probs = self.action_log_probs.reshape(-1, *self.action_log_probs.shape[2:]) + action_log_probs = action_log_probs[rows, cols] + advantages = advantages.reshape(-1, *advantages.shape[2:]) + advantages = advantages[rows, cols] + + for indices in sampler: + # [L,T,N,Dim]-->[L*T,N,Dim]-->[index,N,Dim]-->[index*N, Dim] + share_obs_batch = share_obs[indices].reshape(-1, *share_obs.shape[2:]) + obs_batch = obs[indices].reshape(-1, 
*obs.shape[2:]) + rnn_states_batch = rnn_states[indices].reshape(-1, *rnn_states.shape[2:]) + rnn_states_critic_batch = rnn_states_critic[indices].reshape(-1, *rnn_states_critic.shape[2:]) + actions_batch = actions[indices].reshape(-1, *actions.shape[2:]) + if self.available_actions is not None: + available_actions_batch = available_actions[indices].reshape(-1, *available_actions.shape[2:]) + else: + available_actions_batch = None + value_preds_batch = value_preds[indices].reshape(-1, *value_preds.shape[2:]) + return_batch = returns[indices].reshape(-1, *returns.shape[2:]) + masks_batch = masks[indices].reshape(-1, *masks.shape[2:]) + active_masks_batch = active_masks[indices].reshape(-1, *active_masks.shape[2:]) + old_action_log_probs_batch = action_log_probs[indices].reshape(-1, *action_log_probs.shape[2:]) + if advantages is None: + adv_targ = None + else: + adv_targ = advantages[indices].reshape(-1, *advantages.shape[2:]) + + yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, \ + value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, \ + adv_targ, available_actions_batch diff --git a/controllers/mat_rapid/utils/util.py b/controllers/mat_rapid/utils/util.py new file mode 100644 index 0000000..8b61453 --- /dev/null +++ b/controllers/mat_rapid/utils/util.py @@ -0,0 +1,72 @@ +import numpy as np +import math +import torch + +def check(input): + if type(input) == np.ndarray: + return torch.from_numpy(input) + +def get_gard_norm(it): + sum_grad = 0 + for x in it: + if x.grad is None: + continue + sum_grad += x.grad.norm() ** 2 + return math.sqrt(sum_grad) + +def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): + """Decreases the learning rate linearly""" + lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + +def huber_loss(e, d): + a = (abs(e) <= d).float() + b = (e > d).float() + 
return a*e**2/2 + b*d*(abs(e)-d/2) + +def mse_loss(e): + return e**2/2 + +def get_shape_from_obs_space(obs_space): + if obs_space.__class__.__name__ == 'Box': + obs_shape = obs_space.shape + elif obs_space.__class__.__name__ == 'list': + obs_shape = obs_space + else: + raise NotImplementedError + return obs_shape + +def get_shape_from_act_space(act_space): + if act_space.__class__.__name__ == 'Discrete': + act_shape = 1 + elif act_space.__class__.__name__ == "MultiDiscrete": + act_shape = act_space.shape + elif act_space.__class__.__name__ == "Box": + act_shape = act_space.shape[0] + elif act_space.__class__.__name__ == "MultiBinary": + act_shape = act_space.shape[0] + else: # agar + act_shape = act_space[0].shape[0] + 1 + return act_shape + + +def tile_images(img_nhwc): + """ + Tile N images into one big PxQ image + (P,Q) are chosen to be as close as possible, and if N + is square, then P=Q. + input: img_nhwc, list or array of images, ndim=4 once turned into array + n = batch index, h = height, w = width, c = channel + returns: + bigim_HWc, ndarray with ndim=3 + """ + img_nhwc = np.asarray(img_nhwc) + N, h, w, c = img_nhwc.shape + H = int(np.ceil(np.sqrt(N))) + W = int(np.ceil(float(N)/H)) + img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) + img_HWhwc = img_nhwc.reshape(H, W, h, w, c) + img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) + img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) + return img_Hh_Ww_c \ No newline at end of file diff --git a/controllers/mat_rapid/utils/valuenorm.py b/controllers/mat_rapid/utils/valuenorm.py new file mode 100644 index 0000000..6d9ce36 --- /dev/null +++ b/controllers/mat_rapid/utils/valuenorm.py @@ -0,0 +1,79 @@ + +import numpy as np + +import torch +import torch.nn as nn + + +class ValueNorm(nn.Module): + """ Normalize a vector of observations - across the first norm_axes dimensions""" + + def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, 
device=torch.device("cpu")): + super(ValueNorm, self).__init__() + + self.input_shape = input_shape + self.norm_axes = norm_axes + self.epsilon = epsilon + self.beta = beta + self.per_element_update = per_element_update + self.tpdv = dict(dtype=torch.float32, device=device) + + self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) + self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) + self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv) + + self.reset_parameters() + + def reset_parameters(self): + self.running_mean.zero_() + self.running_mean_sq.zero_() + self.debiasing_term.zero_() + + def running_mean_var(self): + debiased_mean = self.running_mean / self.debiasing_term.clamp(min=self.epsilon) + debiased_mean_sq = self.running_mean_sq / self.debiasing_term.clamp(min=self.epsilon) + debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2) + return debiased_mean, debiased_var + + @torch.no_grad() + def update(self, input_vector): + if type(input_vector) == np.ndarray: + input_vector = torch.from_numpy(input_vector) + input_vector = input_vector.to(**self.tpdv) + + batch_mean = input_vector.mean(dim=tuple(range(self.norm_axes))) + batch_sq_mean = (input_vector ** 2).mean(dim=tuple(range(self.norm_axes))) + + if self.per_element_update: + batch_size = np.prod(input_vector.size()[:self.norm_axes]) + weight = self.beta ** batch_size + else: + weight = self.beta + + self.running_mean.mul_(weight).add_(batch_mean * (1.0 - weight)) + self.running_mean_sq.mul_(weight).add_(batch_sq_mean * (1.0 - weight)) + self.debiasing_term.mul_(weight).add_(1.0 * (1.0 - weight)) + + def normalize(self, input_vector): + # Make sure input is float32 + if type(input_vector) == np.ndarray: + input_vector = torch.from_numpy(input_vector) + input_vector = input_vector.to(**self.tpdv) + + mean, var = self.running_mean_var() + out = (input_vector - 
mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes] + + return out + + def denormalize(self, input_vector): + """ Transform normalized data back into original distribution """ + if type(input_vector) == np.ndarray: + input_vector = torch.from_numpy(input_vector) + input_vector = input_vector.to(**self.tpdv) + + mean, var = self.running_mean_var() + out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes] + + out = out.cpu().numpy() + + return out diff --git a/protos/GankenKun_box.proto b/protos/GankenKun_box.proto new file mode 100755 index 0000000..75c727c --- /dev/null +++ b/protos/GankenKun_box.proto @@ -0,0 +1,108 @@ +#VRML_SIM R2021a utf8 +# license: Apache License 2.0 +# license url: http://www.apache.org/licenses/LICENSE-2.0 +# This is a proto file for Webots for the GankenKun +# Extracted from: gankenkun_simple.urdf + +PROTO GankenKun_box [ + field SFVec3f translation 0 0 0 + field SFRotation rotation 0 1 0 0 + field SFString name "GankenKun" # Is `Robot.name`. + field SFString controller "void" # Is `Robot.controller`. + field MFString controllerArgs [] # Is `Robot.controllerArgs`. + field SFString customData "" # Is `Robot.customData`. + field SFBool supervisor FALSE # Is `Robot.supervisor`. + field SFBool synchronization TRUE # Is `Robot.synchronization`. + field SFBool selfCollision TRUE # Is `Robot.selfCollision`. +# field SFInt32 cameraWidth 640 # Is `Camera.width`. +# field SFInt32 cameraHeight 480 # Is `Camera.height`. + field MFString jerseyTexture "textures/GankenKun_red1.png" + field SFColor jerseyColor 1 0 0 +# field SFBool enable_backlash FALSE # Enables backlash on all joints. 
+ field SFInt32 channel 1 # communication channel +] +{ + Robot { + translation IS translation + rotation IS rotation + children [ + Transform { + translation 0.000 0.000 0.000 + rotation 0.0 0.0 1 1.57 + children [ + Shape { + appearance DEF teamcolor PBRAppearance { + baseColor IS jerseyColor + transparency 0.000000 + roughness 1.000000 + metalness 0 + emissiveColor 0.000000 0.000000 0.000000 + } + geometry DEF body Box { + size 0.200 0.200 0.600 + } + } + ] + } + Transform { + translation -0.005 0.000 0.000 + rotation -0.577350 -0.577350 0.577350 2.094395 + children [ + Shape { + appearance PBRAppearance { + baseColorMap ImageTexture { + url IS jerseyTexture + } + metalness 0 + roughness 1 + } + geometry DEF body_number IndexedFaceSet { + coord Coordinate { + point [ + -0.03300 -0.08825 -0.03400, 0.03300 -0.08825 -0.03400, 0.03300 0.01500 -0.03400, -0.03300 0.01500 -0.03400, 0.03300 -0.07500 0.09700, -0.03300 -0.07500 0.09700, -0.03300 0.01500 0.09700, 0.03300 0.01500 0.09700, + ] + } + texCoord TextureCoordinate { + point [ + 0.0 0.0, 1.0 0.0, 1.0 1.0, 0.0 1.0 + ] + } + coordIndex [ + 3 2 1 0 -1 7 6 5 4 -1 + ] + texCoordIndex [ + 0 1 2 3 -1 0 1 2 3 -1 + ] + creaseAngle 1 + } + } + ] + } + ] + name IS name + boundingObject Transform { + translation 0.000 0.000 0.000 + rotation 0.0 0.0 1 1.57 + children [ + Box { + size 0.200 0.200 0.600 + } + ] + } + physics Physics { + density -1 + mass 1.289633 + centerOfMass [ 0.000 0.000000 0.300 ] + inertiaMatrix [ + 7.000000e-03 7.000000e-03 7.000000e-03 + 0.000000e+00 0.000000e+00 0.000000e+00 + ] + } + controller IS controller + controllerArgs IS controllerArgs + customData IS customData + supervisor IS supervisor + synchronization IS synchronization + selfCollision IS selfCollision + } +} diff --git a/worlds/mat_rapid.wbt b/worlds/mat_rapid.wbt new file mode 100644 index 0000000..26addc6 --- /dev/null +++ b/worlds/mat_rapid.wbt @@ -0,0 +1,68 @@ +#VRML_SIM R2021b utf8 +WorldInfo { + info [ + "GANKENKUN robot." 
+ "The GANKENKUN robot simulation model" + ] + title "GANKENKUN" + basicTimeStep 80 + optimalThreadCount 8 + physicsDisableTime 0.1 + physicsDisableLinearThreshold 0.1 + physicsDisableAngularThreshold 0.1 + contactProperties [ + ContactProperties { + material1 "grass" + coulombFriction [ + 0.5 + ] + softCFM 0.03 + } + ContactProperties { + material1 "grass" + material2 "robocup soccer ball" + coulombFriction [ + 0.5 + ] + bounce 0.76 + softCFM 0.05 + } + ContactProperties { + material2 "robocup soccer ball" + bounce 0.76 + } + ] +} +Viewpoint { + orientation 0.6719791180295076 -0.49317350564460494 -0.552470775935249 1.8510428047885563 + position -2.5375047220358824 -0.9166711516828123 0.6010308394649483 +} +TexturedBackground { + texture "stadium_dry" +} +TexturedBackgroundLight { + texture "stadium_dry" +} + +RobocupSoccerField2 { + size "kid" +} + +Robot { + supervisor TRUE + controller "" + children[ + Emitter { name "blue1_emitter" channel 1 } + Receiver { name "blue1_receiver" channel 1 } + Emitter { name "blue2_emitter" channel 2 } + Receiver { name "blue2_receiver" channel 2 } + Emitter { name "blue3_emitter" channel 3 } + Receiver { name "blue3_receiver" channel 3 } + Emitter { name "red1_emitter" channel 4 } + Receiver { name "red1_receiver" channel 4 } + Emitter { name "red2_emitter" channel 5 } + Receiver { name "red2_receiver" channel 5 } + Emitter { name "red3_emitter" channel 6 } + Receiver { name "red3_receiver" channel 6 } + ] +}