diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini
index 0990fdb13..3b7984f75 100644
--- a/pufferlib/config/default.ini
+++ b/pufferlib/config/default.ini
@@ -29,18 +29,18 @@ optimizer = muon
 anneal_lr = True
 precision = float32
 total_timesteps = 10_000_000
-learning_rate = 0.015
-gamma = 0.995
+learning_rate = 0.003
+gamma = 0.9650881439471051
 gae_lambda = 0.90
 update_epochs = 1
-clip_coef = 0.2
+clip_coef = 0.24983563859666713
 vf_coef = 2.0
 vf_clip_coef = 0.2
 max_grad_norm = 1.5
-ent_coef = 0.001
-adam_beta1 = 0.95
-adam_beta2 = 0.999
-adam_eps = 1e-12
+ent_coef = 0.013899446116244659
+adam_beta1 = 0.9739575783018397
+adam_beta2 = 0.9983789918385146
+adam_eps = 2.112852785895044e-07
 data_dir = experiments
 checkpoint_interval = 200
@@ -74,23 +74,23 @@ downsample = 10
 #scale = auto
 
 # TODO: Elim from base
-[sweep.train.total_timesteps]
-distribution = log_normal
-min = 5e7
-max = 1e10
-mean = 1e8
-scale = time
-
-[sweep.train.bptt_horizon]
-distribution = uniform_pow2
-min = 16
-max = 64
-mean = 64
-scale = auto
+#[sweep.train.total_timesteps]
+#distribution = log_normal
+#min = 5e7
+#max = 1e10
+#mean = 1e8
+#scale = time
+
+#[sweep.train.bptt_horizon]
+#distribution = uniform_pow2
+#min = 16
+#max = 64
+#mean = 64
+#scale = auto
 
 [sweep.train.minibatch_size]
 distribution = uniform_pow2
-min = 8192
+min = 16384
 max = 65536
 mean = 32768
 scale = auto
@@ -98,28 +98,28 @@ scale = auto
 [sweep.train.learning_rate]
 distribution = log_normal
 min = 0.00001
-mean = 0.01
-max = 0.1
+mean = 0.00986346440817611
+max = 0.05
 scale = 0.5
 
 [sweep.train.ent_coef]
 distribution = log_normal
 min = 0.00001
-mean = 0.01
+mean = 0.01510492391719301
 max = 0.2
 scale = auto
 
 [sweep.train.gamma]
 distribution = logit_normal
 min = 0.8
-mean = 0.98
+mean = 0.9785788872888401
 max = 0.9999
 scale = auto
 
 [sweep.train.gae_lambda]
 distribution = logit_normal
 min = 0.6
-mean = 0.95
+mean = 0.9909980359361995
 max = 0.995
 scale = auto
 
@@ -127,14 +127,14 @@ scale = auto
 distribution = uniform
 min = 0.0
 max = 5.0
-mean = 1.0
+mean = 4.561253323222173
 scale = auto
 
 [sweep.train.vtrace_c_clip]
 distribution = uniform
 min = 0.0
 max = 5.0
-mean = 1.0
+mean = 1.0511015002528337
 scale = auto
 
 #[sweep.train.update_epochs]
@@ -148,7 +148,7 @@ scale = auto
 distribution = uniform
 min = 0.01
 max = 1.0
-mean = 0.2
+mean = 0.38615719521473363
 scale = auto
 
 # Optimal vf clip can be lower than 0.1,
@@ -157,54 +157,54 @@ scale = auto
 distribution = uniform
 min = 0.1
 max = 5.0
-mean = 0.2
+mean = 0.101
 scale = auto
 
 [sweep.train.vf_coef]
 distribution = uniform
 min = 0.0
 max = 5.0
-mean = 2.0
+mean = 4.999
 scale = auto
 
 [sweep.train.max_grad_norm]
 distribution = uniform
 min = 0.0
-mean = 1.0
+mean = 2.3757898890403624
 max = 5.0
 scale = auto
 
 [sweep.train.adam_beta1]
 distribution = logit_normal
 min = 0.5
-mean = 0.9
+mean = 0.9227290229121611
 max = 0.999
 scale = auto
 
 [sweep.train.adam_beta2]
 distribution = logit_normal
 min = 0.9
-mean = 0.999
+mean = 0.9615271430090461
 max = 0.99999
 scale = auto
 
 [sweep.train.adam_eps]
 distribution = log_normal
 min = 1e-14
-mean = 1e-8
+mean = 5.941675291037148e-05
 max = 1e-4
 scale = auto
 
 [sweep.train.prio_alpha]
 distribution = logit_normal
 min = 0.1
-mean = 0.85
-max = 0.99
+mean = 0.9698884328392982
+max = 0.999
 scale = auto
 
 [sweep.train.prio_beta0]
 distribution = logit_normal
 min = 0.1
-mean = 0.85
-max = 0.99
+mean = 0.9876650505878536
+max = 0.999
 scale = auto
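Note on the sweep specs above: each `[sweep.train.*]` section declares a prior (distribution, min, max, mean, scale) that the Protein sweeper samples from. Protein's actual sampler is not shown in this patch; the standalone sketch below only illustrates one plausible reading of a `log_normal` spec (a Gaussian in log10-space around the mean, clipped to [min, max]) — the function name and the interpretation of `scale` are assumptions for illustration:

    import math, random

    def sample_log_normal(min_val, max_val, mean, scale):
        # ASSUMPTION: scale is the stddev of log10(x); not Protein's real code.
        x = 10 ** random.gauss(math.log10(mean), scale)
        return min(max(x, min_val), max_val)

    # e.g. the learning_rate spec: min=1e-5, mean~0.00986, max=0.05, scale=0.5
    lr = sample_log_normal(1e-5, 0.05, 0.00986346440817611, 0.5)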
diff --git a/pufferlib/config/ocean/drone_delivery.ini b/pufferlib/config/ocean/drone_delivery.ini
new file mode 100644
index 000000000..9ffa3006a
--- /dev/null
+++ b/pufferlib/config/ocean/drone_delivery.ini
@@ -0,0 +1,196 @@
+[base]
+package = ocean
+env_name = puffer_drone_delivery
+policy_name = Policy
+rnn_name = Recurrent
+
+[policy]
+hidden_size = 256
+
+[rnn]
+input_size = 256
+hidden_size = 256
+
+[vec]
+num_envs = 16
+
+[env]
+num_envs = 16
+num_drones = 64
+
+ablation = 0.0
+anneal_min = 0.9236471304397496
+
+grip_k_max = 5.123485320457474
+
+perfect_anneal = 275_000_000
+perfect_deadline = 6.077482208019486e+07
+
+pos_const = 0.8113024567778199
+pos_penalty = 0.02588602599777176
+
+reward_grip = 0.9760442313179252
+reward_ho_drop = 0.5
+reward_hover = 0.0
+
+reward_max_dist = 65.0
+reward_min_dist = 2.531838566923265
+
+vel_penalty_clamp = 0.056085060041244056
+
+w_approach = 1.5767291682972426
+w_position = 1.0188128215656589
+w_stability = 0.012739532721609687
+w_velocity = 0.11573303880718092
+
+[train]
+adam_beta1 = 0.9227290229121611
+adam_beta2 = 0.9615271430090461
+adam_eps = 5.941675291037148e-05
+anneal_lr = true
+batch_size = auto
+bptt_horizon = 64
+checkpoint_interval = 200
+clip_coef = 0.38615719521473363
+ent_coef = 0.01510492391719301
+gae_lambda = 0.9909980359361995
+gamma = 0.9785788872888401
+learning_rate = 0.00986346440817611
+#learning_rate = 0.005
+max_grad_norm = 2.3757898890403624
+max_minibatch_size = 65536
+minibatch_size = 32768
+prio_alpha = 0.9698884328392982
+prio_beta0 = 0.9876650505878536
+total_timesteps = 300_000_000
+update_epochs = 1
+#use_rnn = false
+vf_clip_coef = 0.101
+vf_coef = 4.999
+vtrace_c_clip = 1.0511015002528337
+vtrace_rho_clip = 4.561253323222173
+
+[sweep]
+method = Protein
+metric = perfect_deliv
+goal = maximize
+downsample = 0
+
+[sweep.env.ablation]
+distribution = uniform
+min = 0.0
+max = 1.0
+mean = 0.0
+scale = auto
+
+[sweep.env.anneal_min]
+distribution = uniform
+min = 0.0
+max = 1.0
+mean = 0.9236471304397496
+scale = auto
+
+[sweep.env.grip_k_max]
+distribution = uniform
+min = 5.0
+max = 20.0
+mean = 5.123485320457474
+scale = auto
+
+[sweep.env.perfect_anneal]
+distribution = uniform
+min = 200_000_000
+max = 275_000_000
+mean = 275_000_000
+scale = auto
+
+#[sweep.env.perfect_anneal_start]
+#distribution = uniform
+#min = 150_000_000
+#max = 200_000_000
+#mean = 175_000_000
+#scale = auto
+
+[sweep.env.perfect_deadline]
+distribution = uniform
+min = 50_000_000
+max = 150_000_000
+mean = 6.077482208019486e+07
+scale = auto
+
+[sweep.env.pos_const]
+distribution = uniform
+min = 0.001
+max = 1.0
+mean = 0.8113024567778199
+scale = auto
+
+[sweep.env.pos_penalty]
+distribution = uniform
+min = 0.0
+max = 0.25
+mean = 0.02588602599777176
+scale = auto
+
+[sweep.env.reward_grip]
+distribution = uniform
+min = 0.0
+max = 1.0
+mean = 0.9760442313179252
+scale = auto
+
+[sweep.env.reward_ho_drop]
+distribution = uniform
+min = 0.1
+max = 0.5
+mean = 0.5
+scale = auto
+
+[sweep.env.reward_hover]
+distribution = uniform
+min = 0.0
+max = 0.5
+mean = 0.01
+scale = auto
+
+[sweep.env.reward_min_dist]
+distribution = uniform
+min = 0.1
+max = 5.0
+mean = 2.531838566923265
+scale = auto
+
+[sweep.env.vel_penalty_clamp]
+distribution = uniform
+min = 0.0
+max = 0.5
+mean = 0.056085060041244056
+scale = auto
+
+[sweep.env.w_approach]
+distribution = uniform
+min = 0.0
+max = 2.5
+mean = 1.5767291682972426
+scale = auto
+
+[sweep.env.w_position]
+distribution = uniform
+min = 0.0
+max = 1.5
+mean = 1.0188128215656589
+scale = auto
+
+[sweep.env.w_stability]
+distribution = uniform
+min = 0.0
+max = 2.5
+mean = 0.012739532721609687
+scale = auto
+
+[sweep.env.w_velocity]
+distribution = uniform
+min = 0.0
+max = 1.5
+mean = 0.11573303880718092
+scale = auto
diff --git a/pufferlib/config/ocean/drone_swarm.ini b/pufferlib/config/ocean/drone_swarm.ini
index 59915325a..fd1ef6dcc 100644
--- a/pufferlib/config/ocean/drone_swarm.ini
+++ b/pufferlib/config/ocean/drone_swarm.ini
@@ -49,9 +49,9 @@ vtrace_rho_clip = 3.8183814893708057
 [sweep]
 downsample = 0
 
-[sweep.train.total_timesteps]
-distribution = log_normal
-min = 2e8
-max = 4e8
-mean = 2e8
-scale = time
+#[sweep.train.total_timesteps]
+#distribution = log_normal
+#min = 2e8
+#max = 4e8
+#mean = 2e8
+#scale = time
diff --git a/pufferlib/models.py b/pufferlib/models.py
index fa43d7071..7ec62d037 100644
--- a/pufferlib/models.py
+++ b/pufferlib/models.py
@@ -43,7 +43,7 @@ def __init__(self, env, hidden_size=128):
             pufferlib.pytorch.layer_init(nn.Linear(num_obs, hidden_size)),
             nn.GELU(),
         )
-
+
         if self.is_multidiscrete:
             self.action_nvec = tuple(env.single_action_space.nvec)
             num_atns = sum(self.action_nvec)
@@ -56,8 +56,9 @@ def __init__(self, env, hidden_size=128):
         else:
             self.decoder_mean = pufferlib.pytorch.layer_init(
                 nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01)
-            self.decoder_logstd = nn.Parameter(torch.zeros(
-                1, env.single_action_space.shape[0]))
+            # Initialize logstd to -0.5 (std ~0.6) for more conservative exploration
+            self.decoder_logstd = nn.Parameter(torch.ones(
+                1, env.single_action_space.shape[0]) * -0.5)
 
         self.value = pufferlib.pytorch.layer_init(
             nn.Linear(hidden_size, 1), std=1)
@@ -77,24 +78,90 @@ def encode_observations(self, observations, state=None):
         if self.is_dict_obs:
             observations = pufferlib.pytorch.nativize_tensor(observations, self.dtype)
             observations = torch.cat([v.view(batch_size, -1) for v in observations.values()], dim=1)
-        else:
+        else:
             observations = observations.view(batch_size, -1)
+
+        # Check for NaN in observations and replace with zeros
+        if torch.isnan(observations).any():
+            #print(f"Warning: NaN detected in observations, replacing with zeros")
+            observations = torch.nan_to_num(observations, nan=0.0)
+
+        # Clamp observations to prevent extreme values
+        observations = torch.clamp(observations, min=-100.0, max=100.0)
+
         return self.encoder(observations.float())
 
+    def check_and_fix_weights(self):
+        '''Check for NaN in model weights and reinitialize if needed'''
+        nan_found = False
+        for name, param in self.named_parameters():
+            if torch.isnan(param).any():
+                #print(f"Warning: NaN found in weight {name}, reinitializing")
+                nan_found = True
+                # Reinitialize the parameter
+                if 'decoder_logstd' in name:
+                    param.data.fill_(-0.5)
+                elif 'bias' in name:
+                    param.data.zero_()
+                else:
+                    # Use Xavier initialization for weights
+                    if param.dim() >= 2:
+                        torch.nn.init.xavier_uniform_(param)
+                    else:
+                        param.data.normal_(0, 0.01)
+        return nan_found
+
     def decode_actions(self, hidden):
         '''Decodes a batch of hidden states into (multi)discrete actions.
         Assumes no time dimension (handled by LSTM wrappers).'''
+        # Check for NaN in hidden states first
+        if torch.isnan(hidden).any():
+            #print(f"Warning: NaN in hidden states before decode, replacing with zeros")
+            hidden = torch.nan_to_num(hidden, nan=0.0)
+            # Also check weights when NaN is detected
+            self.check_and_fix_weights()
+
         if self.is_multidiscrete:
             logits = self.decoder(hidden).split(self.action_nvec, dim=1)
         elif self.is_continuous:
             mean = self.decoder_mean(hidden)
+
+            # More aggressive clamping
+            mean = torch.clamp(mean, min=-5.0, max=5.0)
+
+            # Check for NaN before continuing
+            if torch.isnan(mean).any():
+                #print(f"Warning: NaN in mean after decoder, checking weights")
+                self.check_and_fix_weights()
+                # Retry with fixed weights
+                mean = self.decoder_mean(hidden)
+                mean = torch.clamp(mean, min=-5.0, max=5.0)
+
+            # Clamp logstd more conservatively
             logstd = self.decoder_logstd.expand_as(mean)
+            logstd = torch.clamp(logstd, min=-5.0, max=0.0)  # Max std = 1.0
             std = torch.exp(logstd)
+
+            # Add small epsilon to std to ensure numerical stability
+            std = std + 1e-6
+
+            # Final NaN check and replace with safe values
+            if torch.isnan(mean).any() or torch.isnan(std).any():
+                #print(f"Warning: NaN detected in policy - mean: {torch.isnan(mean).any()}, std: {torch.isnan(std).any()}")
+                mean = torch.nan_to_num(mean, nan=0.0)
+                std = torch.nan_to_num(std, nan=0.5)
+
             logits = torch.distributions.Normal(mean, std)
         else:
             logits = self.decoder(hidden)
 
         values = self.value(hidden)
+
+        # Check values for NaN
+        if torch.isnan(values).any():
+            #print(f"Warning: NaN in value output, replacing with zeros")
+            values = torch.nan_to_num(values, nan=0.0)
+
         return logits, values
 
 class LSTMWrapper(nn.Module):
@@ -186,7 +253,16 @@ def forward(self, observations, state):
         #hidden = self.pre_layernorm(hidden)
         hidden, (lstm_h, lstm_c) = self.lstm.forward(hidden, lstm_state)
         hidden = hidden.float()
-
+
+        # Check for NaN after LSTM and replace with zeros
+        if torch.isnan(hidden).any():
+            #print(f"Warning: NaN detected after LSTM, replacing with zeros")
+            hidden = torch.nan_to_num(hidden, nan=0.0)
+        if torch.isnan(lstm_h).any() or torch.isnan(lstm_c).any():
+            #print(f"Warning: NaN detected in LSTM state, resetting")
+            lstm_h = torch.nan_to_num(lstm_h, nan=0.0)
+            lstm_c = torch.nan_to_num(lstm_c, nan=0.0)
+
         #hidden = self.post_layernorm(hidden)
         hidden = hidden.transpose(0, 1)
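Note on the NaN guards added to models.py above: they all reduce to the same replace-then-clamp pattern via torch.nan_to_num, and the new logstd init gives an initial action std of exp(-0.5) ~ 0.607. A minimal standalone check of that pattern (illustrative tensors, not pufferlib code):

    import torch

    def sanitize(x, nan=0.0, lo=-100.0, hi=100.0):
        # Replace NaNs, then clamp to a safe range, mirroring encode_observations.
        return torch.clamp(torch.nan_to_num(x, nan=nan), min=lo, max=hi)

    x = torch.tensor([float('nan'), 1e6, -3.0])
    print(sanitize(x))                         # tensor([  0., 100.,  -3.])
    print(torch.exp(torch.tensor(-0.5)))       # ~0.6065, the initial action std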
diff --git a/pufferlib/ocean/drone_delivery/binding.c b/pufferlib/ocean/drone_delivery/binding.c
new file mode 100644
index 000000000..2c5083d9e
--- /dev/null
+++ b/pufferlib/ocean/drone_delivery/binding.c
@@ -0,0 +1,66 @@
+#include "drone_delivery.h"
+
+#define Env DroneDelivery
+#include "../env_binding.h"
+
+static int my_init(Env *env, PyObject *args, PyObject *kwargs) {
+    env->num_agents = unpack(kwargs, "num_agents");
+
+    env->grip_k_max = unpack(kwargs, "grip_k_max");
+
+    env->ablation = unpack(kwargs, "ablation");
+    env->anneal_min = unpack(kwargs, "anneal_min");
+
+    env->num_envs = unpack(kwargs, "num_envs");
+    env->perfect_anneal = unpack(kwargs, "perfect_anneal");
+    env->perfect_deadline = unpack(kwargs, "perfect_deadline");
+
+    env->pos_const = unpack(kwargs, "pos_const");
+    env->pos_penalty = unpack(kwargs, "pos_penalty");
+
+    env->reward_grip = unpack(kwargs, "reward_grip");
+    env->reward_ho_drop = unpack(kwargs, "reward_ho_drop");
+    env->reward_hover = unpack(kwargs, "reward_hover");
+
+    env->reward_max_dist = unpack(kwargs, "reward_max_dist");
+    env->reward_min_dist = unpack(kwargs, "reward_min_dist");
+
+    env->vel_penalty_clamp = unpack(kwargs, "vel_penalty_clamp");
+
+    env->w_approach = unpack(kwargs, "w_approach");
+    env->w_position = unpack(kwargs, "w_position");
+    env->w_stability = unpack(kwargs, "w_stability");
+    env->w_velocity = unpack(kwargs, "w_velocity");
+
+    init(env);
+    return 0;
+}
+
+static int my_log(PyObject *dict, Log *log) {
+    assign_to_dict(dict, "perf", log->perf);
+    assign_to_dict(dict, "score", log->score);
+    //assign_to_dict(dict, "collision_rate", log->collision_rate);
+    //assign_to_dict(dict, "oob", log->oob);
+    //assign_to_dict(dict, "episode_return", log->episode_return);
+    //assign_to_dict(dict, "episode_length", log->episode_length);
+
+    //assign_to_dict(dict, "jitter", log->jitter);
+    assign_to_dict(dict, "perfect_grip", log->perfect_grip);
+    assign_to_dict(dict, "perfect_deliv", log->perfect_deliv);
+    //assign_to_dict(dict, "perfect_now", log->perfect_now);
+    //assign_to_dict(dict, "to_pickup", log->to_pickup);
+    //assign_to_dict(dict, "ho_pickup", log->ho_pickup);
+    //assign_to_dict(dict, "de_pickup", log->de_pickup);
+    //assign_to_dict(dict, "to_drop", log->to_drop);
+    //assign_to_dict(dict, "ho_drop", log->ho_drop);
+    //assign_to_dict(dict, "dist", log->dist);
+    //assign_to_dict(dict, "dist100", log->dist100);
+
+    assign_to_dict(dict, "episode_num", log->episode_num);
+    //assign_to_dict(dict, "tick", log->tick);
+    assign_to_dict(dict, "episode_gain", log->episode_gain);
+    assign_to_dict(dict, "anneal", log->anneal);
+
+    //assign_to_dict(dict, "n", log->n);
+    return 0;
+}
diff --git a/pufferlib/ocean/drone_delivery/drone_delivery.c b/pufferlib/ocean/drone_delivery/drone_delivery.c
new file mode 100644
index 000000000..a457d47bd
--- /dev/null
+++ b/pufferlib/ocean/drone_delivery/drone_delivery.c
@@ -0,0 +1,202 @@
+#include "drone_delivery.h"
+#include "puffernet.h"
+#include <math.h>
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten/emscripten.h>
+#endif
+
+double randn(double mean, double std) {
+    static int has_spare = 0;
+    static double spare;
+
+    if (has_spare) {
+        has_spare = 0;
+        return mean + std * spare;
+    }
+
+    has_spare = 1;
+    double u, v, s;
+    do {
+        u = 2.0 * rand() / RAND_MAX - 1.0;
+        v = 2.0 * rand() / RAND_MAX - 1.0;
+        s = u * u + v * v;
+    } while (s >= 1.0 || s == 0.0);
+
+    s = sqrt(-2.0 * log(s) / s);
+    spare = v * s;
+    return mean + std * (u * s);
+}
+
+typedef struct LinearContLSTM LinearContLSTM;
+struct LinearContLSTM {
+    int num_agents;
+    float *obs;
+    float *log_std;
+    Linear *encoder;
+    GELU *gelu1;
+    LSTM *lstm;
+    Linear *actor;
+    Linear *value_fn;
+    int num_actions;
+};
+
+LinearContLSTM *make_linearcontlstm(Weights *weights, int num_agents, int input_dim,
+        int logit_sizes[], int num_actions) {
+    LinearContLSTM *net = calloc(1, sizeof(LinearContLSTM));
+    net->num_agents = num_agents;
+    net->obs = calloc(num_agents * input_dim, sizeof(float));
+    net->num_actions = logit_sizes[0];
+    net->log_std = weights->data;
+    weights->idx += net->num_actions;
+    net->encoder = make_linear(weights, num_agents, input_dim, 128);
+    net->gelu1 = make_gelu(num_agents, 128);
+    int atn_sum = 0;
+    for (int i = 0; i < num_actions; i++) {
+        atn_sum += logit_sizes[i];
+    }
+    net->actor = make_linear(weights, num_agents, 128, atn_sum);
+    net->value_fn = make_linear(weights, num_agents, 128, 1);
+    net->lstm = make_lstm(weights, num_agents, 128, 128);
+    return net;
+}
+
+void free_linearcontlstm(LinearContLSTM *net) {
+    free(net->obs);
+    free(net->encoder);
+    free(net->gelu1);
+    free(net->actor);
+    free(net->value_fn);
+    free(net->lstm);
+    free(net);
+}
+
+void forward_linearcontlstm(LinearContLSTM *net, float *observations, float *actions) {
+    linear(net->encoder, observations);
+    gelu(net->gelu1, net->encoder->output);
+    lstm(net->lstm, net->gelu1->output);
+    linear(net->actor, net->lstm->state_h);
+    linear(net->value_fn, net->lstm->state_h);
+    // Sample each agent's actions from the Gaussian policy head
+    for (int a = 0; a < net->num_agents; a++) {
+        for (int i = 0; i < net->num_actions; i++) {
+            float std = expf(net->log_std[i]);
+            float mean = net->actor->output[a * net->num_actions + i];
+            actions[a * net->num_actions + i] = randn(mean, std);
+        }
+    }
+}
+
+void generate_dummy_actions(DroneDelivery *env) {
+    // Generate random floats in [-1, 1] for every agent's 4 motors
+    for (int i = 0; i < env->num_agents * 4; i++) {
+        env->actions[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+    }
+}
+
+#ifdef __EMSCRIPTEN__
+typedef struct {
+    DroneDelivery *env;
+    LinearContLSTM *net;
+    Weights *weights;
+} WebRenderArgs;
+
+void emscriptenStep(void *e) {
+    WebRenderArgs *args = (WebRenderArgs *)e;
+    DroneDelivery *env = args->env;
+    LinearContLSTM *net = args->net;
+
+    if (net != NULL) {
+        forward_linearcontlstm(net, env->observations, env->actions);
+    } else {
+        generate_dummy_actions(env);
+    }
+    c_step(env);
+    c_render(env);
+    return;
+}
+
+WebRenderArgs *web_args = NULL;
+#endif
+
+int main() {
+    srand(42); // Seed random number generator
+
+    DroneDelivery *env = calloc(1, sizeof(DroneDelivery));
+    env->num_agents = 64;
+
+    env->grip_k_max = 17.105300970916993;
+
+    env->ablation = 0.0;
+    env->anneal_min = 0.5;
+
+    env->num_envs = 16;
+    env->perfect_anneal = 250000000;
+    //env->perfect_anneal_start = 175000000; // not a DroneDelivery field; kept for reference
+    env->perfect_deadline = 150000000;
+
+    env->pos_const = 0.7108647198043252;
+    env->pos_penalty = 0.0009629600280207475;
+
+    env->reward_grip = 0.9999;
+    env->reward_ho_drop = 0.20890470430909128;
+    env->reward_hover = 0.1603521816569735;
+
+    env->reward_max_dist = 65.0;
+    env->reward_min_dist = 0.9330297552248776;
+
+    env->vel_penalty_clamp = 0.25;
+
+    env->w_approach = 2.2462377881752698;
+    env->w_position = 0.7312628607193232;
+    env->w_stability = 1.6094417286807845;
+    env->w_velocity = 0.0009999999999999731;
+
+    init(env);
+
+    size_t obs_size = 45;
+    size_t act_size = 4;
+    env->observations = (float *)calloc(env->num_agents * obs_size, sizeof(float));
+    env->actions = (float *)calloc(env->num_agents * act_size, sizeof(float));
+    env->rewards = (float *)calloc(env->num_agents, sizeof(float));
+    env->terminals = (unsigned char *)calloc(env->num_agents, sizeof(unsigned char));
+
+    //Weights *weights = load_weights("resources/drone/drone_weights.bin", 136073);
+    //int logit_sizes[1] = {4};
+    //LinearContLSTM *net = make_linearcontlstm(weights, env->num_agents, obs_size, logit_sizes, 1);
+
+    if (!env->observations || !env->actions || !env->rewards || !env->terminals) {
+        fprintf(stderr, "ERROR: Failed to allocate memory for demo buffers.\n");
+        free(env->observations);
+        free(env->actions);
+        free(env->rewards);
+        free(env->terminals);
+        free(env);
+        return 1;
+    }
+
+    c_reset(env);
+
+#ifdef __EMSCRIPTEN__
+    WebRenderArgs *args = calloc(1, sizeof(WebRenderArgs));
+    args->env = env;
+    args->net = NULL;     // pretrained net is disabled above; demo runs on random actions
+    args->weights = NULL;
+    web_args = args;
+
+    emscripten_set_main_loop_arg(emscriptenStep, args, 0, true);
+#else
+    c_render(env);
+
+    while (!WindowShouldClose()) {
+        //forward_linearcontlstm(net, env->observations, env->actions);
+        generate_dummy_actions(env); // random actions while the net is disabled
+        c_step(env);
+        c_render(env);
+    }
+
+    c_close(env);
+    //free_linearcontlstm(net);
+    free(env->observations);
+    free(env->actions);
+    free(env->rewards);
+    free(env->terminals);
+    free(env);
+#endif
+
+    return 0;
+}
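Note: randn() above is the Marsaglia polar method (a rejection-sampled Box-Muller variant that yields two standard normals per accepted pair). A quick numpy check of the same pattern, purely illustrative and not part of the PR:

    import numpy as np

    def polar_gauss(n, rng=np.random.default_rng(0)):
        # Marsaglia polar method, mirroring randn() in drone_delivery.c
        out = []
        while len(out) < n:
            u, v = rng.uniform(-1, 1, 2)
            s = u * u + v * v
            if 0 < s < 1:
                f = np.sqrt(-2 * np.log(s) / s)
                out += [u * f, v * f]
        return np.array(out[:n])

    x = polar_gauss(100_000)
    print(x.mean(), x.std())  # ~0 and ~1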
diff --git a/pufferlib/ocean/drone_delivery/drone_delivery.h b/pufferlib/ocean/drone_delivery/drone_delivery.h
new file mode 100644
index 000000000..124c75a16
--- /dev/null
+++ b/pufferlib/ocean/drone_delivery/drone_delivery.h
@@ -0,0 +1,905 @@
+// Originally made by Sam Turner and Finlay Sanders, 2025.
+// Included in pufferlib under the original project's MIT license.
+// https://github.com/stmio/drone
+
+// Keith (Kinvert) used the above work (drone_swarm) as a base to make
+// drone_delivery for a 2025 hackathon. Thank you Sam, Finlay, and Joseph.
+// Hackathon team members were BET Adsorption and Autark.
+
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "raylib.h"
+#include "dronelib_delivery.h"
+
+typedef struct Client Client;
+struct Client {
+    Camera3D camera;
+    float width;
+    float height;
+
+    float camera_distance;
+    float camera_azimuth;
+    float camera_elevation;
+    bool is_dragging;
+    Vector2 last_mouse_pos;
+
+    // Trailing path buffer (for rendering only)
+    Trail* trails;
+};
+
+typedef struct {
+    float *observations;
+    float *actions;
+    float *rewards;
+    unsigned char *terminals;
+
+    float dist;
+
+    Log log;
+    int tick;
+    int report_interval;
+    bool render;
+
+    int num_agents;
+    Drone* agents;
+    int num_envs;
+
+    float ablation;
+    float anneal;
+    float anneal_min;
+    float annealed_episode;
+    float inv_annealed_episode;
+
+    float box_k;
+    float box_k_max;
+    float box_k_min;
+
+    float dist_decay;
+
+    float grip_k;
+    float grip_k_decay;
+    float grip_k_max;
+
+    float perfect_anneal;
+    float perfect_deadline;
+    float perfect_episode;
+    float inv_perfect_episode;
+
+    float pos_const;
+    float pos_penalty;
+
+    float reward_dist;
+    float inv_reward_dist;
+    float reward_grip;
+    float reward_ho_drop;
+    float reward_hover;
+
+    float reward_max_dist;
+    float reward_min_dist;
+
+    float vel_penalty_clamp;
+
+    float w_approach;
+    float w_position;
+    float w_stability;
+    float w_velocity;
+
+    int episode_num;
+    float episode_gain;
+    float episode_gain_increment;
+
+    Client *client;
+} DroneDelivery;
+
+void init(DroneDelivery *env) {
+    env->render = false;
+    env->box_k = 0.001f;
+    env->box_k_min = 0.001f;
+    env->box_k_max = 1.0f;
+    env->episode_gain = 0.0f;
+    env->agents = calloc(env->num_agents, sizeof(Drone));
+    env->log = (Log){0};
+    env->tick = 0;
+    env->episode_num = 0;
+    env->perfect_episode = env->perfect_deadline / (env->num_envs * env->num_envs * env->num_agents * HORIZON);
+    env->inv_perfect_episode = 1.0f / env->perfect_episode;
+    env->annealed_episode = (env->perfect_anneal - env->perfect_deadline) / (env->num_envs * env->num_envs * env->num_agents * HORIZON);
+    env->inv_annealed_episode = 1.0f / env->annealed_episode;
+    env->grip_k_decay = env->grip_k_max * env->inv_perfect_episode;
+    env->dist_decay = env->reward_max_dist * env->inv_perfect_episode;
+    env->anneal = 1.0f;
+
+    float ablation = env->ablation;
+    if (ablation > 0.0001f && ablation < 0.2f) {
+        env->w_approach = 0.0f;
+    } else if (ablation >= 0.2f && ablation < 0.4f) {
+        env->w_position = 0.0f;
+    } else if (ablation >= 0.4f && ablation < 0.6f) {
+        env->w_stability = 0.0f;
+    } else if (ablation >= 0.6f && ablation < 0.8f) {
+        env->w_velocity = 0.0f;
+    } else if (ablation >= 0.8f) {
+        env->reward_hover = 0.0f;
+    }
+}
+
+void add_log(DroneDelivery *env, int idx, bool oob) {
+    Drone *agent = &env->agents[idx];
+    env->log.score += agent->score;
+    env->log.episode_return += agent->episode_return;
+    env->log.episode_length += agent->episode_length;
+    env->log.collision_rate += agent->collisions / (float)agent->episode_length;
+    //env->log.perf += agent->score / (float)agent->episode_length;
+    env->log.perf += agent->perfect_deliv;
+    if (oob) {
+        env->log.oob += 1.0f;
+    }
+    env->log.n += 1.0f;
+
+    env->log.episode_num += env->episode_num;
+    env->log.tick += env->tick;
+    env->log.episode_gain += env->episode_gain;
+    env->log.anneal += env->anneal;
+
+    agent->episode_length = 0;
+    agent->episode_return = 0.0f;
+}
+
+Drone* nearest_drone(DroneDelivery* env, Drone *agent) {
+    float min_dist = 999999.0f;
+    Drone *nearest = NULL;
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *other = &env->agents[i];
+        if (other == agent) {
+            continue;
+        }
+        float dx = agent->state.pos.x - other->state.pos.x;
+        float dy = agent->state.pos.y - other->state.pos.y;
+        float dz = agent->state.pos.z - other->state.pos.z;
+        float dist = sqrtf(dx*dx + dy*dy + dz*dz);
+        if (dist < min_dist) {
+            min_dist = dist;
+            nearest = other;
+        }
+    }
+    // NULL only when num_agents == 1; callers guard on num_agents > 1
+    return nearest;
+}
+
+void compute_observations(DroneDelivery *env) {
+    int idx = 0;
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+
+        Quat q_inv = quat_inverse(agent->state.quat);
+        Vec3 linear_vel_body = quat_rotate(q_inv, agent->state.vel);
+        Vec3 drone_up_world = quat_rotate(agent->state.quat, (Vec3){0.0f, 0.0f, 1.0f});
+
+        // TODO: may need absolute observations for the moving box
+        // 45 obs total after adding dvx/dvy/dvz
+        env->observations[idx++] = linear_vel_body.x * agent->params.inv_max_vel;
+        env->observations[idx++] = linear_vel_body.y * agent->params.inv_max_vel;
+        env->observations[idx++] = linear_vel_body.z * agent->params.inv_max_vel;
+        env->observations[idx++] = clampf(agent->state.vel.x, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(agent->state.vel.y, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(agent->state.vel.z, -1.0f, 1.0f);
+
+        env->observations[idx++] = clampf(agent->state.omega.x * agent->params.inv_max_omega, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(agent->state.omega.y * agent->params.inv_max_omega, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(agent->state.omega.z * agent->params.inv_max_omega, -1.0f, 1.0f);
+
+        env->observations[idx++] = drone_up_world.x;
+        env->observations[idx++] = drone_up_world.y;
+        env->observations[idx++] = drone_up_world.z;
+
+        env->observations[idx++] = agent->state.quat.w;
+        env->observations[idx++] = agent->state.quat.x;
+        env->observations[idx++] = agent->state.quat.y;
+        env->observations[idx++] = agent->state.quat.z;
+
+        env->observations[idx++] = agent->state.rpms[0] * agent->params.inv_max_rpm;
+        env->observations[idx++] = agent->state.rpms[1] * agent->params.inv_max_rpm;
+        env->observations[idx++] = agent->state.rpms[2] * agent->params.inv_max_rpm;
+        env->observations[idx++] = agent->state.rpms[3] * agent->params.inv_max_rpm;
+
+        env->observations[idx++] = agent->state.pos.x * INV_GRID_X;
+        env->observations[idx++] = agent->state.pos.y * INV_GRID_Y;
+        env->observations[idx++] = agent->state.pos.z * INV_GRID_Z;
+
+        float dx = agent->target_pos.x - agent->state.pos.x;
+        float dy = agent->target_pos.y - agent->state.pos.y;
+        float dz = agent->target_pos.z - agent->state.pos.z;
+        env->observations[idx++] = clampf(dx, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(dy, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(dz, -1.0f, 1.0f);
+        env->observations[idx++] = dx * INV_GRID_X;
+        env->observations[idx++] = dy * INV_GRID_Y;
+        env->observations[idx++] = dz * INV_GRID_Z;
+
+        env->observations[idx++] = agent->last_collision_reward;
+        env->observations[idx++] = agent->last_abs_reward;
+        env->observations[idx++] = clampf(agent->last_position_reward, -1.0f, 1.0f);
+        //env->observations[idx++] = clampf(agent->last_velocity_reward, -1.0f, 1.0f);
+        //env->observations[idx++] = clampf(agent->last_stability_reward, -1.0f, 1.0f);
+        //env->observations[idx++] = clampf(agent->last_approach_reward, -1.0f, 1.0f);
+
+        // Multiagent obs
+        Drone* nearest = nearest_drone(env, agent);
+        if (env->num_agents > 1) {
+            env->observations[idx++] = clampf(nearest->state.pos.x - agent->state.pos.x, -1.0f, 1.0f);
+            env->observations[idx++] = clampf(nearest->state.pos.y - agent->state.pos.y, -1.0f, 1.0f);
+            env->observations[idx++] = clampf(nearest->state.pos.z - agent->state.pos.z, -1.0f, 1.0f);
+        } else {
+            env->observations[idx++] = 0.0f;
+            env->observations[idx++] = 0.0f;
+            env->observations[idx++] = 0.0f;
+        }
+
+        Vec3 to_box = quat_rotate(q_inv, sub3(agent->box_pos, agent->state.pos));
+        Vec3 to_drop = quat_rotate(q_inv, sub3(agent->drop_pos, agent->state.pos));
+        env->observations[idx++] = to_box.x * INV_GRID_X;
+        env->observations[idx++] = to_box.y * INV_GRID_Y;
+        env->observations[idx++] = to_box.z * INV_GRID_Z;
+        env->observations[idx++] = to_drop.x * INV_GRID_X;
+        env->observations[idx++] = to_drop.y * INV_GRID_Y;
+        env->observations[idx++] = to_drop.z * INV_GRID_Z;
+        env->observations[idx++] = 1.0f; // TODO: remove
+        float dvx = agent->target_vel.x - agent->state.vel.x;
+        float dvy = agent->target_vel.y - agent->state.vel.y;
+        float dvz = agent->target_vel.z - agent->state.vel.z;
+        env->observations[idx++] = clampf(dvx, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(dvy, -1.0f, 1.0f);
+        env->observations[idx++] = clampf(dvz, -1.0f, 1.0f);
+    }
+}
+
+float compute_reward(DroneDelivery* env, Drone *agent, bool collision) {
+    Vec3 tgt = agent->hidden_pos;
+
+    Vec3 pos_error = {agent->state.pos.x - tgt.x, agent->state.pos.y - tgt.y, agent->state.pos.z - tgt.z};
+    float dist = sqrtf(pos_error.x * pos_error.x + pos_error.y * pos_error.y + pos_error.z * pos_error.z) + 1e-8f;
+
+    Vec3 vel_error = {agent->state.vel.x - agent->hidden_vel.x,
+                      agent->state.vel.y - agent->hidden_vel.y,
+                      agent->state.vel.z - agent->hidden_vel.z};
+    float vel_magnitude = sqrtf(vel_error.x * vel_error.x + vel_error.y * vel_error.y + vel_error.z * vel_error.z);
+
+    float angular_vel_magnitude = sqrtf(agent->state.omega.x * agent->state.omega.x +
+                                        agent->state.omega.y * agent->state.omega.y +
+                                        agent->state.omega.z * agent->state.omega.z);
+
+    float proximity_factor = clampf(1.0f - dist * env->inv_reward_dist, 0.0f, 1.0f);
+
+    agent->last_position_reward = clampf(expf(-dist / (env->reward_dist * env->pos_const)), -env->pos_penalty, 1.0f);
+
+    // slight reward around 0.05, large penalty for over 0.4
+    float velocity_reward = clampf(proximity_factor * (2.0f * expf(-(vel_magnitude - 0.05f) * 10.0f) - 1.0f), -env->vel_penalty_clamp, 1.0f);
+    if (velocity_reward < 0.0f) agent->last_velocity_reward = velocity_reward * env->episode_gain;
+
+    agent->last_stability_reward = -angular_vel_magnitude * agent->params.inv_max_omega;
+
+    Vec3 to_target_unit = {0, 0, 0};
+    if (dist > 0.001f) {
+        to_target_unit.x = -pos_error.x / dist;
+        to_target_unit.y = -pos_error.y / dist;
+        to_target_unit.z = -pos_error.z / dist;
+    }
+    float approach_dot = to_target_unit.x * agent->state.vel.x +
+                         to_target_unit.y * agent->state.vel.y +
+                         to_target_unit.z * agent->state.vel.z;
+
+    float approach_weight = clampf(dist * env->inv_reward_dist, 0.0f, 1.0f); // TODO
+    agent->last_approach_reward = approach_weight * clampf(approach_dot * agent->params.inv_max_vel, -0.5f, 0.5f);
+
+    float hover_bonus = 0.0f; // TODO: add a coefficient
+    if (dist < env->reward_dist * 0.2f && vel_magnitude < 0.2f && agent->state.vel.z < 0.0f) {
+        hover_bonus = env->reward_hover;
+    }
+
+    float collision_penalty = 0.0f;
+    if (collision && env->num_agents > 1) {
+        Drone *nearest = nearest_drone(env, agent);
+        float dx = agent->state.pos.x - nearest->state.pos.x;
+        float dy = agent->state.pos.y - nearest->state.pos.y;
+        float dz = agent->state.pos.z - nearest->state.pos.z;
+        float min_dist = sqrtf(dx*dx + dy*dy + dz*dz);
+        if (min_dist < 1.0f) {
+            collision_penalty = -1.0f;
+            agent->collisions += 1.0f;
+        }
+    }
+
+    float total_reward = env->w_position * agent->last_position_reward +
+                         env->w_velocity * agent->last_velocity_reward +
+                         env->w_stability * agent->last_stability_reward +
+                         env->w_approach * agent->last_approach_reward +
+                         hover_bonus +
+                         collision_penalty;
+
+    total_reward = clampf(total_reward, -1.0f, 1.0f);
+
+    float delta_reward = total_reward - agent->last_abs_reward;
+
+    agent->last_collision_reward = collision_penalty;
+    agent->last_abs_reward = total_reward;
+    agent->episode_length++;
+    agent->score += total_reward;
+    env->dist = dist * dist;
+    agent->jitter = 10.0f - (dist + vel_magnitude + angular_vel_magnitude);
+
+    return delta_reward * env->anneal;
+}
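Note: compute_reward() returns the *change* in the shaped score (delta_reward = total_reward - last_abs_reward), a potential-based shaping scheme, scaled by the anneal factor; an agent that holds a constant shaped score earns nothing. A tiny worked example with illustrative numbers:

    # Potential-based shaping: reward the improvement in the shaped score.
    last, anneal = 0.30, 1.0
    for total in (0.35, 0.40, 0.40):
        delta = total - last
        print(delta * anneal)   # 0.05, 0.05, 0.0 -> no reward for standing still
        last = total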
+void reset_delivery(DroneDelivery* env, Drone *agent, int idx) {
+    agent->box_pos = (Vec3){rndf(-MARGIN_X, MARGIN_X), rndf(-MARGIN_Y, MARGIN_Y), rndf(-GRID_Z + 0.5f, -GRID_Z + 3.0f)};
+    agent->drop_pos = (Vec3){rndf(-MARGIN_X, MARGIN_X), rndf(-MARGIN_Y, MARGIN_Y), -GRID_Z + 0.5f};
+    agent->box_vel = (Vec3){0.0f, 0.0f, 0.0f};
+    agent->box_vel.x = agent->box_pos.x > 0.0f ? rndf(-0.2f, 0.0f) : rndf(0.0f, 0.2f);
+    agent->box_vel.y = agent->box_pos.y > 0.0f ? rndf(-0.2f, 0.0f) : rndf(0.0f, 0.2f);
+    agent->gripping = false;
+    agent->delivered = false;
+    agent->grip_height = 0.0f;
+    agent->approaching_pickup = false;
+    agent->hovering_pickup = false;
+    agent->descent_pickup = false;
+    agent->approaching_drop = false;
+    agent->hovering_drop = false;
+    agent->descent_drop = false;
+    agent->hover_timer = 0.0f;
+    agent->target_pos = agent->box_pos;
+    agent->target_vel = agent->box_vel;
+    agent->hidden_pos = agent->target_pos;
+    agent->hidden_pos.z += 1.0f;
+    agent->hidden_vel = agent->box_vel;
+
+    float drone_capacity = agent->params.arm_len * 4.0f;
+    agent->box_size = rndf(0.3f, fmaxf(fminf(drone_capacity, 1.0f), 0.3f));
+
+    float box_volume = agent->box_size * agent->box_size * agent->box_size;
+    agent->box_base_mass = fminf(BASE_BOX_DENSITY * box_volume * rndf(0.05f, 2.0f), agent->box_mass_max);
+    agent->box_mass = env->box_k * agent->box_base_mass;
+
+    agent->base_mass = agent->params.mass;
+    agent->base_ixx = agent->params.ixx;
+    agent->base_iyy = agent->params.iyy;
+    agent->base_izz = agent->params.izz;
+    agent->base_k_drag = agent->params.k_drag;
+    agent->base_b_drag = agent->params.b_drag;
+}
+
+void reset_agent(DroneDelivery* env, Drone *agent, int idx) {
+    agent->episode_return = 0.0f;
+    agent->episode_length = 0;
+    agent->collisions = 0.0f;
+    agent->score = 0.0f;
+    agent->perfect_grip = false;
+    agent->perfect_deliveries = 0.0f;
+    agent->perfect_deliv = false;
+    agent->perfect_now = false;
+    agent->has_delivered = false;
+    agent->jitter = 100.0f;
+    agent->box_physics_on = false;
+
+    //float size = 0.2f;
+    //init_drone(agent, size, 0.0f);
+    float size = rndf(0.3f, 1.0f);
+    init_drone(agent, size, 0.25f);
+    agent->color = (Color){255, 0, 0, 255};
+
+    agent->state.pos = (Vec3){
+        rndf(-MARGIN_X, MARGIN_X),
+        rndf(-MARGIN_Y, MARGIN_Y),
+        rndf(-MARGIN_Z, MARGIN_Z)
+    };
+    agent->prev_pos = agent->state.pos;
+    agent->spawn_pos = agent->state.pos;
+
+    reset_delivery(env, agent, idx);
+
+    bool watch_for_collisions = true;
+    compute_reward(env, agent, watch_for_collisions);
+}
+
+void random_bump(Drone* agent) {
+    agent->state.vel.x += rndf(-0.1f, 0.1f);
+    agent->state.vel.y += rndf(-0.1f, 0.1f);
+    agent->state.vel.z += rndf(0.05f, 0.3f);
+    agent->state.omega.x += rndf(-0.5f, 0.5f);
+    agent->state.omega.y += rndf(-0.5f, 0.5f);
+    agent->state.omega.z += rndf(-0.5f, 0.5f);
+}
+
+void update_gripping_physics(Drone* agent) {
+    if (agent->gripping) {
+        agent->params.mass = agent->base_mass + agent->box_mass * rndf(0.9f, 1.1f);
+
+        float grip_dist = agent->box_size * 0.5f;
+        float added_inertia = agent->box_mass * grip_dist * grip_dist * rndf(0.8f, 1.2f);
+        agent->params.ixx = agent->base_ixx + added_inertia;
+        agent->params.iyy = agent->base_iyy + added_inertia;
+        agent->params.izz = agent->base_izz + added_inertia * 0.5f;
+
+        float drag_multiplier = 1.0f + (agent->box_size / agent->params.arm_len) * rndf(0.5f, 1.0f);
+        agent->params.k_drag = agent->base_k_drag * drag_multiplier;
+        agent->params.b_drag = agent->base_b_drag * drag_multiplier;
+        agent->box_physics_on = true;
+    } else {
+        agent->params.mass = agent->base_mass;
+        agent->params.ixx = agent->base_ixx;
+        agent->params.iyy = agent->base_iyy;
+        agent->params.izz = agent->base_izz;
+        agent->params.k_drag = agent->base_k_drag;
+        agent->params.b_drag = agent->base_b_drag;
+    }
+}
+
+void c_reset(DroneDelivery *env) {
+    env->tick = 0;
+    env->episode_num += 1;
+    env->reward_dist = clampf(env->reward_max_dist - env->episode_num * env->dist_decay, env->reward_min_dist, 100.0f);
+    env->inv_reward_dist = 1.0f / env->reward_dist;
+    if (env->episode_num > 1) env->episode_gain = clampf(env->episode_gain + env->inv_perfect_episode, 0.0f, 1.0f);
+
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+        reset_agent(env, agent, i);
+        agent->target_pos = (Vec3){agent->box_pos.x, agent->box_pos.y, agent->box_pos.z};
+        agent->target_vel = agent->box_vel;
+    }
+
+    if (env->grip_k < 1.01f && env->box_k > 0.99f) {
+        env->anneal = clampf(env->anneal - (env->inv_annealed_episode * (1.0f - env->anneal_min)), env->anneal_min, 1.0f);
+    }
+
+    compute_observations(env);
+}
+
+void c_step(DroneDelivery *env) {
+    env->tick = (env->tick + 1) % HORIZON;
+    //env->log.dist = 0.0f;
+    //env->log.dist100 = 0.0f;
+    env->grip_k = clampf(env->grip_k_max - env->episode_num * env->grip_k_decay, 1.0f, 100.0f);
+    env->box_k = clampf(env->box_k_min + env->episode_num * env->inv_perfect_episode, env->box_k_min, env->box_k_max);
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+        env->rewards[i] = 0;
+        env->terminals[i] = 0;
+        agent->perfect_now = false;
+
+        float* atn = &env->actions[4*i];
+        move_drone(agent, atn);
+
+        bool out_of_bounds = agent->state.pos.x < -GRID_X || agent->state.pos.x > GRID_X ||
+                             agent->state.pos.y < -GRID_Y || agent->state.pos.y > GRID_Y ||
+                             agent->state.pos.z < -GRID_Z || agent->state.pos.z > GRID_Z;
+
+        float reward = 0.0f;
+
+        if (!agent->gripping) {
+            agent->box_pos.x += agent->box_vel.x * DT;
+            agent->box_pos.y += agent->box_vel.y * DT;
+            agent->box_pos.z += agent->box_vel.z * DT;
+            agent->hidden_pos.x = agent->box_pos.x;
+            agent->hidden_pos.y = agent->box_pos.y;
+        }
+        agent->hidden_pos.z += agent->hidden_vel.z * DT;
+        if (agent->hidden_pos.z < agent->target_pos.z) {
+            agent->hidden_pos.z = agent->target_pos.z;
+            agent->hidden_vel.z = 0.0f;
+        }
+        agent->approaching_pickup = true;
+
+        Vec3 vel_error = {agent->state.vel.x - agent->hidden_vel.x,
+                          agent->state.vel.y - agent->hidden_vel.y,
+                          agent->state.vel.z - agent->hidden_vel.z};
+        float speed = sqrtf(vel_error.x * vel_error.x + vel_error.y * vel_error.y + vel_error.z * vel_error.z);
+
+        agent->box_mass = env->box_k * agent->box_base_mass;
+        float k = env->grip_k;
+        if (!agent->gripping) {
+            float dist_to_hidden = sqrtf(powf(agent->state.pos.x - agent->hidden_pos.x, 2) +
+                                         powf(agent->state.pos.y - agent->hidden_pos.y, 2) +
+                                         powf(agent->state.pos.z - agent->hidden_pos.z, 2));
+            float xy_dist_to_box = sqrtf(powf(agent->state.pos.x - agent->box_pos.x, 2) +
+                                         powf(agent->state.pos.y - agent->box_pos.y, 2));
+            float z_dist_above_box = agent->state.pos.z - agent->box_pos.z;
+
+            if (xy_dist_to_box < 2.0f && agent->state.pos.z < agent->target_pos.z + 10.0f) {
+                reward -= 0.001f * env->episode_num - 0.001f;
+            }
+
+            // Phase 1 Box Hover
+            if (!agent->hovering_pickup) {
+                if (dist_to_hidden < 0.4f && speed < 0.4f) {
+                    agent->hovering_pickup = true;
+                    agent->color = (Color){255, 255, 255, 255}; // White
+                } else {
+                    if (!agent->has_delivered) {
+                        agent->color = (Color){255, 100, 100, 255}; // Light Red
+                    }
+                }
+            }
+
+            // Phase 2 Box Descent
+            else {
+                agent->descent_pickup = true;
+                agent->hidden_vel.z = -0.1f;
+                if (
+                    xy_dist_to_box < k * 0.1f &&
+                    z_dist_above_box < k * 0.1f && z_dist_above_box > 0.0f &&
+                    speed < k * 0.1f &&
+                    agent->state.vel.z > k * -0.05f && agent->state.vel.z < 0.0f
+                ) {
+                    if (k < 1.01f && env->box_k > 0.99f) {
+                        agent->perfect_grip = true;
+                        agent->color = (Color){100, 100, 255, 255}; // Light Blue
+                    }
+                    agent->gripping = true;
+                    reward += env->reward_grip * env->anneal;
+                    random_bump(agent);
+                } else if (dist_to_hidden > 0.4f || speed > 0.4f) {
+                    agent->color = (Color){255, 100, 100, 255}; // Light Red
+                }
+            }
+        } else {
+
+            // Phase 3 Drop Hover
+            agent->box_pos = agent->state.pos;
+            agent->box_pos.z -= 0.5f;
+            agent->target_pos = agent->drop_pos;
+            agent->target_vel = (Vec3){0.0f, 0.0f, 0.0f};
+            float xy_dist_to_drop = sqrtf(powf(agent->state.pos.x - agent->drop_pos.x, 2) +
+                                          powf(agent->state.pos.y - agent->drop_pos.y, 2));
+            float z_dist_above_drop = agent->state.pos.z - agent->drop_pos.z;
+
+            //if (xy_dist_to_drop < 2.0f && agent->state.pos.z < agent->target_pos.z + 10.0f) {
+            //    reward -= 0.001f * env->episode_num - 0.001f;
+            //}
+
+            if (!agent->box_physics_on && agent->state.vel.z > 0.3f) {
+                update_gripping_physics(agent);
+            }
+
+            if (!agent->hovering_drop) {
+                agent->target_pos = (Vec3){agent->drop_pos.x, agent->drop_pos.y, agent->drop_pos.z + 0.4f};
+                agent->hidden_pos = (Vec3){agent->drop_pos.x, agent->drop_pos.y, agent->drop_pos.z + 1.0f};
+                agent->hidden_vel = (Vec3){0.0f, 0.0f, 0.0f};
+                if (xy_dist_to_drop < k * 0.4f && z_dist_above_drop > 0.7f && z_dist_above_drop < 1.3f) {
+                    agent->hovering_drop = true;
+                    reward += env->reward_ho_drop * env->anneal;
+                    agent->color = (Color){0, 0, 255, 255}; // Blue
+                }
+            }
+
+            // Phase 4 Drop Descent
+            else {
+                agent->target_pos = agent->drop_pos;
+                agent->hidden_pos.x = agent->drop_pos.x;
+                agent->hidden_pos.y = agent->drop_pos.y;
+                agent->hidden_vel = (Vec3){0.0f, 0.0f, -0.1f};
+                if (xy_dist_to_drop < k * 0.2f && z_dist_above_drop < k * 0.2f) {
+                    agent->hovering_pickup = false;
+                    agent->gripping = false;
+                    update_gripping_physics(agent);
+                    agent->box_physics_on = false;
+                    agent->hovering_drop = false;
+                    reward += 1.0f;
+                    agent->delivered = true;
+                    agent->has_delivered = true;
+                    if (k < 1.01f && agent->perfect_grip && env->box_k > 0.99f) {
+                        agent->perfect_deliv = true;
+                        agent->perfect_deliveries += 1.0f;
+                        agent->perfect_now = true;
+                        agent->color = (Color){0, 255, 0, 255}; // Green
+                    }
+                    reset_delivery(env, agent, i);
+                }
+            }
+        }
+
+        reward += compute_reward(env, agent, true);
+
+        for (int j = 0; j < env->num_agents; j++) {
+            Drone *a = &env->agents[j];
+            if (a->perfect_grip && env->grip_k < 1.01f && env->box_k > 0.99f) {
+                env->log.perfect_grip += 1.0f;
+            }
+            if (a->perfect_deliv && env->grip_k < 1.01f && a->perfect_grip && env->box_k > 0.99f) {
+                env->log.perfect_deliv += a->perfect_deliveries;
+            }
+            if (a->perfect_deliv && env->grip_k < 1.01f && a->perfect_grip && a->perfect_now && env->box_k > 0.99f) {
+                env->log.perfect_now += 1.0f;
+            }
+        }
+
+        env->rewards[i] += reward;
+        agent->episode_return += reward;
+
+        float min_z = -GRID_Z + 0.2f;
+        if (agent->gripping) {
+            min_z += 0.1f;
+        }
+
+        if (out_of_bounds || agent->state.pos.z < min_z) {
+            env->rewards[i] -= 1;
+            env->terminals[i] = 1;
+            add_log(env, i, true);
+            reset_agent(env, agent, i);
+        } else if (env->tick >= HORIZON - 1) {
+            env->terminals[i] = 1;
+            add_log(env, i, false);
+        }
+    }
+    if (env->tick >= HORIZON - 1) {
+        c_reset(env);
+    }
+
+    compute_observations(env);
+}
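Note: c_step() and c_reset() implement the curriculum: grip_k anneals the grasp tolerances down toward 1.0, box_k ramps the box mass fraction up to 1.0, reward_dist shrinks toward reward_min_dist, and the overall reward anneal kicks in only once both curricula finish. A small sketch of the schedules with the same formulas and illustrative constants (perfect_episode here is a made-up episode count, not the value computed in init()):

    # Curriculum schedules from c_step()/c_reset(), illustrative constants.
    def clamp(v, lo, hi): return max(lo, min(hi, v))

    grip_k_max, perfect_episode = 5.12, 400.0   # ASSUMED values for illustration
    grip_k_decay = grip_k_max / perfect_episode
    for ep in (0, 200, 400, 600):
        grip_k = clamp(grip_k_max - ep * grip_k_decay, 1.0, 100.0)  # tolerance multiplier
        box_k = clamp(0.001 + ep / perfect_episode, 0.001, 1.0)     # box mass fraction
        print(ep, round(grip_k, 2), round(box_k, 3))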
+void c_close_client(Client *client) {
+    CloseWindow();
+    free(client);
+}
+
+void c_close(DroneDelivery *env) {
+    if (env->client != NULL) {
+        c_close_client(env->client);
+    }
+}
+
+static void update_camera_position(Client *c) {
+    float r = c->camera_distance;
+    float az = c->camera_azimuth;
+    float el = c->camera_elevation;
+
+    float x = r * cosf(el) * cosf(az);
+    float y = r * cosf(el) * sinf(az);
+    float z = r * sinf(el);
+
+    c->camera.position = (Vector3){x, y, z};
+    c->camera.target = (Vector3){0, 0, 0};
+}
+
+void handle_camera_controls(Client *client) {
+    Vector2 mouse_pos = GetMousePosition();
+
+    if (IsMouseButtonPressed(MOUSE_BUTTON_LEFT)) {
+        client->is_dragging = true;
+        client->last_mouse_pos = mouse_pos;
+    }
+
+    if (IsMouseButtonReleased(MOUSE_BUTTON_LEFT)) {
+        client->is_dragging = false;
+    }
+
+    if (client->is_dragging && IsMouseButtonDown(MOUSE_BUTTON_LEFT)) {
+        Vector2 mouse_delta = {mouse_pos.x - client->last_mouse_pos.x,
+                               mouse_pos.y - client->last_mouse_pos.y};
+
+        float sensitivity = 0.005f;
+
+        client->camera_azimuth -= mouse_delta.x * sensitivity;
+
+        client->camera_elevation += mouse_delta.y * sensitivity;
+        client->camera_elevation =
+            clampf(client->camera_elevation, -PI / 2.0f + 0.1f, PI / 2.0f - 0.1f);
+
+        client->last_mouse_pos = mouse_pos;
+
+        update_camera_position(client);
+    }
+
+    float wheel = GetMouseWheelMove();
+    if (wheel != 0) {
+        client->camera_distance -= wheel * 2.0f;
+        client->camera_distance = clampf(client->camera_distance, 5.0f, 50.0f);
+        update_camera_position(client);
+    }
+}
+
+Client *make_client(DroneDelivery *env) {
+    Client *client = (Client *)calloc(1, sizeof(Client));
+
+    client->width = WIDTH;
+    client->height = HEIGHT;
+
+    SetConfigFlags(FLAG_MSAA_4X_HINT); // antialiasing
+    InitWindow(WIDTH, HEIGHT, "PufferLib DroneDelivery");
+
+#ifndef __EMSCRIPTEN__
+    SetTargetFPS(60);
+#endif
+
+    if (!IsWindowReady()) {
+        TraceLog(LOG_ERROR, "Window failed to initialize\n");
+        free(client);
+        return NULL;
+    }
+
+    client->camera_distance = 40.0f;
+    client->camera_azimuth = 0.0f;
+    client->camera_elevation = PI / 10.0f;
+    client->is_dragging = false;
+    client->last_mouse_pos = (Vector2){0.0f, 0.0f};
+
+    client->camera.up = (Vector3){0.0f, 0.0f, 1.0f};
+    client->camera.fovy = 45.0f;
+    client->camera.projection = CAMERA_PERSPECTIVE;
+
+    update_camera_position(client);
+
+    // Initialize trail buffer
+    client->trails = (Trail*)calloc(env->num_agents, sizeof(Trail));
+    for (int i = 0; i < env->num_agents; i++) {
+        Trail* trail = &client->trails[i];
+        trail->index = 0;
+        trail->count = 0;
+        for (int j = 0; j < TRAIL_LENGTH; j++) {
+            trail->pos[j] = env->agents[i].state.pos;
+        }
+    }
+
+    return client;
+}
+
+const Color PUFF_RED = (Color){187, 0, 0, 255};
+const Color PUFF_CYAN = (Color){0, 187, 187, 255};
+const Color PUFF_WHITE = (Color){241, 241, 241, 241};
+const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255};
+
+void c_render(DroneDelivery *env) {
+    if (env->client == NULL) {
+        env->client = make_client(env);
+        if (env->client == NULL) {
+            TraceLog(LOG_ERROR, "Failed to initialize client for rendering\n");
+            return;
+        }
+    }
+    env->render = true;
+    env->grip_k_max = 1.0f;
+    env->box_k_max = 1.0f;
+    env->box_k_min = 1.0f;
+    env->box_k = 1.0f;
+    env->reward_dist = env->reward_min_dist;
+    if (WindowShouldClose()) {
+        c_close(env);
+        exit(0);
+    }
+
+    if (IsKeyDown(KEY_ESCAPE)) {
+        c_close(env);
+        exit(0);
+    }
+
+    handle_camera_controls(env->client);
+
+    Client *client = env->client;
+
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+        Trail *trail = &client->trails[i];
+        trail->pos[trail->index] = agent->state.pos;
+        trail->index = (trail->index + 1) % TRAIL_LENGTH;
+        if (trail->count < TRAIL_LENGTH) {
+            trail->count++;
+        }
+        if (env->terminals[i]) {
+            trail->index = 0;
+            trail->count = 0;
+        }
+    }
+
+    BeginDrawing();
+    ClearBackground(PUFF_BACKGROUND);
+
+    BeginMode3D(client->camera);
+
+    // draws bounding cube
+    DrawCubeWires((Vector3){0.0f, 0.0f, 0.0f}, GRID_X * 2.0f,
+                  GRID_Y * 2.0f, GRID_Z * 2.0f, WHITE);
+
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+
+        // draws drone body
+        DrawSphere((Vector3){agent->state.pos.x, agent->state.pos.y, agent->state.pos.z}, 0.3f, agent->color);
+
+        // draws rotors according to thrust
+        float T[4];
+        for (int j = 0; j < 4; j++) {
+            float rpm = (env->actions[4*i + j] + 1.0f) * 0.5f * agent->params.max_rpm;
+            T[j] = agent->params.k_thrust * rpm * rpm;
+        }
+
+        const float rotor_radius = 0.15f;
+        const float visual_arm_len = agent->params.arm_len * 4.0f;
+
+        Vec3 rotor_offsets_body[4] = {{+visual_arm_len, 0.0f, 0.0f},
+                                      {-visual_arm_len, 0.0f, 0.0f},
+                                      {0.0f, +visual_arm_len, 0.0f},
+                                      {0.0f, -visual_arm_len, 0.0f}};
+
+        //Color base_colors[4] = {ORANGE, PURPLE, LIME, SKYBLUE};
+        Color base_colors[4] = {agent->color, agent->color, agent->color, agent->color};
+
+        for (int j = 0; j < 4; j++) {
+            Vec3 world_off = quat_rotate(agent->state.quat, rotor_offsets_body[j]);
+
+            Vector3 rotor_pos = {agent->state.pos.x + world_off.x, agent->state.pos.y + world_off.y,
+                                 agent->state.pos.z + world_off.z};
+
+            float rpm = (env->actions[4*i + j] + 1.0f) * 0.5f * agent->params.max_rpm;
+            float intensity = 0.75f + 0.25f * (rpm / agent->params.max_rpm);
+
+            Color rotor_color = (Color){(unsigned char)(base_colors[j].r * intensity),
+                                        (unsigned char)(base_colors[j].g * intensity),
+                                        (unsigned char)(base_colors[j].b * intensity), 255};
+
+            DrawSphere(rotor_pos, rotor_radius, rotor_color);
+
+            DrawCylinderEx((Vector3){agent->state.pos.x, agent->state.pos.y, agent->state.pos.z}, rotor_pos, 0.02f, 0.02f, 8,
+                           BLACK);
+        }
+
+        // draws line with direction and magnitude of velocity / 10
+        if (norm3(agent->state.vel) > 0.1f) {
+            DrawLine3D((Vector3){agent->state.pos.x, agent->state.pos.y, agent->state.pos.z},
+                       (Vector3){agent->state.pos.x + agent->state.vel.x * 0.1f, agent->state.pos.y + agent->state.vel.y * 0.1f,
+                                 agent->state.pos.z + agent->state.vel.z * 0.1f},
+                       MAGENTA);
+        }
+
+        // Draw trailing path
+        Trail *trail = &client->trails[i];
+        if (trail->count <= 2) {
+            continue;
+        }
+        for (int j = 0; j < trail->count - 1; j++) {
+            int idx0 = (trail->index - j - 1 + TRAIL_LENGTH) % TRAIL_LENGTH;
+            int idx1 = (trail->index - j - 2 + TRAIL_LENGTH) % TRAIL_LENGTH;
+            float alpha = (float)(TRAIL_LENGTH - j) / (float)trail->count * 0.8f; // fade out
+            Color trail_color = ColorAlpha((Color){0, 187, 187, 255}, alpha);
+            DrawLine3D((Vector3){trail->pos[idx0].x, trail->pos[idx0].y, trail->pos[idx0].z},
+                       (Vector3){trail->pos[idx1].x, trail->pos[idx1].y, trail->pos[idx1].z},
+                       trail_color);
+        }
+    }
+
+    for (int i = 0; i < env->num_agents; i++) {
+        Drone *agent = &env->agents[i];
+        Vec3 render_pos = agent->box_pos;
+        DrawCube((Vector3){render_pos.x, render_pos.y, render_pos.z}, agent->box_size, agent->box_size, agent->box_size, BROWN);
+        DrawCube((Vector3){agent->drop_pos.x, agent->drop_pos.y, agent->drop_pos.z}, 0.5f, 0.5f, 0.1f, YELLOW);
+        //DrawSphere((Vector3){agent->hidden_pos.x, agent->hidden_pos.y, agent->hidden_pos.z}, 0.2f, YELLOW);
+    }
+
+    if (IsKeyDown(KEY_TAB)) {
+        for (int i = 0; i < env->num_agents; i++) {
+            Drone *agent = &env->agents[i];
+            Vec3 target_pos = agent->target_pos;
+            DrawSphere((Vector3){target_pos.x, target_pos.y, target_pos.z}, 0.45f, (Color){0, 255, 255, 100});
+        }
+    }
+
+    EndMode3D();
+
+    DrawText("Left click + drag: Rotate camera", 10, 10, 16, PUFF_WHITE);
+    DrawText("Mouse wheel: Zoom in/out", 10, 30, 16, PUFF_WHITE);
+    DrawText(TextFormat("Tick = %d", env->tick), 10, 50, 16, PUFF_WHITE);
+    DrawText(TextFormat("grip_k = %.3f", env->grip_k), 10, 70, 16, PUFF_WHITE);
+    DrawText(TextFormat("box_k = %.3f", env->box_k), 10, 90, 16, PUFF_WHITE);
+    DrawText(TextFormat("eps = %d", env->episode_num), 10, 110, 16, PUFF_WHITE);
+
+    EndDrawing();
+}
diff --git a/pufferlib/ocean/drone_delivery/drone_delivery.py b/pufferlib/ocean/drone_delivery/drone_delivery.py
new file mode 100644
index 000000000..8b5dec3ce
--- /dev/null
+++ b/pufferlib/ocean/drone_delivery/drone_delivery.py
@@ -0,0 +1,144 @@
+import numpy as np
+import gymnasium
+
+import pufferlib
+from pufferlib.ocean.drone_delivery import binding
+
+class DroneDelivery(pufferlib.PufferEnv):
+    def __init__(
+        self,
+        num_envs=16,
+        num_drones=64,
+
+        ablation=0.0,
+        anneal_min=0.5,
+
+        grip_k_max=17.105300970916993,
+
+        perfect_anneal=150000000,
+        perfect_deadline=150000000,
+
+        pos_const=0.7108647198043252,
+        pos_penalty=0.0009629600280207475,
+
+        reward_grip=0.9999,
+        reward_ho_drop=0.20890470430909128,
+        reward_hover=0.1603521816569735,
+
+        reward_max_dist=65.0,
+        reward_min_dist=0.9330297552248776,
+
+        vel_penalty_clamp=0.25,
+
+        w_approach=2.2462377881752698,
+        w_position=0.7312628607193232,
+        w_stability=1.6094417286807845,
+        w_velocity=0.0009999999999999731,
+
+        render_mode=None,
+        report_interval=1024,
+        buf=None,
+        seed=0,
+    ):
+        self.single_observation_space = gymnasium.spaces.Box(
+            low=-1,
+            high=1,
+            shape=(45,),
+            dtype=np.float32,
+        )
+
+        self.single_action_space = gymnasium.spaces.Box(
+            low=-1, high=1, shape=(4,), dtype=np.float32
+        )
+
+        self.num_agents = num_envs * num_drones
+        self.render_mode = render_mode
+        self.report_interval = report_interval
+        self.tick = 0
+
+        super().__init__(buf)
+        self.actions = self.actions.astype(np.float32)
+
+        c_envs = []
+        for i in range(num_envs):
+            c_envs.append(binding.env_init(
+                self.observations[i*num_drones:(i+1)*num_drones],
+                self.actions[i*num_drones:(i+1)*num_drones],
+                self.rewards[i*num_drones:(i+1)*num_drones],
+                self.terminals[i*num_drones:(i+1)*num_drones],
+                self.truncations[i*num_drones:(i+1)*num_drones],
+                i,
+                num_agents=num_drones,
+
+                ablation=ablation,
+                anneal_min=anneal_min,
+
+                grip_k_max=grip_k_max,
+
+                num_envs=num_envs,
+                perfect_anneal=perfect_anneal,
+                perfect_deadline=perfect_deadline,
+
+                pos_const=pos_const,
+                pos_penalty=pos_penalty,
+
+                reward_grip=reward_grip,
+                reward_ho_drop=reward_ho_drop,
+                reward_hover=reward_hover,
+
+                reward_max_dist=reward_max_dist,
+                reward_min_dist=reward_min_dist,
+
+                vel_penalty_clamp=vel_penalty_clamp,
+
+                w_approach=w_approach,
+                w_position=w_position,
+                w_stability=w_stability,
+                w_velocity=w_velocity
+            ))
+
+        self.c_envs = binding.vectorize(*c_envs)
+
+    def reset(self, seed=None):
+        self.tick = 0
+        binding.vec_reset(self.c_envs, seed)
+        return self.observations, []
+
+    def step(self, actions):
+        self.actions[:] = actions
+
+        self.tick += 1
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.report_interval == 0:
+            log_data = binding.vec_log(self.c_envs)
+            if log_data:
+                info.append(log_data)
+
+        return (self.observations, self.rewards, self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
1 + + print(f"SPS: {env.num_agents * tick / (time.time() - start)}") + +if __name__ == "__main__": + test_performance() diff --git a/pufferlib/ocean/drone_delivery/dronelib_delivery.h b/pufferlib/ocean/drone_delivery/dronelib_delivery.h new file mode 100644 index 000000000..378280884 --- /dev/null +++ b/pufferlib/ocean/drone_delivery/dronelib_delivery.h @@ -0,0 +1,507 @@ +// Originally made by Sam Turner and Finlay Sanders, 2025. +// Included in pufferlib under the original project's MIT license. +// https://github.com/stmio/drone + +// Some changes made by Keith (Kinvert) for hackathon -> drone_delivery + +#include +#include +#include +#include +#include +#include +#include + +#include "raylib.h" + +// Visualisation properties +#define WIDTH 1080 +#define HEIGHT 720 +#define TRAIL_LENGTH 50 +#define HORIZON 1024 + +// Physical constants for the drone +#define BASE_MASS 1.0f // kg +#define BASE_IXX 0.01f // kgm^2 +#define BASE_IYY 0.01f // kgm^2 +#define BASE_IZZ 0.02f // kgm^2 +#define BASE_ARM_LEN 0.1f // m +#define BASE_K_THRUST 3e-5f // thrust coefficient +#define BASE_K_ANG_DAMP 0.2f // angular damping coefficient +#define BASE_K_DRAG 1e-6f // drag (torque) coefficient +#define BASE_B_DRAG 0.1f // linear drag coefficient +#define BASE_GRAVITY 9.81f // m/s^2 +#define BASE_MAX_RPM 750.0f // rad/s +#define BASE_MAX_VEL 50.0f // m/s +#define BASE_MAX_OMEGA 6.3f // rad/s +#define BASE_K_MOT 0.1f // s (Motor lag constant) +#define BASE_J_MOT 1e-5f // kgm^2 (Motor rotational inertia) + +// Simulation properties +#define GRID_X 30.0f +#define GRID_Y 30.0f +#define GRID_Z 10.0f +#define INV_GRID_X 0.33333f +#define INV_GRID_Y 0.33333f +#define INV_GRID_Z 0.1f + +#define MARGIN_X (GRID_X - 1) +#define MARGIN_Y (GRID_Y - 1) +#define MARGIN_Z (GRID_Z - 1) +#define V_TARGET 0.05f +#define DT 0.05f +#define DT_RNG 0.0f +#define MAX_THRUST_TO_MASS_RATION 0.5f + +// Corner to corner distance +#define MAX_DIST sqrtf((2*GRID_X)*(2*GRID_X) + (2*GRID_Y)*(2*GRID_Y) + (2*GRID_Z)*(2*GRID_Z)) + +#define BASE_BOX_DENSITY 50.0f + +typedef struct Log Log; +struct Log { + float episode_return; + float episode_length; + float collision_rate; + float oob; + float timeout; + float score; + float perf; + float gripping; + float perfect_grip; + float delivered; + float perfect_deliv; + float perfect_now; + float to_pickup; + float ho_pickup; + float de_pickup; + float to_drop; + float ho_drop; + float de_drop; + float dist; + float dist100; + float jitter; + float episode_num; + float episode_gain; + float tick; + float anneal; + float n; +}; + +typedef struct { + float w, x, y, z; +} Quat; + +typedef struct { + float x, y, z; +} Vec3; + +static inline float clampf(float v, float min, float max) { + if (v < min) + return min; + if (v > max) + return max; + return v; +} + +static inline float rndf(float a, float b) { + return a + ((float)rand() / (float)RAND_MAX) * (b - a); +} + +static inline Vec3 add3(Vec3 a, Vec3 b) { return (Vec3){a.x + b.x, a.y + b.y, a.z + b.z}; } + +static inline Quat add_quat(Quat a, Quat b) { return (Quat){a.w + b.w, a.x + b.x, a.y + b.y, a.z + b.z}; } + +static inline Vec3 sub3(Vec3 a, Vec3 b) { return (Vec3){a.x - b.x, a.y - b.y, a.z - b.z}; } + +static inline Vec3 scalmul3(Vec3 a, float b) { return (Vec3){a.x * b, a.y * b, a.z * b}; } + +static inline Quat scalmul_quat(Quat a, float b) { return (Quat){a.w * b, a.x * b, a.y * b, a.z * b}; } + +static inline float dot3(Vec3 a, Vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } + +static inline float norm3(Vec3 a) { return sqrtf(dot3(a, 
a)); } + +static inline void clamp3(Vec3 *vec, float min, float max) { + vec->x = clampf(vec->x, min, max); + vec->y = clampf(vec->y, min, max); + vec->z = clampf(vec->z, min, max); +} + +static inline void clamp4(float a[4], float min, float max) { + a[0] = clampf(a[0], min, max); + a[1] = clampf(a[1], min, max); + a[2] = clampf(a[2], min, max); + a[3] = clampf(a[3], min, max); +} + +static inline Quat quat_mul(Quat q1, Quat q2) { + Quat out; + out.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; + out.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y; + out.y = q1.w * q2.y - q1.x * q2.z + q1.y * q2.w + q1.z * q2.x; + out.z = q1.w * q2.z + q1.x * q2.y - q1.y * q2.x + q1.z * q2.w; + return out; +} + +static inline void quat_normalize(Quat *q) { + float n = sqrtf(q->w * q->w + q->x * q->x + q->y * q->y + q->z * q->z); + if (n > 0.0f) { + q->w /= n; + q->x /= n; + q->y /= n; + q->z /= n; + } +} + +static inline Vec3 quat_rotate(Quat q, Vec3 v) { + Quat qv = {0.0f, v.x, v.y, v.z}; + Quat tmp = quat_mul(q, qv); + Quat q_conj = {q.w, -q.x, -q.y, -q.z}; + Quat res = quat_mul(tmp, q_conj); + return (Vec3){res.x, res.y, res.z}; +} + +static inline Quat quat_inverse(Quat q) { return (Quat){q.w, -q.x, -q.y, -q.z}; } + +Quat rndquat() { + float u1 = rndf(0.0f, 1.0f); + float u2 = rndf(0.0f, 1.0f); + float u3 = rndf(0.0f, 1.0f); + + float sqrt_1_minus_u1 = sqrtf(1.0f - u1); + float sqrt_u1 = sqrtf(u1); + + float pi_2_u2 = 2.0f * M_PI * u2; + float pi_2_u3 = 2.0f * M_PI * u3; + + Quat q; + q.w = sqrt_1_minus_u1 * sinf(pi_2_u2); + q.x = sqrt_1_minus_u1 * cosf(pi_2_u2); + q.y = sqrt_u1 * sinf(pi_2_u3); + q.z = sqrt_u1 * cosf(pi_2_u3); + + return q; +} + +typedef struct { + Vec3 pos[TRAIL_LENGTH]; + int index; + int count; +} Trail; + +typedef struct { + Vec3 pos; // global position (x, y, z) + Vec3 vel; // linear velocity (u, v, w) + Quat quat; // roll/pitch/yaw (phi/theta/psi) as a quaternion + Vec3 omega; // angular velocity (p, q, r) + float rpms[4]; // motor RPMs +} State; + +typedef struct { + Vec3 vel; // Derivative of position + Vec3 v_dot; // Derivative of velocity + Quat q_dot; // Derivative of quaternion + Vec3 w_dot; // Derivative of angular velocity + float rpm_dot[4]; // Derivative of motor RPMs +} StateDerivative; + +typedef struct { + // Physical properties. Modeled as part of the drone + // to make domain randomization easier. 
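+ // Populated in init_drone(): BASE_* constants are scaled by arm length (m ~ L^3, I ~ m*L^2, rpm ~ 1/L) and jittered by the domain randomization factor dr.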
+ float mass; // kg + float ixx; // kgm^2 + float iyy; // kgm^2 + float izz; // kgm^2 + float arm_len; // m + float k_thrust; // thrust coefficient + float k_ang_damp; // angular damping coefficient + float k_drag; // drag (torque) coefficient + float b_drag; // linear drag coefficient + float gravity; // m/s^2 + float max_rpm; // rad/s + float inv_max_rpm; // s/rad + float max_vel; // m/s + float inv_max_vel; // s/m + float max_omega; // rad/s + float inv_max_omega; // s/rad + float k_mot; // s + float j_mot; // kgm^2 +} Params; + +typedef struct { + // core state and parameters + State state; + Params params; + + // helpers for swarm logic + Vec3 spawn_pos; + Vec3 prev_pos; + Vec3 target_pos; + Vec3 target_vel; + Vec3 hidden_pos; + Vec3 hidden_vel; + + // PP Pick and Place + Vec3 box_pos; + Vec3 box_vel; + Vec3 drop_pos; + bool gripping; + bool perfect_grip; + bool delivered; + bool perfect_deliv; + float perfect_deliveries; + bool perfect_now; + bool has_delivered; + float grip_height; + bool approaching_pickup; + bool hovering_pickup; + bool descent_pickup; + bool approaching_drop; + bool hovering_drop; + bool descent_drop; + float hover_timer; + Color color; + + // logging utils + float last_abs_reward; + float last_collision_reward; + float last_position_reward; + float last_velocity_reward; + float last_stability_reward; + float last_approach_reward; + float episode_return; + float collisions; + int episode_length; + float score; + float jitter; + + float base_mass; + float base_ixx; + float base_iyy; + float base_izz; + float base_k_drag; + float base_b_drag; + float box_size; + float box_base_mass; + float box_mass; + float box_mass_max; + bool box_physics_on; +} Drone; + + +void init_drone(Drone* drone, float size, float dr) { + drone->params.arm_len = size / 2.0f; + + // m ~ x^3 + float mass_scale = powf(drone->params.arm_len, 3.0f) / powf(BASE_ARM_LEN, 3.0f); + drone->params.mass = BASE_MASS * mass_scale * rndf(1.0f - dr, 1.0f + dr); + + // I ~ mx^2 + float base_Iscale = BASE_MASS * BASE_ARM_LEN * BASE_ARM_LEN; + float I_scale = drone->params.mass * powf(drone->params.arm_len, 2.0f) / base_Iscale; + drone->params.ixx = BASE_IXX * I_scale * rndf(1.0f - dr, 1.0f + dr); + drone->params.iyy = BASE_IYY * I_scale * rndf(1.0f - dr, 1.0f + dr); + drone->params.izz = BASE_IZZ * I_scale * rndf(1.0f - dr, 1.0f + dr); + + // k_thrust ~ m/l + float k_thrust_scale = (drone->params.mass * drone->params.arm_len) / (BASE_MASS * BASE_ARM_LEN); + drone->params.k_thrust = BASE_K_THRUST * k_thrust_scale * rndf(1.0f - dr, 1.0f + dr); + + // k_ang_damp ~ I + float base_avg_inertia = (BASE_IXX + BASE_IYY + BASE_IZZ) / 3.0f; + float avg_inertia = (drone->params.ixx + drone->params.iyy + drone->params.izz) / 3.0f; + float avg_inertia_scale = avg_inertia / base_avg_inertia; + drone->params.k_ang_damp = BASE_K_ANG_DAMP * avg_inertia_scale * rndf(1.0f - dr, 1.0f + dr); + + // drag ~ x^2 + float drag_scale = powf(drone->params.arm_len, 2.0f) / powf(BASE_ARM_LEN, 2.0f); + drone->params.k_drag = BASE_K_DRAG * drag_scale * rndf(1.0f - dr, 1.0f + dr); + drone->params.b_drag = BASE_B_DRAG * drag_scale * rndf(1.0f - dr, 1.0f + dr); + + // Small gravity randomization + drone->params.gravity = BASE_GRAVITY * rndf(0.99f, 1.01f); + + // RPM ~ 1/x + float rpm_scale = (BASE_ARM_LEN) / (drone->params.arm_len); + drone->params.max_rpm = BASE_MAX_RPM * rpm_scale * rndf(1.0f - dr, 1.0f + dr); + float max_thrust = 4.0f * drone->params.k_thrust * powf(drone->params.max_rpm, 2.0f); + float max_total_mass = 
(MAX_THRUST_TO_MASS_RATION * max_thrust) / drone->params.gravity; + while (max_thrust < 2.0f * drone->params.mass * drone->params.gravity) { + drone->params.max_rpm = drone->params.max_rpm * 2.0f; + max_thrust = 4.0f * drone->params.k_thrust * powf(drone->params.max_rpm, 2.0f); + max_total_mass = (MAX_THRUST_TO_MASS_RATION * max_thrust) / drone->params.gravity; + } + if (drone->params.max_rpm < 0.001) drone->params.max_rpm += 0.001; + drone->params.inv_max_rpm = 1.0f / drone->params.max_rpm; + + drone->params.max_vel = BASE_MAX_VEL; + if (drone->params.max_vel < 0.001) drone->params.max_vel += 0.001; + drone->params.inv_max_vel = 1.0f / drone->params.max_vel; + drone->params.max_omega = BASE_MAX_OMEGA; + if (drone->params.max_omega < 0.001) drone->params.max_omega += 0.001; + drone->params.inv_max_omega = 1.0f / drone->params.max_omega; + + drone->box_mass_max = max_total_mass - drone->params.mass; + + drone->params.k_mot = BASE_K_MOT * rndf(1.0f - dr, 1.0f + dr); + drone->params.j_mot = BASE_J_MOT * I_scale * rndf(1.0f - dr, 1.0f + dr); + + for (int i = 0; i < 4; i++) { + drone->state.rpms[i] = 0.0f; + } + drone->state.pos = (Vec3){0.0f, 0.0f, 0.0f}; + drone->prev_pos = drone->state.pos; + drone->state.vel = (Vec3){0.0f, 0.0f, 0.0f}; + drone->state.omega = (Vec3){0.0f, 0.0f, 0.0f}; + drone->state.quat = (Quat){1.0f, 0.0f, 0.0f, 0.0f}; +} + +void compute_derivatives(State* state, Params* params, float* actions, StateDerivative* derivatives) { + // first order rpm lag + float target_rpms[4]; + for (int i = 0; i < 4; i++) { + target_rpms[i] = (actions[i] + 1.0f) * 0.5f * params->max_rpm; + } + + // rpm rates + float rpm_dot[4]; + for (int i = 0; i < 4; i++) { + rpm_dot[i] = (1.0f / params->k_mot) * (target_rpms[i] - state->rpms[i]); + } + + // motor thrusts + float T[4]; + for (int i = 0; i < 4; i++) { + T[i] = params->k_thrust * powf(state->rpms[i], 2.0f); + } + + // body frame net force + Vec3 F_prop_body = {0.0f, 0.0f, T[0] + T[1] + T[2] + T[3]}; + + // body frame force -> world frame force + Vec3 F_prop = quat_rotate(state->quat, F_prop_body); + + // world frame linear drag + Vec3 F_aero; + F_aero.x = -params->b_drag * state->vel.x; + F_aero.y = -params->b_drag * state->vel.y; + F_aero.z = -params->b_drag * state->vel.z; + + // velocity rates, a = F/m + Vec3 v_dot; + v_dot.x = (F_prop.x + F_aero.x) / params->mass; + v_dot.y = (F_prop.y + F_aero.y) / params->mass; + v_dot.z = ((F_prop.z + F_aero.z) / params->mass) - params->gravity; + + // quaternion rates + Quat omega_q = {0.0f, state->omega.x, state->omega.y, state->omega.z}; + Quat q_dot = quat_mul(state->quat, omega_q); + q_dot.w *= 0.5f; + q_dot.x *= 0.5f; + q_dot.y *= 0.5f; + q_dot.z *= 0.5f; + + // body frame torques + Vec3 Tau_prop; + Tau_prop.x = params->arm_len*(T[1] - T[3]); + Tau_prop.y = params->arm_len*(T[2] - T[0]); + Tau_prop.z = params->k_drag*(T[0] - T[1] + T[2] - T[3]); + + // torque from changing motor speeds + float Tau_mot_z = params->j_mot * (rpm_dot[0] - rpm_dot[1] + rpm_dot[2] - rpm_dot[3]); + + // torque from angular damping + Vec3 Tau_aero; + Tau_aero.x = -params->k_ang_damp * state->omega.x; + Tau_aero.y = -params->k_ang_damp * state->omega.y; + Tau_aero.z = -params->k_ang_damp * state->omega.z; + + // gyroscopic torque + Vec3 Tau_iner; + Tau_iner.x = (params->iyy - params->izz) * state->omega.y * state->omega.z; + Tau_iner.y = (params->izz - params->ixx) * state->omega.z * state->omega.x; + Tau_iner.z = (params->ixx - params->iyy) * state->omega.x * state->omega.y; + + // angular velocity rates + Vec3 w_dot; + 
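// Euler's rigid-body equations per axis: angular acceleration is net torque over inertia, with the gyroscopic Tau_iner term above supplying -omega x (I omega) +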
w_dot.x = (Tau_prop.x + Tau_aero.x + Tau_iner.x) / params->ixx; + w_dot.y = (Tau_prop.y + Tau_aero.y + Tau_iner.y) / params->iyy; + w_dot.z = (Tau_prop.z + Tau_aero.z + Tau_iner.z + Tau_mot_z) / params->izz; + + derivatives->vel = state->vel; + derivatives->v_dot = v_dot; + derivatives->q_dot = q_dot; + derivatives->w_dot = w_dot; + for (int i = 0; i < 4; i++) { + derivatives->rpm_dot[i] = rpm_dot[i]; + } +} + +static void step(State* initial, StateDerivative* deriv, float dt, State* output) { + output->pos = add3(initial->pos, scalmul3(deriv->vel, dt)); + output->vel = add3(initial->vel, scalmul3(deriv->v_dot, dt)); + output->quat = add_quat(initial->quat, scalmul_quat(deriv->q_dot, dt)); + output->omega = add3(initial->omega, scalmul3(deriv->w_dot, dt)); + for (int i = 0; i < 4; i++) { + output->rpms[i] = initial->rpms[i] + deriv->rpm_dot[i] * dt; + } + quat_normalize(&output->quat); +} + +void rk4_step(State* state, Params* params, float* actions, float dt) { + StateDerivative k1, k2, k3, k4; + State temp_state; + + compute_derivatives(state, params, actions, &k1); + + step(state, &k1, dt * 0.5f, &temp_state); + compute_derivatives(&temp_state, params, actions, &k2); + + step(state, &k2, dt * 0.5f, &temp_state); + compute_derivatives(&temp_state, params, actions, &k3); + + step(state, &k3, dt, &temp_state); + compute_derivatives(&temp_state, params, actions, &k4); + + float dt_6 = dt / 6.0f; + + state->pos.x += (k1.vel.x + 2.0f * k2.vel.x + 2.0f * k3.vel.x + k4.vel.x) * dt_6; + state->pos.y += (k1.vel.y + 2.0f * k2.vel.y + 2.0f * k3.vel.y + k4.vel.y) * dt_6; + state->pos.z += (k1.vel.z + 2.0f * k2.vel.z + 2.0f * k3.vel.z + k4.vel.z) * dt_6; + + state->vel.x += (k1.v_dot.x + 2.0f * k2.v_dot.x + 2.0f * k3.v_dot.x + k4.v_dot.x) * dt_6; + state->vel.y += (k1.v_dot.y + 2.0f * k2.v_dot.y + 2.0f * k3.v_dot.y + k4.v_dot.y) * dt_6; + state->vel.z += (k1.v_dot.z + 2.0f * k2.v_dot.z + 2.0f * k3.v_dot.z + k4.v_dot.z) * dt_6; + + state->quat.w += (k1.q_dot.w + 2.0f * k2.q_dot.w + 2.0f * k3.q_dot.w + k4.q_dot.w) * dt_6; + state->quat.x += (k1.q_dot.x + 2.0f * k2.q_dot.x + 2.0f * k3.q_dot.x + k4.q_dot.x) * dt_6; + state->quat.y += (k1.q_dot.y + 2.0f * k2.q_dot.y + 2.0f * k3.q_dot.y + k4.q_dot.y) * dt_6; + state->quat.z += (k1.q_dot.z + 2.0f * k2.q_dot.z + 2.0f * k3.q_dot.z + k4.q_dot.z) * dt_6; + + state->omega.x += (k1.w_dot.x + 2.0f * k2.w_dot.x + 2.0f * k3.w_dot.x + k4.w_dot.x) * dt_6; + state->omega.y += (k1.w_dot.y + 2.0f * k2.w_dot.y + 2.0f * k3.w_dot.y + k4.w_dot.y) * dt_6; + state->omega.z += (k1.w_dot.z + 2.0f * k2.w_dot.z + 2.0f * k3.w_dot.z + k4.w_dot.z) * dt_6; + + for (int i = 0; i < 4; i++) { + state->rpms[i] += (k1.rpm_dot[i] + 2.0f * k2.rpm_dot[i] + 2.0f * k3.rpm_dot[i] + k4.rpm_dot[i]) * dt_6; + } + + quat_normalize(&state->quat); +} + +void move_drone(Drone* drone, float* actions) { + // clamp actions + clamp4(actions, -1.0f, 1.0f); + + // Domain randomized dt + float dt = DT * rndf(1.0f - DT_RNG, 1.0 + DT_RNG); + + // update drone state + drone->prev_pos = drone->state.pos; + rk4_step(&drone->state, &drone->params, actions, dt); + + // clamp and normalise for observations + clamp3(&drone->state.vel, -drone->params.max_vel, drone->params.max_vel); + clamp3(&drone->state.omega, -drone->params.max_omega, drone->params.max_omega); +} diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index ed1408ba6..c8c58bfff 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -131,6 +131,7 @@ def make_multiagent(buf=None, **kwargs): 'matsci': 
'Matsci', 'memory': 'Memory', 'boids': 'Boids', + 'drone_delivery': 'DroneDelivery', 'drone_race': 'DroneRace', 'drone_swarm': 'DroneSwarm', 'nmmo3': 'NMMO3', diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 63dc77a51..2746fb2c4 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -1,8 +1,7 @@ -## puffer [train | eval | sweep] [env_name] [optional args] -- See https://puffer.ai for full detail0 +# puffer [train | eval | sweep] [env_name] [optional args] -- See https://puffer.ai for full details # This is the same as python -m pufferlib.pufferl [train | eval | sweep] [env_name] [optional args] # Distributed example: torchrun --standalone --nnodes=1 --nproc-per-node=6 -m pufferlib.pufferl train puffer_nmmo3 -import contextlib import warnings warnings.filterwarnings('error', category=RuntimeWarning) @@ -136,9 +135,7 @@ def __init__(self, config, vecenv, policy, logger=None): self.uncompiled_policy = policy self.policy = policy if config['compile']: - self.policy = torch.compile(policy, mode=config['compile_mode']) - self.policy.forward_eval = torch.compile(policy, mode=config['compile_mode']) - pufferlib.pytorch.sample_logits = torch.compile(pufferlib.pytorch.sample_logits, mode=config['compile_mode']) + self.policy = torch.compile(policy, mode=config['compile_mode'], fullgraph=config['compile_fullgraph']) # Optimizer if config['optimizer'] == 'adam': @@ -176,9 +173,7 @@ def __init__(self, config, vecenv, policy, logger=None): # Automatic mixed precision precision = config['precision'] - self.amp_context = contextlib.nullcontext() - if config.get('amp', True) and config['device'] == 'cuda': - self.amp_context = torch.amp.autocast(device_type='cuda', dtype=getattr(torch, precision)) + self.amp_context = torch.amp.autocast(device_type='cuda', dtype=getattr(torch, precision)) if precision not in ('float32', 'bfloat16'): raise pufferlib.APIUsageError(f'Invalid precision: {precision}: use float32 or bfloat16') @@ -222,8 +217,8 @@ def evaluate(self): if config['use_rnn']: for k in self.lstm_h: - self.lstm_h[k] = torch.zeros(self.lstm_h[k].shape, device=device) - self.lstm_c[k] = torch.zeros(self.lstm_c[k].shape, device=device) + self.lstm_h[k].zero_() + self.lstm_c[k].zero_() self.full_rows = 0 while self.full_rows < self.segments: @@ -418,12 +413,43 @@ def train(self): # Learn on accumulated minibatches profile('learn', epoch) + + # Check for NaN in loss before backward + if torch.isnan(loss): + print("Warning: NaN loss detected, skipping backward pass") + self.optimizer.zero_grad() + # Reduce learning rate on NaN + for g in self.optimizer.param_groups: + g['lr'] *= 0.5 + print(f"Reduced learning rate to {self.optimizer.param_groups[0]['lr']}") + continue + loss.backward() + + # Check for NaN in gradients and replace them with zeros + for name, param in self.policy.named_parameters(): + if param.grad is not None and torch.isnan(param.grad).any(): + print(f"Warning: NaN gradient in {name}") + param.grad = torch.nan_to_num(param.grad, nan=0.0) + if (mb + 1) % self.accumulate_minibatches == 0: + + # More aggressive gradient clipping for continuous actions + policy_obj = self.policy.policy if hasattr(self.policy, 'policy') else self.policy + if hasattr(policy_obj, 'is_continuous') and policy_obj.is_continuous: + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), min(config['max_grad_norm'], 0.5)) + else: + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), config['max_grad_norm'])
self.optimizer.step() self.optimizer.zero_grad() + # Check for NaN in weights after update + policy_obj = self.policy.policy if hasattr(self.policy, 'policy') else self.policy + if hasattr(policy_obj, 'check_and_fix_weights'): + if policy_obj.check_and_fix_weights(): + print("Warning: NaN in weights after optimizer step, weights reinitialized") + # Reprioritize experience profile('train_misc', epoch) if config['anneal_lr']: @@ -496,7 +525,7 @@ def close(self): self.utilization.stop() model_path = self.save_checkpoint() run_id = self.logger.run_id - path = os.path.join(self.config['data_dir'], f'{self.config["env"]}_{run_id}.pt') + path = os.path.join(self.config['data_dir'], f'{run_id}.pt') shutil.copy(model_path, path) return path @@ -506,11 +535,11 @@ def save_checkpoint(self): return run_id = self.logger.run_id - path = os.path.join(self.config['data_dir'], f'{self.config["env"]}_{run_id}') + path = os.path.join(self.config['data_dir'], run_id) if not os.path.exists(path): os.makedirs(path) - model_name = f'model_{self.config["env"]}_{self.epoch:06d}.pt' + model_name = f'model_{self.epoch:06d}.pt' model_path = os.path.join(path, model_name) if os.path.exists(model_path): return model_path @@ -527,7 +556,7 @@ } state_path = os.path.join(path, 'trainer_state.pt') torch.save(state, state_path + '.tmp') - os.replace(state_path + '.tmp', state_path) + os.rename(state_path + '.tmp', state_path) return model_path def print_dashboard(self, clear=False, idx=[0], @@ -882,8 +912,8 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None): torch.cuda.set_device(local_rank) os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank) vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) + policy = policy or load_policy(args, vecenv) if 'LOCAL_RANK' in os.environ: args['train']['device'] = torch.cuda.current_device() @@ -909,11 +942,7 @@ all_logs = [] while pufferl.global_step < train_config['total_timesteps']: - if train_config['device'] == 'cuda': - torch.compiler.cudagraph_mark_step_begin() pufferl.evaluate() - if train_config['device'] == 'cuda': - torch.compiler.cudagraph_mark_step_begin() logs = pufferl.train() if logs is not None: @@ -947,7 +978,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): args['vec'] = dict(backend=backend, num_envs=1) vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) + policy = policy or load_policy(args, vecenv) ob, info = vecenv.reset() driver = vecenv.driver_env num_agents = vecenv.observation_space.shape[0] @@ -1070,13 +1101,13 @@ def autotune(args=None, env_name=None, vecenv=None, policy=None):
pufferlib.vector.autotune(make_env, batch_size=args['train']['env_batch_size']) def load_env(env_name, args): package = args['package'] module_name = 'pufferlib.ocean' if package == 'ocean' else f'pufferlib.environments.{package}' env_module = importlib.import_module(module_name) make_env = env_module.env_creator(env_name) return pufferlib.vector.make(make_env, env_kwargs=args['env'], **args['vec']) -def load_policy(args, vecenv, env_name=''): +def load_policy(args, vecenv): package = args['package'] module_name = 'pufferlib.ocean' if package == 'ocean' else f'pufferlib.environments.{package}' env_module = importlib.import_module(module_name) @@ -1107,7 +1143,7 @@ load_path = args['load_model_path'] if load_path == 'latest': - load_path = max(glob.glob(f"experiments/{env_name}*.pt"), key=os.path.getctime) + load_path = max(glob.glob("experiments/*.pt"), key=os.path.getctime) if load_path is not None: state_dict = torch.load(load_path, map_location=device) @@ -1160,19 +1196,24 @@ def load_config(env_name): raise pufferlib.APIUsageError('No config for env_name {}'.format(env_name)) # Dynamic help menu from config - def puffer_type(value): - try: - return ast.literal_eval(value) - except: - return value + def auto_type(value): + """Type inference for numeric args that use 'auto' as a default value""" + if value == 'auto': return value + if value.isnumeric(): return int(value) + return float(value) for section in p.sections(): for key in p[section]: + try: + value = ast.literal_eval(p[section][key]) + except: + value = p[section][key] + fmt = f'--{key}' if section == 'base' else f'--{section}.{key}' parser.add_argument( fmt.replace('_', '-'), - default=puffer_type(p[section][key]), - type=puffer_type + default=value, + type=auto_type if value == 'auto' else type(value) ) parser.add_argument('-h', '--help', default=argparse.SUPPRESS, diff --git a/setup.py b/setup.py index 631b051ef..7597fd18e 100644 --- a/setup.py +++ b/setup.py @@ -184,14 +184,15 @@ def run(self): # Find C extensions c_extensions = [] if not NO_OCEAN: - c_extension_paths = glob.glob('pufferlib/ocean/**/binding.c', recursive=True) + #c_extension_paths = glob.glob('pufferlib/ocean/**/binding.c', recursive=True) + c_extension_paths = ['pufferlib/ocean/drone_delivery/binding.c'] c_extensions = [ Extension( path.rstrip('.c').replace('/', '.'), sources=[path], **extension_kwargs, ) - for path in c_extension_paths if 'matsci' not in path + for path in c_extension_paths# if 'matsci' not in path ] c_extension_paths = [os.path.join(*path.split('/')[:-1]) for path in c_extension_paths]
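The new environment can be smoke-tested without the trainer. The sketch below is illustrative rather than part of the diff: it assumes the drone_delivery C extension has been built (per the setup.py change above) and uses only the constructor, reset, step, and close signatures defined in drone_delivery.py.

import numpy as np
from pufferlib.ocean.drone_delivery.drone_delivery import DroneDelivery

# 2 envs x 4 drones = 8 agents; observations arrive as an (8, 45) float32 buffer
env = DroneDelivery(num_envs=2, num_drones=4)
obs, info = env.reset(seed=0)
for _ in range(100):
    # Per-agent actions in [-1, 1]^4, matching single_action_space
    actions = np.random.uniform(-1, 1, (env.num_agents, 4)).astype(np.float32)
    obs, rewards, terminals, truncations, info = env.step(actions)
env.close()

For a full training run, the CLI documented at the top of pufferl.py applies, e.g. puffer train puffer_drone_delivery.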