
Commit d890f7d

Authored by Cydral, with Davis E. King, Copilot, and Aldric PIERRAIN
Add Adaptive Computation Time (ACT) layer (#3111)
* Implementation of linear_ layer for neural networks. This layer provides an optimized linear transformation for multi-dimensional inputs.
* Minor change
* Update dlib/dnn/layers.h (Co-authored-by: Copilot <[email protected]>)
* Add reshape_to and flatten layers to Dlib's DNN module
* Missing update to "visitors.h"
* Format fixing for reshape_to
* Update dlib/test/dnn.cpp
* Vocabulary size fixed for learning, and function added for transformation-free tokenization
* Added a new example for learning a "complex" Transformer model.
* Updated example for training a Transformer model.
* Fix for gcc/ffmpeg compilation
* Fix a warning message for Ubuntu compilation.
* Update for Linux environment.
* Fix batch building
* Slight improvement in model definition.
* linear_ layer implementation improvement
* Finalizing the example
* Fixing break condition in training method.
* Fixing declaration order of variables.
* bpe_tokenizer improvements.
* Example updated.
* bpe_tokenizer class refactoring.
* Example updated.
* bpe_tokenizer class updated.
* Decoding part of the bpe_tokenizer updated.
* Network definition update
* Add Adaptive Computation Time (ACT) layer with CPU/CUDA support
* Fixes
* Update comments for params
* Fixes and improvements
* Disabling enable_depth_scaling, which obviously affects the result of test_layer

---------

Co-authored-by: Davis E. King <[email protected]>
Co-authored-by: Copilot <[email protected]>
Co-authored-by: Aldric PIERRAIN <[email protected]>
1 parent 35a8e1f commit d890f7d
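
The ACT layer added here follows the Adaptive Computation Time scheme (Graves, 2016): at every "ponder" step each position computes a halting probability p = sigmoid(w·x + b), these probabilities are accumulated until they reach a threshold just below 1, the step that crosses the threshold contributes only the remaining mass, and the layer output is the resulting probability-weighted sum of intermediate states. The sketch below reproduces that rule for a single position with plain std::vector buffers; it is only an illustration of the update performed by the new CPU kernels, and every name in it (including the fixed decay standing in for the wrapped sub-network) is a hypothetical stand-in, not dlib code.

// A minimal, self-contained illustration of the ACT halting rule for a single
// position.  All names and constants here are hypothetical.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    const int d_model = 4;
    const int max_steps = 6;
    const float eps = 0.01f;                    // halting threshold is 1 - eps
    std::vector<float> state(d_model, 0.5f);    // current hidden state for one position
    std::vector<float> w_halt(d_model, 0.3f);   // halting weights
    const float b_halt = -1.0f;                 // halting bias
    std::vector<float> out(d_model, 0.0f);      // probability-weighted sum of states

    float cumulative = 0.0f, remainder = 1.0f;
    int steps_taken = 0;

    for (int step = 0; step < max_steps && cumulative < 1.0f - eps; ++step)
    {
        // p = sigmoid(w . state + b), as in compute_act_halt_probabilities().
        float logit = b_halt;
        for (int d = 0; d < d_model; ++d) logit += w_halt[d] * state[d];
        const float p = 1.0f / (1.0f + std::exp(-logit));

        // Spend p of the remaining budget, capped so the total never exceeds
        // the threshold, mirroring update_act_state().
        const float effective = std::min(p * remainder, (1.0f - eps) - cumulative);
        cumulative += effective;
        remainder -= effective;
        steps_taken = step + 1;

        for (int d = 0; d < d_model; ++d) out[d] += effective * state[d];

        // The real layer would run its wrapped sub-network on the state here;
        // a fixed decay stands in for that transformation.
        for (auto& v : state) v *= 0.9f;
    }

    // Leftover probability mass goes to the final state, as in finalize_act_output().
    for (int d = 0; d < d_model; ++d) out[d] += remainder * state[d];

    std::cout << "ponder steps: " << steps_taken << ", remainder: " << remainder << "\n";
}

Positions that halt early spend fewer ponder steps, which is the computational saving ACT is after; the per-position step count recorded by the kernels (n_steps) is what the optional depth scaling later uses.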

File tree

9 files changed: +1357 −76 lines

dlib/cuda/cpu_dlib.cpp

Lines changed: 147 additions & 0 deletions
@@ -3219,6 +3219,153 @@ namespace dlib
    // ------------------------------------------------------------------------------------

        void compute_act_halt_probabilities(
            resizable_tensor& halt_probs,
            resizable_tensor& logits,
            const tensor& input_data,
            const tensor& halt_params,
            long batch_size,
            long seq_len,
            long feature_dim
        )
        {
            const float* in_ptr = input_data.host();
            const float* W_halt = halt_params.host();
            const float b_halt = halt_params.host()[feature_dim];
            float* logits_ptr = logits.host();
            float* halt_probs_ptr = halt_probs.host();

            const long d_model = feature_dim / input_data.k();
            const long num_channels = input_data.k();

            for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                const long n = pos / seq_len;
                const long s = pos % seq_len;

                float logit = b_halt;

                for (long c = 0; c < num_channels; ++c) {
                    for (long d = 0; d < d_model; ++d) {
                        const long in_idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
                        const long weight_idx = c * d_model + d;
                        logit += in_ptr[in_idx] * W_halt[weight_idx];
                    }
                }

                logits_ptr[pos] = logit;

                halt_probs_ptr[pos] = 1.0f / (1.0f + std::exp(-logit));
            }
        }

        void update_act_state(
            resizable_tensor& output,
            const tensor& input_data,
            const tensor& halt_probs,
            resizable_tensor& cumulative_halting,
            resizable_tensor& remainders,
            resizable_tensor& n_steps,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels,
            float halt_threshold,
            long current_step
        )
        {
            const float* in_ptr = input_data.host();
            const float* p_halt = halt_probs.host();
            float* out_ptr = output.host();
            float* cum_halt = cumulative_halting.host();
            float* remain = remainders.host();
            float* steps = n_steps.host();

            for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                if (cum_halt[pos] < halt_threshold) {
                    const long n = pos / seq_len;
                    const long s = pos % seq_len;

                    float p = p_halt[pos];
                    float r = remain[pos];
                    float effective = std::min(p * r, halt_threshold - cum_halt[pos]);

                    cum_halt[pos] += effective;
                    remain[pos] -= effective;
                    steps[pos] = static_cast<float>(current_step + 1);

                    for (long c = 0; c < num_channels; ++c) {
                        for (long d = 0; d < d_model; ++d) {
                            const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
                            out_ptr[idx] += effective * in_ptr[idx];
                        }
                    }
                }
            }
        }

        void finalize_act_output(
            resizable_tensor& output,
            const tensor& input_data,
            const tensor& remainders,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels
        )
        {
            const float* in_ptr = input_data.host();
            const float* remain = remainders.host();
            float* out_ptr = output.host();

            for (long pos = 0; pos < batch_size * seq_len; ++pos) {
                float r = remain[pos];
                if (r > 1e-6f) {
                    const long n = pos / seq_len;
                    const long s = pos % seq_len;

                    for (long c = 0; c < num_channels; ++c) {
                        for (long d = 0; d < d_model; ++d) {
                            const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
                            out_ptr[idx] += r * in_ptr[idx];
                        }
                    }
                }
            }
        }

        void apply_act_depth_scaling(
            tensor& gradients,
            const tensor& n_steps,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels,
            float max_steps,
            float scale_factor
        )
        {
            const float* steps = n_steps.host();
            float* grad_ptr = gradients.host();

            for (long pos = 0; pos < batch_size * seq_len; ++pos)
            {
                const float scale = 1.0f + scale_factor * (steps[pos] / max_steps);
                const long n = pos / seq_len;
                const long s = pos % seq_len;

                for (long c = 0; c < num_channels; ++c)
                {
                    for (long d = 0; d < d_model; ++d)
                    {
                        const long idx = ((n * num_channels + c) * seq_len + s) * d_model + d;
                        grad_ptr[idx] *= scale;
                    }
                }
            }
        }

    // ------------------------------------------------------------------------------------

    }
}
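
Taken together, the four helpers split the ACT work into per-step pieces: compute_act_halt_probabilities produces the sigmoid halting probabilities, update_act_state accumulates them and the weighted states until the threshold is reached, finalize_act_output assigns leftover probability mass to the last state, and apply_act_depth_scaling optionally rescales gradients by how many steps a position pondered. A hedged sketch of how a forward pass might chain them is shown below; the function name, the fixed step budget, and the 0.99 threshold are assumptions for illustration only, since the actual act_ layer driving these kernels is defined elsewhere in this commit.

// A hedged sketch of one ACT ponder loop built from the CPU helpers above.
// The wrapped sub-network call is only indicated by a comment.
#include <dlib/dnn.h>

void act_forward_sketch(const dlib::tensor& input, const dlib::tensor& halt_params)
{
    using namespace dlib;
    // Shapes follow the indexing used above: (batch, channels, seq_len, d_model),
    // and halt_params holds feature_dim weights followed by one bias.
    const long batch = input.num_samples(), channels = input.k();
    const long seq_len = input.nr(), d_model = input.nc();
    const long feature_dim = channels * d_model;
    const float halt_threshold = 0.99f;   // assumed to play the role of 1 - epsilon
    const long max_steps = 4;

    resizable_tensor output, halt_probs, logits, cumulative, remainders, n_steps;
    output.copy_size(input);           output = 0;
    halt_probs.set_size(batch, 1, seq_len, 1);
    logits.copy_size(halt_probs);
    cumulative.copy_size(halt_probs);  cumulative = 0;
    remainders.copy_size(halt_probs);  remainders = 1;
    n_steps.copy_size(halt_probs);     n_steps = 0;

    resizable_tensor state;
    state.copy_size(input);
    dlib::memcpy(state, input);        // start from the layer input

    for (long step = 0; step < max_steps; ++step)
    {
        // ... run the wrapped sub-network on `state` here ...

        cpu::compute_act_halt_probabilities(halt_probs, logits, state, halt_params,
                                            batch, seq_len, feature_dim);
        cpu::update_act_state(output, state, halt_probs, cumulative, remainders,
                              n_steps, batch, seq_len, d_model, channels,
                              halt_threshold, step);
    }

    // Positions that never crossed the threshold keep their remaining mass.
    cpu::finalize_act_output(output, state, remainders, batch, seq_len, d_model, channels);
}

During the backward pass, apply_act_depth_scaling can then multiply each position's gradient by 1 + scale_factor * (n_steps / max_steps); the commit leaves this depth scaling disabled by default because it changes the numbers test_layer checks.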

dlib/cuda/cpu_dlib.h

Lines changed: 48 additions & 0 deletions
@@ -536,6 +536,54 @@ namespace dlib
            bool scale
        );

    // -----------------------------------------------------------------------------------

        void compute_act_halt_probabilities(
            resizable_tensor& halt_probs,
            resizable_tensor& logits,
            const tensor& input_data,
            const tensor& halt_params,
            long batch_size,
            long seq_len,
            long feature_dim
        );

        void update_act_state(
            resizable_tensor& output,
            const tensor& input_data,
            const tensor& halt_probs,
            resizable_tensor& cumulative_halting,
            resizable_tensor& remainders,
            resizable_tensor& n_steps,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels,
            float halt_threshold,
            long current_step
        );

        void finalize_act_output(
            resizable_tensor& output,
            const tensor& input_data,
            const tensor& remainders,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels
        );

        void apply_act_depth_scaling(
            tensor& gradients,
            const tensor& n_steps,
            long batch_size,
            long seq_len,
            long d_model,
            long num_channels,
            float max_steps,
            float scale_factor
        );

    // -----------------------------------------------------------------------------------

        class pooling
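
Because the CPU declarations above take plain tensors, they are easy to sanity-check outside the layer. With zero halting weights and bias, the sigmoid yields p = 0.5 at every position; a single update step then absorbs half of the input and finalize_act_output contributes the other half, so the output should reproduce the input. The snippet below is a minimal, hypothetical check along those lines (the commit's real coverage lives in dlib/test/dnn.cpp, and the 0.99 threshold is just an assumed value).

// A small, illustrative numeric check of the CPU ACT routines declared above.
#include <dlib/dnn.h>
#include <algorithm>
#include <cmath>
#include <iostream>

int main()
{
    using namespace dlib;
    const long batch = 1, channels = 2, seq_len = 3, d_model = 4;
    const long feature_dim = channels * d_model;

    resizable_tensor input(batch, channels, seq_len, d_model);
    tt::tensor_rand rnd;
    rnd.fill_uniform(input);

    resizable_tensor halt_params(1, 1, 1, feature_dim + 1);
    halt_params = 0;                   // zero weights and bias -> p = 0.5 everywhere

    resizable_tensor output, halt_probs, logits, cumulative, remainders, n_steps;
    output.copy_size(input);           output = 0;
    halt_probs.set_size(batch, 1, seq_len, 1);
    logits.copy_size(halt_probs);
    cumulative.copy_size(halt_probs);  cumulative = 0;
    remainders.copy_size(halt_probs);  remainders = 1;
    n_steps.copy_size(halt_probs);     n_steps = 0;

    cpu::compute_act_halt_probabilities(halt_probs, logits, input, halt_params,
                                        batch, seq_len, feature_dim);
    cpu::update_act_state(output, input, halt_probs, cumulative, remainders, n_steps,
                          batch, seq_len, d_model, channels, 0.99f, 0);
    cpu::finalize_act_output(output, input, remainders, batch, seq_len, d_model, channels);

    // Half the mass was absorbed in the update, the other half by finalize,
    // so output should match input up to float rounding.
    float max_err = 0;
    for (size_t i = 0; i < input.size(); ++i)
        max_err = std::max(max_err, std::abs(output.host()[i] - input.host()[i]));
    std::cout << "max |output - input| = " << max_err << "\n";   // expect ~0
}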
