
Commit 34b519b

Merge branch 'master' into softmaxm-layer
2 parents 0ca05c7 + 3924095

File tree: 15 files changed, +731 -24 lines changed

dlib/cuda/cpu_dlib.cpp

Lines changed: 115 additions & 0 deletions
@@ -2477,6 +2477,121 @@ namespace dlib
         }
 
     // ------------------------------------------------------------------------------------
+
+        void embeddings(
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& embs
+        )
+        {
+            DLIB_CASSERT(
+                src.nr() > 0 &&
+                embs.num_samples() > 0 &&
+                embs.k() > 0 &&
+                embs.nr() == 1 &&
+                embs.nc() == 1,
+                "\nsrc.num_samples(): " << src.num_samples() <<
+                "\nsrc.k(): " << src.k() <<
+                "\nsrc.nr(): " << src.nr() <<
+                "\nsrc.nc(): " << src.nc() <<
+                "\nembs.num_samples(): " << embs.num_samples() <<
+                "\nembs.k(): " << embs.k() <<
+                "\nembs.nr(): " << embs.nr() <<
+                "\nembs.nc(): " << embs.nc()
+            );
+
+            long ns = dest.num_samples(), nk = dest.k(), nr = dest.nr(), nc = dest.nc();
+            const float* src_data = src.host();
+            float* dest_data = dest.host();
+            const float* embs_data = embs.host();
+
+            for (long s = 0; s < ns; ++s)
+            {
+                for (long k = 0; k < nk; ++k)
+                {
+                    for (long r = 0; r < nr; ++r)
+                    {
+                        // Each input cell holds a token index; copy that token's embedding row.
+                        const unsigned long token_idx = static_cast<unsigned long>(src_data[tensor_index(src, s, k, r, 0)]);
+                        if (token_idx < embs.num_samples())
+                        {
+                            for (long c = 0; c < nc; ++c)
+                                dest_data[tensor_index(dest, s, k, r, c)] = embs_data[tensor_index(embs, token_idx, c, 0, 0)];
+                        }
+                        else
+                        {
+                            // Out-of-range token index: emit a zero vector.
+                            for (long c = 0; c < nc; ++c)
+                                dest_data[tensor_index(dest, s, k, r, c)] = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        void embeddings_gradient(
+            const tensor& prev,
+            const tensor& gradient_input,
+            tensor& grads,
+            const tensor& freqs,
+            float learning_rate,
+            bool scale
+        )
+        {
+            DLIB_CASSERT(
+                prev.nr() > 0 &&
+                gradient_input.num_samples() == prev.num_samples() &&
+                gradient_input.k() == prev.k() &&
+                gradient_input.nr() == prev.nr() &&
+                gradient_input.nc() == grads.k() &&
+                grads.num_samples() > 0 &&
+                grads.k() > 0 &&
+                grads.nr() == 1 &&
+                grads.nc() == 1,
+                "\ngradient_input.num_samples(): " << gradient_input.num_samples() <<
+                "\ngradient_input.k(): " << gradient_input.k() <<
+                "\ngradient_input.nr(): " << gradient_input.nr() <<
+                "\ngradient_input.nc(): " << gradient_input.nc() <<
+                "\nprev.num_samples(): " << prev.num_samples() <<
+                "\nprev.k(): " << prev.k() <<
+                "\nprev.nr(): " << prev.nr() <<
+                "\nprev.nc(): " << prev.nc() <<
+                "\ngrads.num_samples(): " << grads.num_samples() <<
+                "\ngrads.k(): " << grads.k() <<
+                "\ngrads.nr(): " << grads.nr() <<
+                "\ngrads.nc(): " << grads.nc()
+            );
+
+            const float* prev_data = prev.host();
+            const float* gradient_input_data = gradient_input.host();
+            const float* freqs_data = freqs.host();
+            float* grads_data = grads.host();
+            long ns = gradient_input.num_samples(), nk = gradient_input.k();
+            long nr = gradient_input.nr(), nc = gradient_input.nc();
+
+            // One mutex per embedding row, so parallel workers can update the
+            // same token's row without racing.
+            std::vector<dlib::mutex> embedding_mutexes(grads.num_samples());
+            parallel_for(0, ns * nk, [&](long i)
+            {
+                long s = i / nk;
+                long k = i % nk;
+
+                for (long r = 0; r < nr; ++r)
+                {
+                    const unsigned long token_idx = static_cast<unsigned long>(prev_data[tensor_index(prev, s, k, r, 0)]);
+                    if (token_idx < grads.num_samples())
+                    {
+                        const float freq_token = freqs_data[token_idx];
+                        float freq_scale = 1.0f;
+
+                        // Clamp the per-token step scale into [0.15, 1]: frequent
+                        // tokens get damped updates, rare ones the full rate.
+                        if (scale && freq_token != 0.0f) freq_scale = std::max(0.15f, std::min(1.0f / freq_token, 1.0f));
+                        auto_mutex locker(embedding_mutexes[token_idx]);
+                        for (long c = 0; c < nc; ++c)
+                        {
+                            const float gradient = gradient_input_data[tensor_index(gradient_input, s, k, r, c)];
+                            grads_data[tensor_index(grads, token_idx, c, 0, 0)] -= (gradient * learning_rate * freq_scale);
+                        }
+                    }
+                }
+            });
+        }
+
     // ------------------------------------------------------------------------------------
     // ------------------------------------------------------------------------------------
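For readers skimming the diff: the forward pass is a plain table lookup, dest(s, k, r, :) = embs(src(s, k, r, 0), :), with out-of-range indices producing zero vectors. Below is a minimal usage sketch, assuming the tt:: wrappers added in tensor_tools.cpp are also declared in tensor_tools.h (not shown in this excerpt); all tensor shapes are illustrative:

#include <dlib/dnn.h>
#include <iostream>

int main()
{
    using namespace dlib;

    // Illustrative sizes: vocabulary of 10 tokens, embedding dimension 3,
    // a batch of 2 sequences holding 4 token indices each.
    resizable_tensor embs(10, 3);       // one 3-float row per token
    resizable_tensor src(2, 1, 4, 1);   // token indices stored as floats
    resizable_tensor dest(2, 1, 4, 3);  // dest.nc() must equal embs.k()

    tt::tensor_rand rnd;
    rnd.fill_uniform(embs);
    src = 0.0f;
    src.host()[0] = 7.0f;   // first token of first sequence -> row 7 of embs
    src.host()[1] = 42.0f;  // second token, out of range (>= 10) -> zero vector

    tt::embeddings(dest, src, embs);
    std::cout << "first copied value: " << dest.host()[0] << std::endl;
    return 0;
}

Note that embeddings() reads dest's dimensions rather than resizing it, so dest must be allocated with nc() equal to the embedding dimension before the call.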

dlib/cuda/cpu_dlib.h

Lines changed: 17 additions & 0 deletions
@@ -519,6 +519,23 @@ namespace dlib
             const tensor& gradient_input
         );
 
+    // -----------------------------------------------------------------------------------
+
+        void embeddings(
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& embs
+        );
+
+        void embeddings_gradient(
+            const tensor& prev,
+            const tensor& gradient_input,
+            tensor& grads,
+            const tensor& freqs,
+            float learning_rate,
+            bool scale
+        );
+
     // -----------------------------------------------------------------------------------
 
         class pooling

dlib/cuda/cuda_dlib.cu

Lines changed: 120 additions & 0 deletions
@@ -2088,6 +2088,126 @@ namespace dlib
                 row_stride, col_stride, add_to);
         }
 
+    // ----------------------------------------------------------------------------------------
+
+        __global__ void _cuda_embeddings(size_t dsize, size_t dk, size_t dr, size_t dc,
+            float* d, const float* s, const float* e, size_t es
+        )
+        {
+            for (auto i : grid_stride_range(0, dsize))
+            {
+                // Decompose the flat index i into (sample n, channel k, row r, column c).
+                const auto n = i / (dk * dr * dc);
+                const auto s_idx = i % (dk * dr * dc);
+                const auto k = (s_idx / (dr * dc)) % dk;
+                const auto r = (s_idx / dc) % dr;
+                const auto c = s_idx % dc;
+
+                const unsigned long t_idx = static_cast<unsigned long>(s[(n * dk + k) * dr + r]);
+
+                if (t_idx < es)
+                    d[i] = e[t_idx * dc + c];
+                else
+                    d[i] = 0.0f;
+            }
+        }
+
+        void embeddings(
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& embs
+        )
+        {
+            DLIB_CASSERT(
+                src.nr() > 0 &&
+                embs.num_samples() > 0 &&
+                embs.k() > 0 &&
+                embs.nr() == 1 &&
+                embs.nc() == 1,
+                "\nsrc.num_samples(): " << src.num_samples() <<
+                "\nsrc.k(): " << src.k() <<
+                "\nsrc.nr(): " << src.nr() <<
+                "\nsrc.nc(): " << src.nc() <<
+                "\nembs.num_samples(): " << embs.num_samples() <<
+                "\nembs.k(): " << embs.k() <<
+                "\nembs.nr(): " << embs.nr() <<
+                "\nembs.nc(): " << embs.nc()
+            );
+
+            const long dk = dest.k();
+            const long dr = dest.nr();
+            const long dc = dest.nc();
+
+            launch_kernel(_cuda_embeddings, dest.size(), dk, dr, dc,
+                dest.device(), src.device(), embs.device(), embs.num_samples());
+        }
+
+        __global__ void _cuda_embeddings_gradient(size_t ssize, size_t sk, size_t sr, size_t sc,
+            const float* o, const float* gi, float* g, const float* f, float lr, bool sl, size_t es
+        )
+        {
+            for (auto i : grid_stride_range(0, ssize))
+            {
+                const auto n = i / (sk * sr * sc);
+                const auto s_idx = i % (sk * sr * sc);
+                const auto k = (s_idx / (sr * sc)) % sk;
+                const auto r = (s_idx / sc) % sr;
+                const auto c = s_idx % sc;
+
+                const unsigned long t_idx = static_cast<unsigned long>(o[(n * sk + k) * sr + r]);
+                if (t_idx < es)
+                {
+                    const float f_t = f[t_idx];
+                    float f_s = 1.0f;
+
+                    // Clamp the per-token step scale into [0.15, 1] (mirrors the CPU path).
+                    if (sl && f_t != 0.0f) f_s = fmaxf(0.15f, fminf(1.0f / f_t, 1.0f));
+                    // A token with frequency > 1 can be touched by several threads at
+                    // once, so its row is updated atomically; a token seen at most
+                    // once cannot race and takes the cheaper plain update.
+                    if (f_t > 1) atomicAdd(&g[t_idx * sc + c], -gi[i] * lr * f_s);
+                    else g[t_idx * sc + c] -= gi[i] * lr * f_s;
+                }
+            }
+        }
+
+        void embeddings_gradient(
+            const tensor& prev,
+            const tensor& gradient_input,
+            tensor& grads,
+            const tensor& freqs,
+            float learning_rate,
+            bool scale
+        )
+        {
+            DLIB_CASSERT(
+                prev.nr() > 0 &&
+                gradient_input.num_samples() == prev.num_samples() &&
+                gradient_input.k() == prev.k() &&
+                gradient_input.nr() == prev.nr() &&
+                gradient_input.nc() == grads.k() &&
+                grads.num_samples() > 0 &&
+                grads.k() > 0 &&
+                grads.nr() == 1 &&
+                grads.nc() == 1,
+                "\ngradient_input.num_samples(): " << gradient_input.num_samples() <<
+                "\ngradient_input.k(): " << gradient_input.k() <<
+                "\ngradient_input.nr(): " << gradient_input.nr() <<
+                "\ngradient_input.nc(): " << gradient_input.nc() <<
+                "\nprev.num_samples(): " << prev.num_samples() <<
+                "\nprev.k(): " << prev.k() <<
+                "\nprev.nr(): " << prev.nr() <<
+                "\nprev.nc(): " << prev.nc() <<
+                "\ngrads.num_samples(): " << grads.num_samples() <<
+                "\ngrads.k(): " << grads.k() <<
+                "\ngrads.nr(): " << grads.nr() <<
+                "\ngrads.nc(): " << grads.nc()
+            );
+
+            const long sk = gradient_input.k();
+            const long sr = gradient_input.nr();
+            const long sc = gradient_input.nc();
+
+            launch_kernel(_cuda_embeddings_gradient, gradient_input.size(), sk, sr, sc,
+                prev.device(), gradient_input.device(), grads.device(), freqs.device(),
+                learning_rate, scale, grads.num_samples());
+        }
+
    // ----------------------------------------------------------------------------------------
 
        __global__ void _cuda_layer_normalize(
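Both kernels linearize (sample, channel, row, column) into a single grid-stride index and invert that mapping with division and modulo. A small host-side check of the index algebra used by _cuda_embeddings (plain C++, not CUDA; all sizes made up):

#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t dn = 2, dk = 3, dr = 4, dc = 5;  // illustrative extents
    std::size_t i = 0;                                 // the flat kernel index
    for (std::size_t n = 0; n < dn; ++n)
    for (std::size_t k = 0; k < dk; ++k)
    for (std::size_t r = 0; r < dr; ++r)
    for (std::size_t c = 0; c < dc; ++c, ++i)
    {
        // Same decomposition as the kernel body.
        const std::size_t s_idx = i % (dk * dr * dc);
        assert(i / (dk * dr * dc) == n);        // sample
        assert((s_idx / (dr * dc)) % dk == k);  // channel
        assert((s_idx / dc) % dr == r);         // row
        assert(s_idx % dc == c);                // column
    }
    return 0;
}

The token index is then fetched at (n * dk + k) * dr + r, which is the same flattening applied to the source tensor, whose nc() is 1.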

dlib/cuda/cuda_dlib.h

Lines changed: 17 additions & 0 deletions
@@ -561,6 +561,23 @@ namespace dlib
             const tensor& gradient_input
         );
 
+    // -----------------------------------------------------------------------------------
+
+        void embeddings(
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& embs
+        );
+
+        void embeddings_gradient(
+            const tensor& prev,
+            const tensor& gradient_input,
+            tensor& grads,
+            const tensor& freqs,
+            float learning_rate,
+            bool scale
+        );
+
    // ----------------------------------------------------------------------------------------
 
        void copy_tensor(

dlib/cuda/tensor_tools.cpp

Lines changed: 31 additions & 0 deletions
@@ -1298,6 +1298,37 @@ namespace dlib { namespace tt
 #endif
     }
 
+// ----------------------------------------------------------------------------------------
+
+    void embeddings(
+        resizable_tensor& dest,
+        const tensor& src,
+        const tensor& embs
+    )
+    {
+        // Compile-time dispatch: the CUDA path is used when dlib is built
+        // with DLIB_USE_CUDA, otherwise the CPU reference implementation.
+#ifdef DLIB_USE_CUDA
+        cuda::embeddings(dest, src, embs);
+#else
+        cpu::embeddings(dest, src, embs);
+#endif
+    }
+
+    void embeddings_gradient(
+        const tensor& prev,
+        const tensor& gradient_input,
+        tensor& grads,
+        const tensor& freqs,
+        float learning_rate,
+        bool scale
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+#else
+        cpu::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------
 
 }}
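The scale flag damps the update applied to frequent tokens. A sketch of that scaling, assuming the intent is to clamp 1/frequency into [0.15, 1]; freq_scale below is a hypothetical helper written for illustration, not part of the diff:

#include <algorithm>
#include <cstdio>

// Rare tokens keep the full step; frequent tokens are damped, but never
// below a 0.15 floor. A zero frequency (token never counted) is left at 1.
static float freq_scale(float freq_token)
{
    if (freq_token == 0.0f) return 1.0f;
    return std::max(0.15f, std::min(1.0f / freq_token, 1.0f));
}

int main()
{
    std::printf("freq   1 -> %.2f\n", freq_scale(1.0f));    // 1.00: full update
    std::printf("freq   4 -> %.2f\n", freq_scale(4.0f));    // 0.25
    std::printf("freq 100 -> %.2f\n", freq_scale(100.0f));  // 0.15: floor
    return 0;
}

The update itself is plain SGD on the embedding rows: grads(token, c) -= gradient * learning_rate * freq_scale.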
