
Commit 230c0b0

Cydral, arrufat, and davisking authored
Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations (#3020)
* Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (CPU & CUDA) and Add `add_to` Parameter
* 'add_to' parameter missing in cuda call reorg_gradient.launch_kernel()
* Cleanup: remove using namespace std; (#3016)
* remove using namespace std from headers
* more std::
* more std::
* more std:: on windows stuff
* remove uses of using namespace std::chrono
* do not use C++17 features
* Add Davis suggestion
* revert some more stuff
* revert removing include
* more std::chrono stuff
* fix build error
* Adjust comment formatting to be like other dlib comments
* Add positional encodings layer to Dlib
* Add multm_prev layer and enhance gemm() function for PLANE_WISE operations
* Updates
* Updates
* Resynchronization with tril_ class
* Delete .vscode/settings.json (not required for the merge)
* Remove duplicates
* Small improvements to PLANE_WISE in gemm() function
* Same improvements for the CPU version
* Introducing a new enum for operation modes in tensor computations
* Remove a duplicated test call in dnn tests
* Remove duplicated declaration
* Comment fixed
* Fixing the Cuda compilation
* Merging with updated softmax_ layer
* Fixing header for CPU compilation
* Adding a missing cast
* Test fixed to use the new operation_mode enum
* softmaxm test fixed
* Enum test removed
* Enum test removed
* Fixing indentation
* Fixing indentation
* Test removed
* Move the operation_mode enumeration to its own header
* Use operation_mode instead of unsigned long

---------

Co-authored-by: Adrià <[email protected]>
Co-authored-by: Davis King <[email protected]>
1 parent dfbee6d commit 230c0b0
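For context, the following is a minimal, hypothetical sketch (not part of the commit) showing how the new operation_mode argument is intended to be passed to the CPU softmax routine whose diff appears below. Only the cpu::softmax signature and the CHANNEL_WISE / PLANE_WISE enumerators come from this change; the tensor setup and the include are illustrative assumptions.

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // One sample with 2 channels of 3x4 planes, filled with a simple ramp.
    resizable_tensor src(1, 2, 3, 4), dest;
    dest.copy_size(src);
    for (size_t i = 0; i < src.size(); ++i)
        src.host()[i] = 0.1f * static_cast<float>(i);

    // Default (pre-existing) behaviour: normalize across the k channels at
    // every (row, column) location.
    cpu::softmax(dest, src, operation_mode::CHANNEL_WISE);

    // Behaviour added by this commit: normalize each row of every nr x nc
    // plane independently.
    cpu::softmax(dest, src, operation_mode::PLANE_WISE);

    return 0;
}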

File tree

13 files changed (+979 −286 lines)


dlib/cuda/cpu_dlib.cpp

Lines changed: 137 additions & 84 deletions
@@ -1620,122 +1620,175 @@ namespace dlib
 
     namespace ttimpl
     {
-        void softmax (
-            const long num_locations,
-            const long num_channels,
-            tensor& dest,
-            const tensor& src
-        )
-        {
-            DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
-            DLIB_CASSERT(have_same_dimensions(dest,src));
-            const auto d = dest.host();
-            const auto s = src.host();
-
-            // Note that we subtract out the max values in each channel before applying
-            // exp() to avoid numeric overflow in the subsequent computations. Doing this
-            // doesn't change the resulting output, it just makes it more numerically
-            // stable.
-            for (long n = 0; n < src.num_samples(); ++n)
-            {
-                auto ss = s + num_locations*num_channels*n;
-                auto dd = d + num_locations*num_channels*n;
-                for (long i = 0; i < num_locations; ++i)
-                {
-                    float max_val = -std::numeric_limits<float>::infinity();
-                    for (long k = 0; k < num_channels; ++k)
-                        max_val = std::max(max_val, ss[k*num_locations]);
-
-                    for (long k = 0; k < num_channels; ++k)
-                        dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
-
-                    ++ss;
-                    ++dd;
-                }
-            }
-
-            // Now normalize each channel so they sum to 1.
-            for (long n = 0; n < src.num_samples(); ++n)
-            {
-                const auto dd = d + num_locations*num_channels*n;
-                for (long i = 0; i < num_locations; ++i)
-                {
-                    const auto ddd = dd+i;
-
-                    float temp = 0;
-                    for (long k = 0; k < num_channels; ++k)
-                        temp += ddd[k*num_locations];
-                    for (long k = 0; k < num_channels; ++k)
-                        ddd[k*num_locations] /= temp;
-                }
-            }
-        }
+        void softmax(
+            const long num_locations,
+            const long num_channels,
+            tensor& dest,
+            const tensor& src,
+            operation_mode mode = operation_mode::CHANNEL_WISE
+        )
+        {
+            DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k());
+            DLIB_CASSERT(have_same_dimensions(dest, src));
+            const auto d = dest.host();
+            const auto s = src.host();
+
+            for (long n = 0; n < src.num_samples(); ++n)
+            {
+                auto ss = s + num_locations * num_channels * n;
+                auto dd = d + num_locations * num_channels * n;
+
+                if (mode == operation_mode::CHANNEL_WISE)
+                {
+                    for (long i = 0; i < num_locations; ++i)
+                    {
+                        float max_val = -std::numeric_limits<float>::infinity();
+                        for (long k = 0; k < num_channels; ++k)
+                            max_val = std::max(max_val, ss[k * num_locations]);
+
+                        float sum = 0.0f;
+                        for (long k = 0; k < num_channels; ++k)
+                        {
+                            dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
+                            sum += dd[k * num_locations];
+                        }
+                        for (long k = 0; k < num_channels; ++k)
+                            dd[k * num_locations] /= sum;
+
+                        ++ss;
+                        ++dd;
+                    }
+                }
+                else if (mode == operation_mode::PLANE_WISE)
+                {
+                    for (long k = 0; k < num_channels; ++k)
+                    {
+                        auto s_channel = ss + k * num_locations;
+                        auto d_channel = dd + k * num_locations;
+                        for (long r = 0; r < src.nr(); ++r)
+                        {
+                            float max_val = -std::numeric_limits<float>::infinity();
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                max_val = std::max(max_val, s_channel[idx]);
+
+                            if (max_val == -std::numeric_limits<float>::infinity())
+                            {
+                                for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                    d_channel[idx] = 0.0f;
+                            }
+                            else
+                            {
+                                float sum = 0.0f;
+                                for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                {
+                                    d_channel[idx] = std::exp(s_channel[idx] - max_val);
+                                    sum += d_channel[idx];
+                                }
+                                for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                    d_channel[idx] /= sum;
+                            }
+                        }
+                    }
+                }
+            }
+        }
 
-        void softmax_gradient (
-            const long num_locations,
-            const long num_channels,
-            tensor& grad,
-            const tensor& dest,
-            const tensor& gradient_input
-        )
-        {
-            DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
-            DLIB_CASSERT(have_same_dimensions(grad,dest));
-            DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-            const auto d = dest.host();
-            const auto g = grad.host();
-            const auto in = gradient_input.host();
-
-            for (long n = 0; n < grad.num_samples(); ++n)
-            {
-                const auto d2 = d + num_locations*num_channels*n;
-                const auto g2 = g + num_locations*num_channels*n;
-                const auto in2 = in + num_locations*num_channels*n;
-                for (long i = 0; i < num_locations; ++i)
-                {
-                    const auto d3 = d2+i;
-                    const auto g3 = g2+i;
-                    const auto in3 = in2+i;
-
-                    float temp = 0;
-                    for (long k = 0; k < num_channels; ++k)
-                        temp += -d3[k*num_locations]*in3[k*num_locations];
-                    if (is_same_object(gradient_input, grad))
-                    {
-                        for (long k = 0; k < num_channels; ++k)
-                            g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
-                    }
-                    else
-                    {
-                        for (long k = 0; k < num_channels; ++k)
-                            g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
-                    }
-                }
-            }
-        }
+        void softmax_gradient(
+            const long num_locations,
+            const long num_channels,
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input,
+            operation_mode mode = operation_mode::CHANNEL_WISE
+        )
+        {
+            DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k());
+            DLIB_CASSERT(have_same_dimensions(grad, dest));
+            DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+
+            const auto d = dest.host();
+            const auto g = grad.host();
+            const auto in = gradient_input.host();
+            for (long n = 0; n < grad.num_samples(); ++n)
+            {
+                const auto d2 = d + num_locations * num_channels * n;
+                const auto g2 = g + num_locations * num_channels * n;
+                const auto in2 = in + num_locations * num_channels * n;
+
+                if (mode == operation_mode::CHANNEL_WISE)
+                {
+                    for (long i = 0; i < num_locations; ++i)
+                    {
+                        const auto d3 = d2 + i;
+                        const auto g3 = g2 + i;
+                        const auto in3 = in2 + i;
+                        float sum = 0.0f;
+                        for (long k = 0; k < num_channels; ++k)
+                            sum += -d3[k * num_locations] * in3[k * num_locations];
+                        if (is_same_object(gradient_input, grad))
+                        {
+                            for (long k = 0; k < num_channels; ++k)
+                                g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]);
+                        }
+                        else
+                        {
+                            for (long k = 0; k < num_channels; ++k)
+                                g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * num_locations]);
+                        }
+                    }
+                }
+                else if (mode == operation_mode::PLANE_WISE)
+                {
+                    for (long k = 0; k < num_channels; ++k)
+                    {
+                        const auto d_channel = d2 + k * num_locations;
+                        const auto g_channel = g2 + k * num_locations;
+                        const auto in_channel = in2 + k * num_locations;
+                        for (long r = 0; r < grad.nr(); ++r)
+                        {
+                            float sum = 0.0f;
+                            for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                sum += -d_channel[idx] * in_channel[idx];
+                            if (is_same_object(gradient_input, grad))
+                            {
+                                for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                    g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]);
+                            }
+                            else
+                            {
+                                for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                    g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
 
     // ----------------------------------------------------------------------------------------
 
-    void softmax (
+    void softmax(
         tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode
     )
     {
-        DLIB_CASSERT(have_same_dimensions(dest,src));
-        ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
+        DLIB_CASSERT(have_same_dimensions(dest, src));
+        DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode");
+        ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode);
     }
 
-    void softmax_gradient (
+    void softmax_gradient(
         tensor& grad,
         const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode
     )
     {
-        DLIB_CASSERT(have_same_dimensions(grad,dest));
-        DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-        ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
+        DLIB_CASSERT(have_same_dimensions(grad, dest));
+        DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+        ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode);
     }
 
     // ------------------------------------------------------------------------------------
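To make the PLANE_WISE branch above easier to follow, here is a small standalone sketch (my own illustration, independent of dlib) of the row-wise normalization it performs on a single nr x nc plane: subtract the row maximum for numerical stability, exponentiate, then divide by the row sum. The all-negative-infinity guard from the diff is omitted for brevity.

#include <algorithm>
#include <cmath>
#include <vector>

// Row-wise softmax over one nr x nc plane stored row-major, mirroring the
// PLANE_WISE branch of ttimpl::softmax above.
void plane_wise_softmax(std::vector<float>& plane, long nr, long nc)
{
    for (long r = 0; r < nr; ++r)
    {
        float* row = plane.data() + r * nc;

        // Subtracting the row max keeps exp() from overflowing and does not
        // change the final ratios.
        const float max_val = *std::max_element(row, row + nc);

        float sum = 0.0f;
        for (long c = 0; c < nc; ++c)
        {
            row[c] = std::exp(row[c] - max_val);
            sum += row[c];
        }
        for (long c = 0; c < nc; ++c)
            row[c] /= sum;
    }
}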

dlib/cuda/cpu_dlib.h

Lines changed: 6 additions & 4 deletions
@@ -291,15 +291,17 @@ namespace dlib
 
     // -----------------------------------------------------------------------------------
 
-    void softmax (
+    void softmax(
         tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode = operation_mode::CHANNEL_WISE
     );
 
-    void softmax_gradient (
+    void softmax_gradient(
         tensor& grad,
         const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode = operation_mode::CHANNEL_WISE
     );
 
     // ------------------------------------------------------------------------------------
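Because the header gives the new mode parameter a default of operation_mode::CHANNEL_WISE, existing call sites should keep compiling and behaving as before, while new code opts into the plane-wise behaviour explicitly. A hedged sketch of the two call forms follows (the wrapper function and its arguments are illustrative, and it assumes operation_mode lives in the dlib namespace, as the diff suggests):

#include <dlib/dnn.h>

// The first call relies on the CHANNEL_WISE default; the second selects the
// new row-wise-per-plane behaviour introduced by this commit.
void run_both_modes(dlib::resizable_tensor& dest, const dlib::resizable_tensor& src)
{
    dlib::cpu::softmax(dest, src);                                     // unchanged call sites
    dlib::cpu::softmax(dest, src, dlib::operation_mode::PLANE_WISE);   // new behaviour
}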
