4D Morton code leads to much better batching of triangles

devshgraphicsprogramming · devshgraphicsprogramming · commit 3f0313567c34 · 2021-05-08T17:38:16.000+02:00
diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
@@ -218,20 +218,18 @@ class IMeshPacker : public IMeshPackerBase
 
             MortonTriangle(uint16_t fixedPointPos[3], float area)
             {
-                key = core::Float16Compressor::compress(area);
-                key <<= 48ull;
-                
-                key |= core::morton3d_encode(fixedPointPos[0], fixedPointPos[1], fixedPointPos[2]);
+                auto tmp = reinterpret_cast<uint16_t*>(&key); // view `key` as four 16-bit lanes; take its ADDRESS, casting its (uninitialised) value would yield a wild pointer
+                std::copy_n(fixedPointPos,3u,tmp); // lanes 0..2: the three position components, kept raw until complete() encodes them
+                tmp[3] = core::Float16Compressor::compress(area); // lane 3: compressed area, read back by complete()
             }
 
-            //TODO: maybe investigate morton 4d, where `logRelArea` is "4th" coord
             void complete(float maxArea)
             {
-                const float area = core::Float16Compressor::decompress(key >> 48ull);
-                key &= 0x0000ffffFFFFffffu;
-                const float scale = -0.5f; // square root
-                uint64_t logRelArea = uint64_t(65535.5f - core::clamp(scale * std::log2f(area / maxArea), 0.f, 65535.5f));
-                key |= logRelArea << 48ull;
+                auto tmp = reinterpret_cast<const uint16_t*>(&key); // same four-lane view the constructor wrote through; must point AT `key`, not be made from its value
+                const float area = core::Float16Compressor::decompress(tmp[3]); // lane 3 still holds the compressed area stored by the constructor
+                const float scale = 0.5f; // square root
+                uint16_t logRelArea = uint16_t(65535.5f+core::clamp(scale*std::log2f(area/maxArea),-65535.5f,0.f)); // clamp bounds the sum to [0,65535.5], so the truncating cast fits 16 bits
+                key = core::morton4d_encode<uint64_t>(tmp[0],tmp[1],tmp[2],logRelArea); // explicit uint64_t: deducing T from the uint16_t arguments would produce a 16-bit code using only 4 bits per coordinate
             }
 
             uint64_t key;
diff --git a/include/nbl/core/math/morton.h b/include/nbl/core/math/morton.h
@@ -16,7 +16,7 @@ namespace core
 namespace impl
 {
     template <typename T>
-    constexpr T morton2d_mask(uint32_t _n)
+    constexpr T morton2d_mask(uint8_t _n)
     {
         constexpr uint64_t mask[5] =
         {
@@ -28,6 +28,31 @@ namespace impl
         };
         return static_cast<T>(mask[_n]);
     }
+    template <typename T>
+    constexpr T morton3d_mask(uint8_t _n) // returns the mask for spreading stage `_n` (0 = final shift-by-2 stage ... 4 = first shift-by-32 stage), truncated to T; `_n` must be < 5
+    {
+        constexpr uint64_t mask[5] =
+        {
+            0x1249249249249249ull, // every 3rd bit (bits 0,3,...,60)
+            0x10C30C30C30C30C3ull, // 2-bit groups every 6 bits, plus bit 60
+            0x100F00F00F00F00Full, // 4-bit groups every 12 bits, plus bit 60 (input bit 20 after the <<8 stage); was 0x010F..., which dropped that bit and kept a stray copy of input bit 16
+            0x001F0000FF0000FFull, // 8-bit groups every 24 bits, plus bits 48..52
+            0x001F00000000FFFFull // low 16 bits, plus bits 48..52
+        };
+        return static_cast<T>(mask[_n]);
+    }
+    template <typename T>
+    constexpr T morton4d_mask(uint8_t _n) // returns the mask for 4D spreading stage `_n`, truncated to T; `_n` indexes a 4-entry table and is not range-checked
+    {
+        constexpr uint64_t mask[4] =
+        {
+            0x1111111111111111ull, // every 4th bit
+            0x0303030303030303ull, // low 2 bits of every byte
+            0x000F000F000F000Full, // low 4 bits of every 16-bit word
+            0x000000FF000000FFull // low 8 bits of every 32-bit word
+        };
+        return static_cast<T>(mask[_n]);
+    }
 
     template <typename T, uint32_t bitDepth>
     inline T morton2d_decode(T x)
@@ -58,7 +83,7 @@ namespace impl
         {
             x = (x | (x << 16)) & morton2d_mask<T>(4);
         }
-        if constexpr (bitDepth > 16u)
+        if constexpr (bitDepth>16u)
         {
             x = (x | (x << 8)) & morton2d_mask<T>(3);
         }
@@ -71,15 +96,43 @@ namespace impl
 
         return x;
     }
+    template <typename T, uint32_t bitDepth>
+    inline T separate_bits_3d(T x) // spreads the bits of `x` apart via shift-or-mask stages using morton3d_mask; the last stage's mask keeps every 3rd bit
+    {
+        if constexpr (bitDepth>32u) // wider stages are compiled in only when bitDepth exceeds the threshold
+        {
+            x = (x | (x << 32)) & morton3d_mask<T>(4);
+        }
+        if constexpr (bitDepth>16u)
+        {
+            x = (x | (x << 16)) & morton3d_mask<T>(3);
+        }
+        if constexpr (bitDepth>8u)
+        {
+            x = (x | (x << 8)) & morton3d_mask<T>(2);
+        }
+        x = (x | (x << 4)) & morton3d_mask<T>(1); // the two finest stages always run
+        x = (x | (x << 2)) & morton3d_mask<T>(0); // NOTE(review): `x` is never pre-masked here — presumably callers pass values that already fit the input width; confirm
 
-    inline uint64_t separate_bits_3d(uint64_t x)
+        return x;
+    }
+    template <typename T, uint32_t bitDepth>
+    inline T separate_bits_4d(T x) // spreads the bits of `x` apart via shift-or-mask stages using morton4d_mask; the last stage's mask keeps every 4th bit
     {
-        x &= 0x00000000001fffff;
-        x = (x | x << 32) & 0x001f00000000ffff;
-        x = (x | x << 16) & 0x001f0000ff0000ff;
-        x = (x | x << 8) & 0x010f00f00f00f00f;
-        x = (x | x << 4) & 0x10c30c30c30c30c3;
-        x = (x | x << 2) & 0x1249249249249249;
+        if constexpr (bitDepth>32u) // wider stages are compiled in only when bitDepth exceeds the threshold
+        {
+            x = (x | (x << 24)) & morton4d_mask<T>(3); // moves the second input byte up to bits 32..39
+        }
+        if constexpr (bitDepth>16u)
+        {
+            x = (x | (x << 12)) & morton4d_mask<T>(2); // splits each byte into nibbles 16 bits apart
+        }
+        if constexpr (bitDepth>8u)
+        {
+            x = (x | (x << 6)) & morton4d_mask<T>(1); // splits each nibble into bit pairs 8 bits apart
+        }
+        x = (x | (x << 3)) & morton4d_mask<T>(0); // final stage always runs: splits each pair into single bits 4 apart
+
         return x;
     }
 }
@@ -91,8 +144,10 @@ T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T,bitDepth>(_morto
 
 template<typename T, uint32_t bitDepth=sizeof(T)*8u>
 T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T,bitDepth>(x) | (impl::separate_bits_2d<T,bitDepth>(y)<<1); }
-
-inline uint64_t morton3d_encode(uint64_t x, uint64_t y, uint64_t z) { return impl::separate_bits_3d(x) | (impl::separate_bits_3d(y) << 1) | (impl::separate_bits_3d(z) << 2); }
+template<typename T, uint32_t bitDepth=sizeof(T)*8u> // bitDepth defaults to the bit width of T; T is deduced from the arguments unless given explicitly
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T,bitDepth>(x) | (impl::separate_bits_3d<T,bitDepth>(y)<<1) | (impl::separate_bits_3d<T,bitDepth>(z)<<2); } // interleaves x,y,z: x in the lowest bit of each triple, then y, then z
+template<typename T, uint32_t bitDepth=sizeof(T)*8u> // bitDepth defaults to the bit width of T; T is deduced from the arguments unless given explicitly, so pass the desired code type when the inputs are narrower
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T,bitDepth>(x) | (impl::separate_bits_4d<T,bitDepth>(y)<<1) | (impl::separate_bits_4d<T,bitDepth>(z)<<2) | (impl::separate_bits_4d<T,bitDepth>(w)<<3); } // interleaves x,y,z,w: x in the lowest bit of each nibble, w in the highest
 
 }}