Workaround for packHalf2x16 precision issues on some Android devices (#8121)

mvaligursky · Martin Valigursky · web-flow · commit 7ee986fc508b · 2025-11-12T18:11:09.000Z
* Workaround for packHalf2x16 precision issues on some Android devices

* tabs

---------

Co-authored-by: Martin Valigursky <mvaligursky@snapchat.com>
diff --git a/src/platform/graphics/graphics-device.js b/src/platform/graphics/graphics-device.js
@@ -545,6 +545,12 @@ class GraphicsDevice extends EventHandler {
         if (this.textureFloatFilterable) capsDefines.set('CAPS_TEXTURE_FLOAT_FILTERABLE', '');
         if (this.textureFloatRenderable) capsDefines.set('CAPS_TEXTURE_FLOAT_RENDERABLE', '');
         if (this.supportsMultiDraw) capsDefines.set('CAPS_MULTI_DRAW', '');
+
+        // Platform defines
+        if (platform.desktop) capsDefines.set('PLATFORM_DESKTOP', '');
+        if (platform.mobile) capsDefines.set('PLATFORM_MOBILE', '');
+        if (platform.android) capsDefines.set('PLATFORM_ANDROID', '');
+        if (platform.ios) capsDefines.set('PLATFORM_IOS', '');
     }
 
     /**
diff --git a/src/scene/shader-lib/glsl/chunks/gsplat/frag/gsplatCopyToWorkbuffer.js b/src/scene/shader-lib/glsl/chunks/gsplat/frag/gsplatCopyToWorkbuffer.js
@@ -8,6 +8,7 @@ export default /* glsl */`
 #include "gsplatEvalSHVS"
 #include "gsplatQuatToMat3VS"
 #include "gsplatSourceFormatVS"
+#include "packHalfPS"
 
 uniform mat4 uTransform;
 
@@ -120,8 +121,8 @@ void main(void) {
             pcFragColor0 = color;
         #endif
         #ifndef GSPLAT_COLOR_ONLY
-            pcFragColor1 = uvec4(floatBitsToUint(modelCenter.x), floatBitsToUint(modelCenter.y), floatBitsToUint(modelCenter.z), packHalf2x16(vec2(covA.z, covB.z)));
-            pcFragColor2 = uvec2(packHalf2x16(covA.xy), packHalf2x16(covB.xy));
+            pcFragColor1 = uvec4(floatBitsToUint(modelCenter.x), floatBitsToUint(modelCenter.y), floatBitsToUint(modelCenter.z), packHalf2x16Safe(vec2(covA.z, covB.z)));
+            pcFragColor2 = uvec2(packHalf2x16Safe(covA.xy), packHalf2x16Safe(covB.xy));
         #endif
     }
 }
diff --git a/src/scene/shader-lib/glsl/chunks/gsplat/vert/gsplatCompressedData.js b/src/scene/shader-lib/glsl/chunks/gsplat/vert/gsplatCompressedData.js
@@ -1,4 +1,6 @@
 export default /* glsl */`
+#include "gsplatPackingPS"
+
 uniform highp usampler2D packedTexture;
 uniform highp sampler2D chunkTexture;
 
@@ -18,15 +20,6 @@ vec3 unpack111011(uint bits) {
     );
 }
 
-vec4 unpack8888(uint bits) {
-    return vec4(
-        float(bits >> 24u) / 255.0,
-        float((bits >> 16u) & 0xffu) / 255.0,
-        float((bits >> 8u) & 0xffu) / 255.0,
-        float(bits & 0xffu) / 255.0
-    );
-}
-
 const float norm = sqrt(2.0);
 
 vec4 unpackRotation(uint bits) {
diff --git a/src/scene/shader-lib/glsl/chunks/internal/frag/packHalf.js b/src/scene/shader-lib/glsl/chunks/internal/frag/packHalf.js
@@ -0,0 +1,100 @@
+// Generic half-float packing with software fallback for subnormals
+// Addresses vendor differences in packHalf2x16 subnormal handling (e.g., Adreno (TM) 750 on Samsung Galaxy S24)
+export default /* glsl */`
+
+#if defined(PLATFORM_ANDROID)
+
+    // Software f32 -> f16 conversion (result in the low 16 bits). Round-to-nearest-even, subnormals kept.
+    uint floatToHalf(float a) {
+        uint u    = floatBitsToUint(a);
+        uint sign = (u >> 16u) & 0x8000u;           // f32 sign bit moved to the f16 sign position
+        uint absu = u & 0x7FFFFFFFu;
+        uint man  = u & 0x007FFFFFu;                // 23-bit f32 mantissa
+        int  e32  = int((u >> 23u) & 0xFFu) - 127;  // unbiased f32 exponent
+        
+        // NaN / Inf
+        if ((absu & 0x7F800000u) == 0x7F800000u) {
+            bool isnan = (man != 0u);
+            return sign | (isnan ? 0x7E00u : 0x7C00u);
+        }
+        
+        // Overflow to Inf
+        if (e32 > 15) return sign | 0x7C00u;
+        
+        // Normal half
+        if (e32 >= -14) {
+            uint he  = uint(e32 + 15);
+            uint hm  = man >> 13u;
+            uint rem = man & 0x1FFFu;               // the 13 mantissa bits that f16 cannot hold
+            uint add = (rem > 0x1000u || (rem == 0x1000u && (hm & 1u) == 1u)) ? 1u : 0u;
+            hm = hm + add; // deliberately not masked to 10 bits: bit 10 must survive for the carry test below
+            if ((hm & 0x400u) != 0u) {
+                hm = 0u; he = he + 1u; // mantissa rounded up to 2.0: carry into the exponent
+                if (he >= 31u) return sign | 0x7C00u;
+            }
+            return sign | (he << 10u) | hm;
+        }
+        
+        // Subnormals. e32 == -25 is included: values above 2^-25 round up to the smallest subnormal.
+        if (e32 >= -25) {
+            uint s      = uint(-(e32 + 1));         // right shift in the range 14..24
+            uint mnorm  = 0x00800000u | man;        // mantissa with the implicit leading one restored
+            uint hm     = mnorm >> s;
+            uint mask   = (1u << s) - 1u;
+            uint rem    = mnorm & mask;
+            uint halfBt = 1u << (s - 1u);           // exactly half of one f16 subnormal step
+            uint add    = (rem > halfBt || (rem == halfBt && (hm & 1u) == 1u)) ? 1u : 0u;
+            hm = hm + add;
+            if (hm >= 0x400u) return sign | (1u << 10u); // rounded up into the smallest normal
+            return sign | hm;
+        }
+        
+        // Underflow to signed zero
+        return sign;
+    }
+
+    // Packs v as two f16 values into one uint (x in the low 16 bits, y in the high 16 bits).
+    // Pairs containing a value below the f16 normal range are converted in software.
+    uint packHalf2x16Safe(vec2 v) {
+        // Raw IEEE-754 float32 bit patterns of the two components.
+        uint bitsX = floatBitsToUint(v.x);
+        uint bitsY = floatBitsToUint(v.y);
+        
+        // Unbiased exponents: the 8 exponent bits minus the float32 bias of 127.
+        // Zero and float32 denormals yield -127; Inf and NaN yield 128.
+        int expX = int((bitsX >> 23u) & 0xFFu) - 127;
+        int expY = int((bitsY >> 23u) & 0xFFu) - 127;
+        
+        // -------------------------------------------------------------------------
+        // An exponent of -14 or more means |value| >= 2^-14 (about 6.1035e-5), the
+        // smallest normal float16. Anything smaller becomes a float16 subnormal or
+        // zero, which is the range where the builtin was found to be unreliable
+        // (e.g. Adreno 750 on Samsung Galaxy S24).
+        // -------------------------------------------------------------------------
+        bool bothNormal = (expX >= -14) && (expY >= -14);
+        
+        // Common case: hand the whole pair to the hardware builtin.
+        if (bothNormal) {
+            return packHalf2x16(v);
+        }
+        
+        // -------------------------------------------------------------------------
+        // At least one component is tiny. BOTH components are then converted by
+        // floatToHalf, so a normal-range value paired with a tiny one also takes
+        // the software path.
+        // -------------------------------------------------------------------------
+        uint lowBits  = floatToHalf(v.x);
+        uint highBits = floatToHalf(v.y);
+        
+        return (highBits << 16u) | lowBits;
+    }
+
+#else
+
+    // PLATFORM_ANDROID not defined: plain pass-through to the builtin, so callers can always use the Safe name.
+    uint packHalf2x16Safe(vec2 v) {
+        return packHalf2x16(v);
+    }
+
+#endif
+`;
diff --git a/src/scene/shader-lib/glsl/collections/shader-chunks-glsl.js b/src/scene/shader-lib/glsl/collections/shader-chunks-glsl.js
@@ -89,6 +89,7 @@ import normalMapPS from '../chunks/standard/frag/normalMap.js';
 import opacityPS from '../chunks/standard/frag/opacity.js';
 import opacityDitherPS from '../chunks/standard/frag/opacity-dither.js';
 import outputPS from '../chunks/lit/frag/output.js';
+import packHalfPS from '../chunks/internal/frag/packHalf.js';
 import outputAlphaPS from '../chunks/lit/frag/outputAlpha.js';
 import outputTex2DPS from '../chunks/common/frag/outputTex2D.js';
 import sheenPS from '../chunks/standard/frag/sheen.js';
@@ -260,6 +261,7 @@ const shaderChunksGLSL = {
     opacityDitherPS,
     outputPS,
     outputAlphaPS,
+    packHalfPS,
     outputTex2DPS,
     sheenPS,
     sheenGlossPS,
diff --git a/src/scene/shader-lib/wgsl/chunks/gsplat/frag/gsplatCopyToWorkbuffer.js b/src/scene/shader-lib/wgsl/chunks/gsplat/frag/gsplatCopyToWorkbuffer.js
@@ -8,6 +8,7 @@ export default /* wgsl */`
 #include "gsplatEvalSHVS"
 #include "gsplatQuatToMat3VS"
 #include "gsplatSourceFormatVS"
+#include "packHalfPS"
 
 uniform uTransform: mat4x4f;
 
@@ -110,8 +111,8 @@ fn fragmentMain(input: FragmentInput) -> FragmentOutput {
         // write out results
         output.color = color;
         #ifndef GSPLAT_COLOR_ONLY
-            output.color1 = vec4u(bitcast<u32>(modelCenter.x), bitcast<u32>(modelCenter.y), bitcast<u32>(modelCenter.z), pack2x16float(vec2f(covA.z, covB.z)));
-            output.color2 = vec2u(pack2x16float(covA.xy), pack2x16float(covB.xy));
+            output.color1 = vec4u(bitcast<u32>(modelCenter.x), bitcast<u32>(modelCenter.y), bitcast<u32>(modelCenter.z), pack2x16floatSafe(vec2f(covA.z, covB.z)));
+            output.color2 = vec2u(pack2x16floatSafe(covA.xy), pack2x16floatSafe(covB.xy));
         #endif
     }
     
diff --git a/src/scene/shader-lib/wgsl/chunks/gsplat/vert/gsplatCompressedData.js b/src/scene/shader-lib/wgsl/chunks/gsplat/vert/gsplatCompressedData.js
@@ -1,4 +1,6 @@
 export default /* wgsl */`
+#include "gsplatPackingPS"
+
 var packedTexture: texture_2d<u32>;
 var chunkTexture: texture_2d<uff>;
 
@@ -14,15 +16,6 @@ fn unpack111011(bits: u32) -> vec3f {
     return (vec3f((vec3<u32>(bits) >> vec3<u32>(21u, 11u, 0u)) & vec3<u32>(0x7ffu, 0x3ffu, 0x7ffu))) / vec3f(2047.0, 1023.0, 2047.0);
 }
 
-fn unpack8888(bits: u32) -> vec4f {
-    return vec4f(
-        f32((bits >> 24u) & 0xffu),
-        f32((bits >> 16u) & 0xffu),
-        f32((bits >> 8u)  & 0xffu),
-        f32(bits         & 0xffu)
-    ) / 255.0;
-}
-
 const norm_const: f32 = sqrt(2.0);
 
 fn unpackRotation(bits: u32) -> vec4f {
diff --git a/src/scene/shader-lib/wgsl/chunks/internal/frag/packHalf.js b/src/scene/shader-lib/wgsl/chunks/internal/frag/packHalf.js
@@ -0,0 +1,99 @@
+// Generic half-float packing with software fallback for subnormals
+// Addresses vendor differences in pack2x16float subnormal handling (e.g., Adreno (TM) 750 on Samsung Galaxy S24)
+export default /* wgsl */`
+
+#if defined(PLATFORM_ANDROID)
+
+    // Software f32 -> f16 conversion (result in the low 16 bits). Round-to-nearest-even, subnormals kept.
+    fn floatToHalf(a: f32) -> u32 {
+        let u: u32    = bitcast<u32>(a);
+        let sign: u32 = (u >> 16u) & 0x8000u;           // f32 sign bit moved to the f16 sign position
+        let absu: u32 = u & 0x7FFFFFFFu;
+        let man: u32  = u & 0x007FFFFFu;                // 23-bit f32 mantissa
+        let e32: i32  = i32((u >> 23u) & 0xFFu) - 127;  // unbiased f32 exponent
+        
+        // NaN / Inf
+        if ((absu & 0x7F800000u) == 0x7F800000u) {
+            let isnan = (man != 0u);
+            return sign | select(0x7C00u, 0x7E00u, isnan);
+        }
+        
+        // Overflow to Inf
+        if (e32 > 15) { return sign | 0x7C00u; }
+        
+        // Normal half
+        if (e32 >= -14) {
+            var he: u32 = u32(e32 + 15);
+            var hm: u32 = man >> 13u;
+            let rem: u32 = man & 0x1FFFu;               // the 13 mantissa bits that f16 cannot hold
+            let add: u32 = select(0u, 1u, (rem > 0x1000u) || (rem == 0x1000u && (hm & 1u) == 1u));
+            hm = hm + add; // deliberately not masked to 10 bits: bit 10 must survive for the carry test below
+            if ((hm & 0x400u) != 0u) {
+                hm = 0u; he = he + 1u; // mantissa rounded up to 2.0: carry into the exponent
+                if (he >= 31u) { return sign | 0x7C00u; }
+            }
+            return sign | (he << 10u) | hm;
+        }
+        
+        // Subnormals. e32 == -25 is included: values above 2^-25 round up to the smallest subnormal.
+        if (e32 >= -25) {
+            let s: u32      = u32(-(e32 + 1));          // right shift in the range 14..24
+            let mnorm: u32  = 0x00800000u | man;        // mantissa with the implicit leading one restored
+            var hm: u32     = mnorm >> s;
+            let mask: u32   = (1u << s) - 1u;
+            let rem: u32    = mnorm & mask;
+            let halfBt: u32 = 1u << (s - 1u);           // exactly half of one f16 subnormal step
+            let add: u32    = select(0u, 1u, (rem > halfBt) || (rem == halfBt && (hm & 1u) == 1u));
+            hm = hm + add;
+            if (hm >= 0x400u) { return sign | (1u << 10u); } // rounded up into the smallest normal
+            return sign | hm;
+        }
+        
+        return sign; // underflow to signed zero
+    }
+
+    // Packs v as two f16 values into one u32 (x in the low 16 bits, y in the high 16 bits).
+    // Pairs containing a value below the f16 normal range are converted in software.
+    fn pack2x16floatSafe(v: vec2f) -> u32 {
+        // Raw IEEE-754 float32 bit patterns of the two components.
+        let bitsX: u32 = bitcast<u32>(v.x);
+        let bitsY: u32 = bitcast<u32>(v.y);
+        
+        // Unbiased exponents: the 8 exponent bits minus the float32 bias of 127.
+        // Zero and float32 denormals yield -127; Inf and NaN yield 128.
+        let expX: i32 = i32((bitsX >> 23u) & 0xFFu) - 127;
+        let expY: i32 = i32((bitsY >> 23u) & 0xFFu) - 127;
+        
+        // -------------------------------------------------------------------------
+        // An exponent of -14 or more means |value| >= 2^-14 (about 6.1035e-5), the
+        // smallest normal float16. Anything smaller becomes a float16 subnormal or
+        // zero, which is the range where the builtin was found to be unreliable
+        // (e.g. Adreno 750 on Samsung Galaxy S24).
+        // -------------------------------------------------------------------------
+        let bothNormal: bool = (expX >= -14) && (expY >= -14);
+        
+        // Common case: hand the whole pair to the hardware builtin.
+        if (bothNormal) {
+            return pack2x16float(v);
+        }
+        
+        // -------------------------------------------------------------------------
+        // At least one component is tiny. BOTH components are then converted by
+        // floatToHalf, so a normal-range value paired with a tiny one also takes
+        // the software path.
+        // -------------------------------------------------------------------------
+        let lowBits: u32  = floatToHalf(v.x);
+        let highBits: u32 = floatToHalf(v.y);
+        
+        return (highBits << 16u) | lowBits;
+    }
+
+#else
+
+    // PLATFORM_ANDROID not defined: plain pass-through to the builtin, so callers can always use the Safe name.
+    fn pack2x16floatSafe(v: vec2f) -> u32 {
+        return pack2x16float(v);
+    }
+
+#endif
+`;
diff --git a/src/scene/shader-lib/wgsl/collections/shader-chunks-wgsl.js b/src/scene/shader-lib/wgsl/collections/shader-chunks-wgsl.js
@@ -89,6 +89,7 @@ import opacityPS from '../chunks/standard/frag/opacity.js';
 import opacityDitherPS from '../chunks/standard/frag/opacity-dither.js';
 import outputPS from '../chunks/lit/frag/output.js';
 import outputAlphaPS from '../chunks/lit/frag/outputAlpha.js';
+import packHalfPS from '../chunks/internal/frag/packHalf.js';
 import outputTex2DPS from '../chunks/common/frag/outputTex2D.js';
 import sheenPS from '../chunks/standard/frag/sheen.js';
 import sheenGlossPS from '../chunks/standard/frag/sheenGloss.js';
@@ -258,6 +259,7 @@ const shaderChunksWGSL = {
     opacityDitherPS,
     outputPS,
     outputAlphaPS,
+    packHalfPS,
     outputTex2DPS,
     sheenPS,
     sheenGlossPS,