[HDRP] Fix a performance regression introduced in PR 39296

EvgeniiG · Evergreen · commit c17109135723 · 2024-02-07T17:25:07.000Z
Fix a performance regression introduced in https://github.cds.internal.unity3d.com/unity/unity/pull/39296. The fix introduces scalarization of area lighting code in forward lighting shaders. Both the issue and the fix are console-only.
diff --git a/Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.hlsl b/Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.hlsl
@@ -639,7 +639,7 @@ void LightLoop( float3 V, PositionInputs posInput, PreLightData preLightData, BS
 #if SHADEROPTIONS_AREA_LIGHTS
     if (featureFlags & LIGHTFEATUREFLAGS_AREA)
     {
-        uint lightCount, lightStart;
+        uint lightCount, lightStart; // Start is the offset specific to the tile (or cluster)
 
     #ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
         GetCountAndStart(posInput, LIGHTCATEGORY_AREA, lightStart, lightCount);
@@ -648,14 +648,57 @@ void LightLoop( float3 V, PositionInputs posInput, PreLightData preLightData, BS
         lightStart = _PunctualLightCount;
     #endif
 
-        for (i = 0; i < lightCount; i++)
+        bool fastPath = false;
+    #if SCALARIZE_LIGHT_LOOP
+        uint lightStartLane0;
+        fastPath = IsFastPath(lightStart, lightStartLane0); // True if all pixels belong to the same tile (or cluster)
+
+        if (fastPath)
+        {
+            lightStart = lightStartLane0;
+        }
+    #endif
+
+        // Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the ones relevant to the current thread/pixel are processed.
+        // For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
+        // v_ are variables that might have different value for each thread in the wave (meant for vector registers).
+        // This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
+        // Note that the above is valid only if wave intrinsics are supported.
+        uint v_lightListOffset = 0;
+        uint v_lightIdx = lightStart;
+
+#if NEED_TO_CHECK_HELPER_LANE
+        // On some platforms helper lanes don't behave as we'd expect, therefore we prevent them from entering the loop altogether.
+        // IMPORTANT! This has implications if ddx/ddy is used on results derived from lighting, however given Lightloop is called in compute we should be
+        // sure it will not happen.
+        bool isHelperLane = WaveIsHelperLane();
+        while (!isHelperLane && v_lightListOffset < lightCount)
+#else
+        while (v_lightListOffset < lightCount)
+#endif
         {
-            LightData lightData = FetchLight(lightStart, i);
+            v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
+#if SCALARIZE_LIGHT_LOOP
+            uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
+#else
+            uint s_lightIdx = v_lightIdx;
+#endif
+            if (s_lightIdx == -1)
+                break;
 
-            if (IsMatchingLightLayer(lightData.lightLayers, builtinData.renderingLayers))
+            LightData s_lightData = FetchLight(s_lightIdx);
+
+            // If current scalar and vector light index match, we process the light. The v_lightListOffset for current thread is increased.
+            // Note that the following should really be ==, however, since helper lanes are not considered by WaveActiveMin, such helper lanes could
+            // end up with a unique v_lightIdx value that is smaller than s_lightIdx hence being stuck in a loop. All the active lanes will not have this problem.
+            if (s_lightIdx >= v_lightIdx)
             {
-                DirectLighting lighting = EvaluateBSDF_Area(context, V, posInput, preLightData, lightData, bsdfData, builtinData);
-                AccumulateDirectLighting(lighting, aggregateLighting);
+                v_lightListOffset++;
+                if (IsMatchingLightLayer(s_lightData.lightLayers, builtinData.renderingLayers))
+                {
+                    DirectLighting lighting = EvaluateBSDF_Area(context, V, posInput, preLightData, s_lightData, bsdfData, builtinData);
+                    AccumulateDirectLighting(lighting, aggregateLighting);
+                }
             }
         }
     }