Make Stats.add() aggregate the recent write load the same way as the overall write load, i.e. as an average weighted by active time rather than a sum - and temporarily log the intermediate values

PeteGillinElastic · PeteGillinElastic · commit bb71dd37cda0 · 2025-03-13T17:52:00.000Z
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexingStats.java b/server/src/main/java/org/elasticsearch/index/shard/IndexingStats.java
@@ -9,6 +9,8 @@
 
 package org.elasticsearch.index.shard;
 
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.elasticsearch.TransportVersion;
 import org.elasticsearch.TransportVersions;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -29,6 +31,8 @@
 
 public class IndexingStats implements Writeable, ToXContentFragment {
 
+    private static final Logger logger = LogManager.getLogger(IndexingStats.class);
+
     public static class Stats implements Writeable, ToXContentFragment {
         private static final TransportVersion WRITE_LOAD_AVG_SUPPORTED_VERSION = TransportVersions.V_8_6_0;
 
@@ -99,15 +103,10 @@ public Stats(
             this.noopUpdateCount = noopUpdateCount;
             this.isThrottled = isThrottled;
             this.throttleTimeInMillis = throttleTimeInMillis;
-
-            // We store the raw unweighted write load values in order to avoid losing precision when we combine the shard stats.
-            // N.B. In add(Stats) we sum both of these, so getWriteLoad() will return the ratio of the sums, which is a weighted average of
-            // the ratios we would get for each shard.
+            // We store the raw unweighted write load values in order to avoid losing precision when we combine the shard stats
             this.totalIndexingTimeSinceShardStartedInNanos = totalIndexingTimeSinceShardStartedInNanos;
             this.totalActiveTimeInNanos = totalActiveTimeInNanos;
-
-            // We store the exponentially weighted write load value as a double. N.B. In add(Stats) we add these, and getRecentWriteLoad()
-            // will return that sum.
+            // We store the weighted write load as a double because the calculation is inherently floating point
             this.recentIndexingLoad = recentIndexingLoad;
         }
 
@@ -127,9 +126,34 @@ public void add(Stats stats) {
             if (isThrottled != stats.isThrottled) {
                 isThrottled = true; // When combining if one is throttled set result to throttled.
             }
+            // TODO(pete): Remove logging of sums
+            long tmpNum = totalIndexingTimeSinceShardStartedInNanos;
+            long tmpDen = totalActiveTimeInNanos;
+            double tmpWgt = recentIndexingLoad;
+            // N.B. getWriteLoad() returns the ratio of these sums, which is the average of the ratios weighted by active time:
             totalIndexingTimeSinceShardStartedInNanos += stats.totalIndexingTimeSinceShardStartedInNanos;
             totalActiveTimeInNanos += stats.totalActiveTimeInNanos;
-            recentIndexingLoad += stats.recentIndexingLoad;
+            // We want getRecentWriteLoad() for the aggregated stats to also be the average weighted by active time, so we use the updating
+            // formula for a weighted mean:
+            if (totalActiveTimeInNanos > 0) {
+                recentIndexingLoad += (stats.recentIndexingLoad - recentIndexingLoad) * stats.totalActiveTimeInNanos
+                    / totalActiveTimeInNanos;
+            }
+            logger.info(
+                "***** SUM UNWEIGHTED ({} / {} = {}) + ({} / {} = {}) = ({} + {} = {}) --- WEIGHTED {} + {} = {}",
+                tmpNum * 1.0e6,
+                tmpDen * 1.0e6,
+                1.0 * tmpNum / tmpDen,
+                stats.totalIndexingTimeSinceShardStartedInNanos * 1.0e6,
+                stats.totalActiveTimeInNanos * 1.0e6,
+                1.0 * stats.totalIndexingTimeSinceShardStartedInNanos / stats.totalActiveTimeInNanos,
+                totalIndexingTimeSinceShardStartedInNanos * 1.0e6,
+                totalActiveTimeInNanos * 1.0e6,
+                1.0 * totalIndexingTimeSinceShardStartedInNanos / totalActiveTimeInNanos,
+                tmpWgt,
+                stats.recentIndexingLoad,
+                recentIndexingLoad
+            );
         }
 
         /**
@@ -212,9 +236,8 @@ public long getNoopUpdateCount() {
          * <p>If this {@link Stats} instance represents a single shard, this is ratio of the sum of the time taken by every index operations
          * since the shard started to the elapsed time since the shard started.
          *
-         * <p>If this {@link Stats} instance represents multiple shards, this is the <b>average</b> that ratio for each shard, weighted by
-         * the elapsed time for each shard. N.B. This is a different behaviour to the {@link #getRecentWriteLoad()} method, which returns a
-         * sum over the shards.
+         * <p>If this {@link Stats} instance represents multiple shards, this is the average of that ratio for each shard, weighted by
+         * the elapsed time for each shard.
          */
         // TODO(pete): See which callers of this should be changed to use getRecentLoad(). Make sure that they are single-shard!
         public double getWriteLoad() {
@@ -227,8 +250,8 @@ public double getWriteLoad() {
          * <p>If this {@link Stats} instance represents a single shard, this is an Exponentially Weighted Moving Rate based on the time
          * taken by indexing operations in this shard since the shard started.
          *
-         * <p>If this {@link Stats} instance represents multiple shards, this is the <b>sum</b> that rate for each shard. N.B. This is a
-         * different behaviour to the {@link #getWriteLoad()} method, which returns an average over the shards.
+         * <p>If this {@link Stats} instance represents multiple shards, this is the average of that rate for each shard, weighted by
+         * the elapsed time for each shard.
          */
         public double getRecentWriteLoad() {
             return recentIndexingLoad;
diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexingStatsTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexingStatsTests.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.shard;
+
+import org.elasticsearch.test.ESTestCase;
+
+import static org.hamcrest.Matchers.closeTo;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
+
+public class IndexingStatsTests extends ESTestCase {
+
+    private static final double DOUBLE_TOLERANCE = 1.0e-10;
+
+    public void testStatsGetWriteLoad() {
+        IndexingStats.Stats stats = new IndexingStats.Stats(
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            false,
+            10,
+            1_800_000_000L, // totalIndexingTimeSinceShardStartedInNanos - 1.8sec
+            3_000_000_000L, // totalActiveTimeInNanos - 3sec
+            0.1357 // recentWriteLoad
+        );
+        assertThat(stats.getWriteLoad(), closeTo(0.6, DOUBLE_TOLERANCE));
+    }
+
+    public void testStatsAdd_indexCount() {
+        IndexingStats.Stats stats1 = new IndexingStats.Stats(
+            1001L, // indexCount
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            false,
+            10,
+            11,
+            12,
+            0.1357
+        );
+        IndexingStats.Stats stats2 = new IndexingStats.Stats(
+            2001L, // indexCount
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            false, // isThrottled
+            10,
+            11,
+            12,
+            0.1357
+        );
+        IndexingStats.Stats statsAgg = sumOfStats(stats1, stats2);
+        assertThat(statsAgg.getIndexCount(), equalTo(1001L + 2001L));
+    }
+
+    public void testStatsAdd_throttled() {
+        IndexingStats.Stats statsFalse = new IndexingStats.Stats(1, 2, 3, 4, 5, 6, 7, 8, 9, false, 10, 11, 12, 0.1357);
+        IndexingStats.Stats statsTrue = new IndexingStats.Stats(1, 2, 3, 4, 5, 6, 7, 8, 9, true, 10, 11, 12, 0.1357);
+        assertThat(sumOfStats(statsFalse, statsFalse).isThrottled(), is(false));
+        assertThat(sumOfStats(statsFalse, statsTrue).isThrottled(), is(true));
+        assertThat(sumOfStats(statsTrue, statsFalse).isThrottled(), is(true));
+        assertThat(sumOfStats(statsTrue, statsTrue).isThrottled(), is(true));
+    }
+
+    public void testStatsAdd_writeLoads() {
+        IndexingStats.Stats stats1 = new IndexingStats.Stats(
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            false,
+            10,
+            1_000_000_000L, // totalIndexingTimeSinceShardStartedInNanos - 1sec
+            2_000_000_000L, // totalActiveTimeInNanos - 2sec
+            0.1357 // recentWriteLoad
+        );
+        IndexingStats.Stats stats2 = new IndexingStats.Stats(
+            2,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            false, // isThrottled
+            10,
+            2_100_000_000L, // totalIndexingTimeSinceShardStartedInNanos - 2.1sec
+            3_000_000_000L, // totalActiveTimeInNanos - 3sec
+            0.2468 // recentWriteLoad
+        );
+        IndexingStats.Stats statsAgg = sumOfStats(stats1, stats2);
+        // The unweighted write loads for the two shards are 0.5 (1sec / 2sec) and 0.7 (2.1sec / 3sec) respectively.
+        // The aggregated value should be the average weighted by the times, i.e. by 2sec and 3sec, giving weights of 0.4 and 0.6.
+        assertThat(statsAgg.getWriteLoad(), closeTo(0.4 * 0.5 + 0.6 * 0.7, DOUBLE_TOLERANCE));
+        // The aggregated value for the recent write load should be the average with the same weights.
+        assertThat(statsAgg.getRecentWriteLoad(), closeTo(0.4 * 0.1357 + 0.6 * 0.2468, DOUBLE_TOLERANCE));
+    }
+
+    private static IndexingStats.Stats sumOfStats(IndexingStats.Stats stats1, IndexingStats.Stats stats2) {
+        IndexingStats.Stats statsAgg = new IndexingStats.Stats();
+        statsAgg.add(stats1);
+        statsAgg.add(stats2);
+        return statsAgg;
+    }
+}