Skip to content

Commit d76e770

Browse files
committed
[xds] Implement A114: WRR support for custom backend metrics
Updates the Weighted Round Robin (WRR) load balancing policy to support customizable utilization metrics via the `metric_names_for_computing_utilization` configuration. This allows endpoint weights to be driven by arbitrary named metrics (e.g. `named_metrics.foo`) or other standard metrics (e.g. `memory_utilization`) instead of solely `application_utilization` or the `cpu_utilization` fallback. Refactors metric resolution logic into `io.grpc.xds.internal.MetricReportUtils` to handle the new map lookup and validation requirements.
1 parent e636df5 commit d76e770

8 files changed

+631
-78
lines changed

xds/src/main/java/io/grpc/xds/LoadBalancerConfigFactory.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ class LoadBalancerConfigFactory {
9191
static final String SHUFFLE_ADDRESS_LIST_FIELD_NAME = "shuffleAddressList";
9292

9393
static final String ERROR_UTILIZATION_PENALTY = "errorUtilizationPenalty";
94+
static final String METRIC_NAMES_FOR_COMPUTING_UTILIZATION = "metricNamesForComputingUtilization";
9495

9596
/**
9697
* Factory method for creating a new {@link LoadBalancerConfigConverter} for a given xDS {@link
@@ -134,11 +135,9 @@ class LoadBalancerConfigFactory {
134135
* the given config values.
135136
*/
136137
private static ImmutableMap<String, ?> buildWrrConfig(String blackoutPeriod,
137-
String weightExpirationPeriod,
138-
String oobReportingPeriod,
139-
Boolean enableOobLoadReport,
140-
String weightUpdatePeriod,
141-
Float errorUtilizationPenalty) {
138+
String weightExpirationPeriod, String oobReportingPeriod, Boolean enableOobLoadReport,
139+
String weightUpdatePeriod, Float errorUtilizationPenalty,
140+
ImmutableList<String> metricNamesForComputingUtilization) {
142141
ImmutableMap.Builder<String, Object> configBuilder = ImmutableMap.builder();
143142
if (blackoutPeriod != null) {
144143
configBuilder.put(BLACK_OUT_PERIOD, blackoutPeriod);
@@ -158,6 +157,10 @@ class LoadBalancerConfigFactory {
158157
if (errorUtilizationPenalty != null) {
159158
configBuilder.put(ERROR_UTILIZATION_PENALTY, errorUtilizationPenalty);
160159
}
160+
if (metricNamesForComputingUtilization != null
161+
&& !metricNamesForComputingUtilization.isEmpty()) {
162+
configBuilder.put(METRIC_NAMES_FOR_COMPUTING_UTILIZATION, metricNamesForComputingUtilization);
163+
}
161164
return ImmutableMap.of(WeightedRoundRobinLoadBalancerProvider.SCHEME,
162165
configBuilder.buildOrThrow());
163166
}
@@ -284,7 +287,7 @@ static class LoadBalancingPolicyConverter {
284287
}
285288

286289
private static ImmutableMap<String, ?> convertWeightedRoundRobinConfig(
287-
ClientSideWeightedRoundRobin wrr) throws ResourceInvalidException {
290+
ClientSideWeightedRoundRobin wrr) throws ResourceInvalidException {
288291
try {
289292
return buildWrrConfig(
290293
wrr.hasBlackoutPeriod() ? Durations.toString(wrr.getBlackoutPeriod()) : null,
@@ -293,7 +296,8 @@ static class LoadBalancingPolicyConverter {
293296
wrr.hasOobReportingPeriod() ? Durations.toString(wrr.getOobReportingPeriod()) : null,
294297
wrr.hasEnableOobLoadReport() ? wrr.getEnableOobLoadReport().getValue() : null,
295298
wrr.hasWeightUpdatePeriod() ? Durations.toString(wrr.getWeightUpdatePeriod()) : null,
296-
wrr.hasErrorUtilizationPenalty() ? wrr.getErrorUtilizationPenalty().getValue() : null);
299+
wrr.hasErrorUtilizationPenalty() ? wrr.getErrorUtilizationPenalty().getValue() : null,
300+
ImmutableList.copyOf(wrr.getMetricNamesForComputingUtilizationList()));
297301
} catch (IllegalArgumentException ex) {
298302
throw new ResourceInvalidException("Invalid duration in weighted round robin config: "
299303
+ ex.getMessage());

xds/src/main/java/io/grpc/xds/WeightedRoundRobinLoadBalancer.java

Lines changed: 85 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import io.grpc.services.MetricReport;
4141
import io.grpc.util.ForwardingSubchannel;
4242
import io.grpc.util.MultiChildLoadBalancer;
43+
import io.grpc.xds.internal.MetricReportUtils;
4344
import io.grpc.xds.orca.OrcaOobUtil;
4445
import io.grpc.xds.orca.OrcaOobUtil.OrcaOobReportListener;
4546
import io.grpc.xds.orca.OrcaPerRequestUtil;
@@ -49,6 +50,7 @@
4950
import java.util.HashSet;
5051
import java.util.List;
5152
import java.util.Objects;
53+
import java.util.OptionalDouble;
5254
import java.util.Random;
5355
import java.util.Set;
5456
import java.util.concurrent.ScheduledExecutorService;
@@ -87,6 +89,9 @@
8789
* See related documentation: https://cloud.google.com/service-mesh/legacy/load-balancing-apis/proxyless-configure-advanced-traffic-management#custom-lb-config
8890
*/
8991
final class WeightedRoundRobinLoadBalancer extends MultiChildLoadBalancer {
92+
@VisibleForTesting
93+
static boolean enableCustomConfig =
94+
Boolean.parseBoolean(System.getenv("GRPC_EXPERIMENTAL_WRR_CUSTOM_METRICS"));
9095

9196
private static final LongCounterMetricInstrument RR_FALLBACK_COUNTER;
9297
private static final LongCounterMetricInstrument ENDPOINT_WEIGHT_NOT_YET_USEABLE_COUNTER;
@@ -189,7 +194,7 @@ public Status acceptResolvedAddresses(ResolvedAddresses resolvedAddresses) {
189194
this.backendService = "";
190195
}
191196
config =
192-
(WeightedRoundRobinLoadBalancerConfig) resolvedAddresses.getLoadBalancingPolicyConfig();
197+
(WeightedRoundRobinLoadBalancerConfig) resolvedAddresses.getLoadBalancingPolicyConfig();
193198

194199
if (weightUpdateTimer != null && weightUpdateTimer.isPending()) {
195200
weightUpdateTimer.cancel();
@@ -236,7 +241,8 @@ protected void updateOverallBalancingState() {
236241

237242
private SubchannelPicker createReadyPicker(Collection<ChildLbState> activeList) {
238243
WeightedRoundRobinPicker picker = new WeightedRoundRobinPicker(ImmutableList.copyOf(activeList),
239-
config.enableOobLoadReport, config.errorUtilizationPenalty, sequence);
244+
config.enableOobLoadReport, config.errorUtilizationPenalty, sequence,
245+
config.metricNamesForComputingUtilization);
240246
updateWeight(picker);
241247
return picker;
242248
}
@@ -325,12 +331,16 @@ public void addSubchannel(WrrSubchannel wrrSubchannel) {
325331
subchannels.add(wrrSubchannel);
326332
}
327333

328-
public OrcaReportListener getOrCreateOrcaListener(float errorUtilizationPenalty) {
334+
public OrcaReportListener getOrCreateOrcaListener(float errorUtilizationPenalty,
335+
ImmutableList<String> metricNamesForComputingUtilization) {
329336
if (orcaReportListener != null
330-
&& orcaReportListener.errorUtilizationPenalty == errorUtilizationPenalty) {
337+
&& orcaReportListener.errorUtilizationPenalty == errorUtilizationPenalty
338+
&& Objects.equals(orcaReportListener.metricNamesForComputingUtilization,
339+
metricNamesForComputingUtilization)) {
331340
return orcaReportListener;
332341
}
333-
orcaReportListener = new OrcaReportListener(errorUtilizationPenalty);
342+
orcaReportListener =
343+
new OrcaReportListener(errorUtilizationPenalty, metricNamesForComputingUtilization);
334344
return orcaReportListener;
335345
}
336346

@@ -355,18 +365,19 @@ public void updateBalancingState(ConnectivityState newState, SubchannelPicker ne
355365

356366
final class OrcaReportListener implements OrcaPerRequestReportListener, OrcaOobReportListener {
357367
private final float errorUtilizationPenalty;
368+
private final ImmutableList<String> metricNamesForComputingUtilization;
358369

359-
OrcaReportListener(float errorUtilizationPenalty) {
370+
OrcaReportListener(float errorUtilizationPenalty,
371+
ImmutableList<String> metricNamesForComputingUtilization) {
360372
this.errorUtilizationPenalty = errorUtilizationPenalty;
373+
this.metricNamesForComputingUtilization = metricNamesForComputingUtilization;
361374
}
362375

363376
@Override
364377
public void onLoadReport(MetricReport report) {
378+
double utilization = getUtilization(report, metricNamesForComputingUtilization);
379+
365380
double newWeight = 0;
366-
// Prefer application utilization and fallback to CPU utilization if unset.
367-
double utilization =
368-
report.getApplicationUtilization() > 0 ? report.getApplicationUtilization()
369-
: report.getCpuUtilization();
370381
if (utilization > 0 && report.getQps() > 0) {
371382
double penalty = 0;
372383
if (report.getEps() > 0 && errorUtilizationPenalty > 0) {
@@ -383,6 +394,37 @@ public void onLoadReport(MetricReport report) {
383394
lastUpdated = ticker.nanoTime();
384395
weight = newWeight;
385396
}
397+
398+
/**
399+
* Returns the utilization value used for computing the endpoint weight. If the application
400+
* utilization is greater than zero, it is returned. Otherwise, the maximum of the custom
401+
* metrics specified is returned. If none of the custom metrics yields a positive, non-NaN
402+
* value, the CPU utilization is returned.
403+
*/
404+
private double getUtilization(MetricReport report, ImmutableList<String> metricNames) {
405+
double appUtil = report.getApplicationUtilization();
406+
if (appUtil > 0) {
407+
return appUtil;
408+
}
409+
return getCustomMetricUtilization(report, metricNames)
410+
.orElse(report.getCpuUtilization());
411+
}
412+
413+
/**
414+
* Returns the maximum utilization value among the specified metric names.
415+
* Returns OptionalDouble.empty() if NONE of the specified metrics are present in the report,
416+
* or if every present metric is NaN or not strictly positive.
417+
* Returns OptionalDouble.of(maxUtil) if at least one positive, non-NaN metric is present.
418+
*/
419+
private OptionalDouble getCustomMetricUtilization(MetricReport report,
420+
ImmutableList<String> metricNames) {
421+
return metricNames.stream()
422+
.map(name -> MetricReportUtils.getMetric(report, name))
423+
.filter(OptionalDouble::isPresent)
424+
.mapToDouble(OptionalDouble::getAsDouble)
425+
.filter(d -> !Double.isNaN(d) && d > 0)
426+
.max();
427+
}
386428
}
387429
}
388430

@@ -403,10 +445,10 @@ private void createAndApplyOrcaListeners() {
403445
for (WrrSubchannel weightedSubchannel : wChild.subchannels) {
404446
if (config.enableOobLoadReport) {
405447
OrcaOobUtil.setListener(weightedSubchannel,
406-
wChild.getOrCreateOrcaListener(config.errorUtilizationPenalty),
448+
wChild.getOrCreateOrcaListener(config.errorUtilizationPenalty,
449+
config.metricNamesForComputingUtilization),
407450
OrcaOobUtil.OrcaReportingConfig.newBuilder()
408-
.setReportInterval(config.oobReportingPeriodNanos, TimeUnit.NANOSECONDS)
409-
.build());
451+
.setReportInterval(config.oobReportingPeriodNanos, TimeUnit.NANOSECONDS).build());
410452
} else {
411453
OrcaOobUtil.setListener(weightedSubchannel, null, null);
412454
}
@@ -473,7 +515,8 @@ static final class WeightedRoundRobinPicker extends SubchannelPicker {
473515
private volatile StaticStrideScheduler scheduler;
474516

475517
WeightedRoundRobinPicker(List<ChildLbState> children, boolean enableOobLoadReport,
476-
float errorUtilizationPenalty, AtomicInteger sequence) {
518+
float errorUtilizationPenalty, AtomicInteger sequence,
519+
ImmutableList<String> metricNamesForComputingUtilization) {
477520
checkNotNull(children, "children");
478521
Preconditions.checkArgument(!children.isEmpty(), "empty child list");
479522
this.children = children;
@@ -482,7 +525,8 @@ static final class WeightedRoundRobinPicker extends SubchannelPicker {
482525
for (ChildLbState child : children) {
483526
WeightedChildLbState wChild = (WeightedChildLbState) child;
484527
pickers.add(wChild.getCurrentPicker());
485-
reportListeners.add(wChild.getOrCreateOrcaListener(errorUtilizationPenalty));
528+
reportListeners.add(wChild.getOrCreateOrcaListener(errorUtilizationPenalty,
529+
metricNamesForComputingUtilization));
486530
}
487531
this.pickers = pickers;
488532
this.reportListeners = reportListeners;
@@ -565,11 +609,11 @@ public boolean equals(Object o) {
565609
* The Static Stride Scheduler is an implementation of an earliest deadline first (EDF) scheduler
566610
* in which each object's deadline is the multiplicative inverse of the object's weight.
567611
* <p>
568-
* The way in which this is implemented is through a static stride scheduler.
612+
* The way in which this is implemented is through a static stride scheduler.
569613
* The Static Stride Scheduler works by iterating through the list of subchannel weights
570-
* and using modular arithmetic to proportionally distribute picks, favoring entries
571-
* with higher weights. It is based on the observation that the intended sequence generated
572-
* from an EDF scheduler is a periodic one that can be achieved through modular arithmetic.
614+
* and using modular arithmetic to proportionally distribute picks, favoring entries
615+
* with higher weights. It is based on the observation that the intended sequence generated
616+
* from an EDF scheduler is a periodic one that can be achieved through modular arithmetic.
573617
* The Static Stride Scheduler is more performant than other implementations of the EDF
574618
* Scheduler, as it removes the need for a priority queue (and thus mutex locks).
575619
* <p>
@@ -720,23 +764,23 @@ static final class WeightedRoundRobinLoadBalancerConfig {
720764
final long oobReportingPeriodNanos;
721765
final long weightUpdatePeriodNanos;
722766
final float errorUtilizationPenalty;
767+
final ImmutableList<String> metricNamesForComputingUtilization;
723768

724769
public static Builder newBuilder() {
725770
return new Builder();
726771
}
727772

728773
private WeightedRoundRobinLoadBalancerConfig(long blackoutPeriodNanos,
729-
long weightExpirationPeriodNanos,
730-
boolean enableOobLoadReport,
731-
long oobReportingPeriodNanos,
732-
long weightUpdatePeriodNanos,
733-
float errorUtilizationPenalty) {
774+
long weightExpirationPeriodNanos, boolean enableOobLoadReport, long oobReportingPeriodNanos,
775+
long weightUpdatePeriodNanos, float errorUtilizationPenalty,
776+
ImmutableList<String> metricNamesForComputingUtilization) {
734777
this.blackoutPeriodNanos = blackoutPeriodNanos;
735778
this.weightExpirationPeriodNanos = weightExpirationPeriodNanos;
736779
this.enableOobLoadReport = enableOobLoadReport;
737780
this.oobReportingPeriodNanos = oobReportingPeriodNanos;
738781
this.weightUpdatePeriodNanos = weightUpdatePeriodNanos;
739782
this.errorUtilizationPenalty = errorUtilizationPenalty;
783+
this.metricNamesForComputingUtilization = metricNamesForComputingUtilization;
740784
}
741785

742786
@Override
@@ -751,27 +795,26 @@ public boolean equals(Object o) {
751795
&& this.oobReportingPeriodNanos == that.oobReportingPeriodNanos
752796
&& this.weightUpdatePeriodNanos == that.weightUpdatePeriodNanos
753797
// Float.compare considers NaNs equal
754-
&& Float.compare(this.errorUtilizationPenalty, that.errorUtilizationPenalty) == 0;
798+
&& Float.compare(this.errorUtilizationPenalty, that.errorUtilizationPenalty) == 0
799+
&& Objects.equals(this.metricNamesForComputingUtilization,
800+
that.metricNamesForComputingUtilization);
755801
}
756802

757803
@Override
758804
public int hashCode() {
759-
return Objects.hash(
760-
blackoutPeriodNanos,
761-
weightExpirationPeriodNanos,
762-
enableOobLoadReport,
763-
oobReportingPeriodNanos,
764-
weightUpdatePeriodNanos,
765-
errorUtilizationPenalty);
805+
return Objects.hash(blackoutPeriodNanos, weightExpirationPeriodNanos, enableOobLoadReport,
806+
oobReportingPeriodNanos, weightUpdatePeriodNanos, errorUtilizationPenalty,
807+
metricNamesForComputingUtilization);
766808
}
767809

768810
static final class Builder {
769811
long blackoutPeriodNanos = 10_000_000_000L; // 10s
770-
long weightExpirationPeriodNanos = 180_000_000_000L; //3min
812+
long weightExpirationPeriodNanos = 180_000_000_000L; // 3min
771813
boolean enableOobLoadReport = false;
772814
long oobReportingPeriodNanos = 10_000_000_000L; // 10s
773815
long weightUpdatePeriodNanos = 1_000_000_000L; // 1s
774816
float errorUtilizationPenalty = 1.0F;
817+
ImmutableList<String> metricNamesForComputingUtilization = ImmutableList.of();
775818

776819
private Builder() {
777820

@@ -809,10 +852,17 @@ Builder setErrorUtilizationPenalty(float errorUtilizationPenalty) {
809852
return this;
810853
}
811854

855+
Builder setMetricNamesForComputingUtilization(
856+
List<String> metricNamesForComputingUtilization) {
857+
this.metricNamesForComputingUtilization =
858+
ImmutableList.copyOf(metricNamesForComputingUtilization);
859+
return this;
860+
}
861+
812862
WeightedRoundRobinLoadBalancerConfig build() {
813863
return new WeightedRoundRobinLoadBalancerConfig(blackoutPeriodNanos,
814-
weightExpirationPeriodNanos, enableOobLoadReport, oobReportingPeriodNanos,
815-
weightUpdatePeriodNanos, errorUtilizationPenalty);
864+
weightExpirationPeriodNanos, enableOobLoadReport, oobReportingPeriodNanos,
865+
weightUpdatePeriodNanos, errorUtilizationPenalty, metricNamesForComputingUtilization);
816866
}
817867
}
818868
}

xds/src/main/java/io/grpc/xds/WeightedRoundRobinLoadBalancerProvider.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import io.grpc.Status;
2727
import io.grpc.internal.JsonUtil;
2828
import io.grpc.xds.WeightedRoundRobinLoadBalancer.WeightedRoundRobinLoadBalancerConfig;
29+
import java.util.List;
2930
import java.util.Map;
3031

3132
/**
@@ -73,14 +74,16 @@ public ConfigOrError parseLoadBalancingPolicyConfig(Map<String, ?> rawConfig) {
7374
private ConfigOrError parseLoadBalancingPolicyConfigInternal(Map<String, ?> rawConfig) {
7475
Long blackoutPeriodNanos = JsonUtil.getStringAsDuration(rawConfig, "blackoutPeriod");
7576
Long weightExpirationPeriodNanos =
76-
JsonUtil.getStringAsDuration(rawConfig, "weightExpirationPeriod");
77+
JsonUtil.getStringAsDuration(rawConfig, "weightExpirationPeriod");
7778
Long oobReportingPeriodNanos = JsonUtil.getStringAsDuration(rawConfig, "oobReportingPeriod");
7879
Boolean enableOobLoadReport = JsonUtil.getBoolean(rawConfig, "enableOobLoadReport");
7980
Long weightUpdatePeriodNanos = JsonUtil.getStringAsDuration(rawConfig, "weightUpdatePeriod");
8081
Float errorUtilizationPenalty = JsonUtil.getNumberAsFloat(rawConfig, "errorUtilizationPenalty");
82+
List<String> metricNamesForComputingUtilization = JsonUtil.getListOfStrings(rawConfig,
83+
LoadBalancerConfigFactory.METRIC_NAMES_FOR_COMPUTING_UTILIZATION);
8184

8285
WeightedRoundRobinLoadBalancerConfig.Builder configBuilder =
83-
WeightedRoundRobinLoadBalancerConfig.newBuilder();
86+
WeightedRoundRobinLoadBalancerConfig.newBuilder();
8487
if (blackoutPeriodNanos != null) {
8588
configBuilder.setBlackoutPeriodNanos(blackoutPeriodNanos);
8689
}
@@ -102,6 +105,11 @@ private ConfigOrError parseLoadBalancingPolicyConfigInternal(Map<String, ?> rawC
102105
if (errorUtilizationPenalty != null) {
103106
configBuilder.setErrorUtilizationPenalty(errorUtilizationPenalty);
104107
}
108+
if (metricNamesForComputingUtilization != null) {
109+
if (WeightedRoundRobinLoadBalancer.enableCustomConfig) {
110+
configBuilder.setMetricNamesForComputingUtilization(metricNamesForComputingUtilization);
111+
}
112+
}
105113
return ConfigOrError.fromConfig(configBuilder.build());
106114
}
107115
}

0 commit comments

Comments
 (0)