Balancer skips disk related computation when disk weight factor is zero #136352
```diff
@@ -169,7 +169,8 @@ public void allocate(RoutingAllocation allocation) {
             allocation,
             balancerSettings.getThreshold(),
             balancingWeights,
-            balancerSettings.completeEarlyOnShardAssignmentChange()
+            balancerSettings.completeEarlyOnShardAssignmentChange(),
+            balancerSettings.getDiskUsageBalanceFactor() == 0
         );
 
         boolean shardAssigned = false, shardMoved = false, shardBalanced = false;
```
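For context, `getDiskUsageBalanceFactor()` is presumably backed by the `cluster.routing.allocation.balance.disk_usage` cluster setting, so the new skip path applies when that factor is configured to zero. The snippet below is illustrative only (it is not part of this change) and assumes that setting name; it uses the standard `Settings` builder API.

```java
// Illustrative only (not part of this PR): zeroing the disk weight factor, assuming it is
// backed by the "cluster.routing.allocation.balance.disk_usage" cluster setting.
import org.elasticsearch.common.settings.Settings;

public class ZeroDiskWeightFactorExample {
    public static void main(String[] args) {
        Settings settings = Settings.builder()
            .put("cluster.routing.allocation.balance.disk_usage", "0.0")
            .build();
        // With the factor at zero, balancerSettings.getDiskUsageBalanceFactor() == 0 evaluates
        // to true and the Balancer above is constructed with disk usage computation skipped.
        System.out.println(settings.get("cluster.routing.allocation.balance.disk_usage"));
    }
}
```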
```diff
@@ -248,7 +249,8 @@ public ShardAllocationDecision decideShardAllocation(final ShardRouting shard, f
             allocation,
             balancerSettings.getThreshold(),
             balancingWeightsFactory.create(),
-            balancerSettings.completeEarlyOnShardAssignmentChange()
+            balancerSettings.completeEarlyOnShardAssignmentChange(),
+            balancerSettings.getDiskUsageBalanceFactor() == 0
         );
         AllocateUnassignedDecision allocateUnassignedDecision = AllocateUnassignedDecision.NOT_TAKEN;
         MoveDecision moveDecision = MoveDecision.NOT_TAKEN;
```
```diff
@@ -309,13 +311,15 @@ public static class Balancer {
         private final BalancingWeights balancingWeights;
         private final NodeSorters nodeSorters;
         private final boolean completeEarlyOnShardAssignmentChange;
+        private final boolean skipDiskUsageCalculation;
 
         private Balancer(
             WriteLoadForecaster writeLoadForecaster,
             RoutingAllocation allocation,
             float threshold,
             BalancingWeights balancingWeights,
-            boolean completeEarlyOnShardAssignmentChange
+            boolean completeEarlyOnShardAssignmentChange,
+            boolean skipDiskUsageCalculation
         ) {
             this.writeLoadForecaster = writeLoadForecaster;
             this.allocation = allocation;
@@ -324,11 +328,14 @@ private Balancer(
             this.threshold = threshold;
             avgShardsPerNode = WeightFunction.avgShardPerNode(metadata, routingNodes);
             avgWriteLoadPerNode = WeightFunction.avgWriteLoadPerNode(writeLoadForecaster, metadata, routingNodes);
-            avgDiskUsageInBytesPerNode = WeightFunction.avgDiskUsageInBytesPerNode(allocation.clusterInfo(), metadata, routingNodes);
-            nodes = Collections.unmodifiableMap(buildModelFromAssigned());
+            avgDiskUsageInBytesPerNode = skipDiskUsageCalculation
+                ? 0
+                : WeightFunction.avgDiskUsageInBytesPerNode(allocation.clusterInfo(), metadata, routingNodes);
+            nodes = Collections.unmodifiableMap(buildModelFromAssigned(skipDiskUsageCalculation));
             this.nodeSorters = balancingWeights.createNodeSorters(nodesArray(), this);
             this.balancingWeights = balancingWeights;
             this.completeEarlyOnShardAssignmentChange = completeEarlyOnShardAssignmentChange;
+            this.skipDiskUsageCalculation = skipDiskUsageCalculation;
         }
```
```diff
         private static long getShardDiskUsageInBytes(ShardRouting shardRouting, IndexMetadata indexMetadata, ClusterInfo clusterInfo) {
 
@@ -345,7 +352,12 @@ private float getShardWriteLoad(ProjectIndex index) {
             return (float) writeLoadForecaster.getForecastedWriteLoad(projectMetadata.index(index.indexName)).orElse(0.0);
         }
 
+        // This method is used only by NodeSorter#minWeightDelta to compute the node weight delta.
+        // Hence, we can return 0 when disk usage is ignored. Any future usage of this method should review whether this still holds.
         private float maxShardSizeBytes(ProjectIndex index) {
+            if (skipDiskUsageCalculation) {
+                return 0;
+            }
             final var indexMetadata = indexMetadata(index);
             if (indexMetadata.ignoreDiskWatermarks()) {
                 // disk watermarks are ignored for partial searchable snapshots
```
```diff
@@ -1156,10 +1168,10 @@ private Decision decideCanForceAllocateForVacate(ShardRouting shardRouting, Rout
          * on the target node which we respect during the allocation / balancing
          * process. In short, this method recreates the status-quo in the cluster.
          */
-        private Map<String, ModelNode> buildModelFromAssigned() {
+        private Map<String, ModelNode> buildModelFromAssigned(boolean skipDiskUsageCalculation) {
             Map<String, ModelNode> nodes = Maps.newMapWithExpectedSize(routingNodes.size());
             for (RoutingNode rn : routingNodes) {
-                ModelNode node = new ModelNode(writeLoadForecaster, metadata, allocation.clusterInfo(), rn);
+                ModelNode node = new ModelNode(writeLoadForecaster, metadata, allocation.clusterInfo(), rn, skipDiskUsageCalculation);
                 nodes.put(rn.nodeId(), node);
                 for (ShardRouting shard : rn) {
                     assert rn.nodeId().equals(shard.currentNodeId());
```
```diff
@@ -1476,13 +1488,21 @@ public static class ModelNode implements Iterable<ModelIndex> {
         private final ClusterInfo clusterInfo;
         private final RoutingNode routingNode;
         private final Map<ProjectIndex, ModelIndex> indices;
+        private final boolean skipDiskUsageCalculation;
 
-        public ModelNode(WriteLoadForecaster writeLoadForecaster, Metadata metadata, ClusterInfo clusterInfo, RoutingNode routingNode) {
+        public ModelNode(
+            WriteLoadForecaster writeLoadForecaster,
+            Metadata metadata,
+            ClusterInfo clusterInfo,
+            RoutingNode routingNode,
+            boolean skipDiskUsageCalculation
+        ) {
             this.writeLoadForecaster = writeLoadForecaster;
             this.metadata = metadata;
             this.clusterInfo = clusterInfo;
             this.routingNode = routingNode;
             this.indices = Maps.newMapWithExpectedSize(routingNode.size() + 10); // some extra to account for shard movements
+            this.skipDiskUsageCalculation = skipDiskUsageCalculation;
         }
```
```diff
         public ModelIndex getIndex(ProjectIndex index) {
 
@@ -1527,7 +1547,9 @@ public void addShard(ProjectIndex index, ShardRouting shard) {
             indices.computeIfAbsent(index, t -> new ModelIndex()).addShard(shard);
             IndexMetadata indexMetadata = metadata.getProject(index.project).index(shard.index());
             writeLoad += writeLoadForecaster.getForecastedWriteLoad(indexMetadata).orElse(0.0);
-            diskUsageInBytes += Balancer.getShardDiskUsageInBytes(shard, indexMetadata, clusterInfo);
+            if (skipDiskUsageCalculation == false) {
+                diskUsageInBytes += Balancer.getShardDiskUsageInBytes(shard, indexMetadata, clusterInfo);
+            }
             numShards++;
         }
```
```diff
@@ -1541,7 +1563,9 @@ public void removeShard(ProjectIndex projectIndex, ShardRouting shard) {
             }
             IndexMetadata indexMetadata = metadata.getProject(projectIndex.project).index(shard.index());
             writeLoad -= writeLoadForecaster.getForecastedWriteLoad(indexMetadata).orElse(0.0);
-            diskUsageInBytes -= Balancer.getShardDiskUsageInBytes(shard, indexMetadata, clusterInfo);
+            if (skipDiskUsageCalculation == false) {
+                diskUsageInBytes -= Balancer.getShardDiskUsageInBytes(shard, indexMetadata, clusterInfo);
+            }
             numShards--;
         }
```
The cost saving is realistic. My main question is whether the approach is considered hacky.
What if, rather than having the additional flag, we passed the weighting around, and the expensive parts only performed the calculation when the weighting was non-zero?
I'm not sure if that's better, but it's a thought.
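A minimal, self-contained sketch of that alternative, under assumed names: `DiskAwareAccumulator` and the `LongSupplier` stand-in for `Balancer.getShardDiskUsageInBytes` are hypothetical and not taken from this PR.

```java
// Hypothetical sketch of the suggestion: pass the disk weight factor itself instead of a
// derived boolean, and let each call site skip the expensive lookup when the factor is zero.
import java.util.function.LongSupplier;

public class DiskAwareAccumulator {
    private final float diskUsageBalanceFactor;
    private long diskUsageInBytes;

    public DiskAwareAccumulator(float diskUsageBalanceFactor) {
        this.diskUsageBalanceFactor = diskUsageBalanceFactor;
    }

    public void addShard(LongSupplier shardDiskUsageInBytes) {
        if (diskUsageBalanceFactor != 0.0f) {
            // only pay the per-shard disk usage lookup when disk actually contributes to the weight
            diskUsageInBytes += shardDiskUsageInBytes.getAsLong();
        }
    }

    public long diskUsageInBytes() {
        return diskUsageInBytes;
    }

    public static void main(String[] args) {
        DiskAwareAccumulator node = new DiskAwareAccumulator(0.0f);
        node.addShard(() -> 1_000_000L); // supplier is never invoked because the factor is zero
        System.out.println(node.diskUsageInBytes()); // prints 0
    }
}
```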
We could refer to `this.balancingWeights` perhaps?
See the flamegraph below (from the many-shards benchmark), which shows the time spent on disk-related computation (purple) inside the allocate calls.
[flamegraph image attachment]
We synced offline and agreed to change the boolean flag to be a method on `BalancingWeights`.
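The final shape of that method is not shown in this excerpt; a hypothetical sketch, in which the method name `diskUsageIgnored` and its default implementation are assumptions rather than the merged API, might look like this:

```java
// Hypothetical shape only: move the decision onto BalancingWeights instead of threading a
// boolean through the Balancer and ModelNode constructors. The method name is an assumption.
public interface BalancingWeights {
    // existing methods such as createNodeSorters(...) elided

    /**
     * Whether disk usage can be left out of the weight calculation entirely,
     * e.g. because the disk weight factor is configured to zero.
     */
    default boolean diskUsageIgnored() {
        return false;
    }
}
```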