Skip to content

Commit 751a7a4

Browse files
vishesh92 authored and dhslove committed
Add advance settings to fine tune DRS imbalance calculation (apache#8521)
* Use free/total instead of free metric to calculate imbalance
* Filter out hosts for condensed while checking imbalance
* Make DRS more configurable
* code refactor
* Add unit tests
* fixup
* Fix validation for drs.imbalance.condensed.skip.threshold
* Add logging and other minor changes for drs
* Add some logging for drs
* Change format for drs imbalance to string
* Show drs imbalance as percentage
* Fixup label for memorytotal in en.json
1 parent 7a8e28c commit 751a7a4

File tree

16 files changed

+430
-232
lines changed

16 files changed

+430
-232
lines changed

api/src/main/java/org/apache/cloudstack/cluster/ClusterDrsAlgorithm.java

Lines changed: 144 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@
3333
import java.util.List;
3434
import java.util.Map;
3535

36+
import static org.apache.cloudstack.cluster.ClusterDrsService.ClusterDrsMetric;
37+
import static org.apache.cloudstack.cluster.ClusterDrsService.ClusterDrsMetricType;
38+
import static org.apache.cloudstack.cluster.ClusterDrsService.ClusterDrsMetricUseRatio;
39+
3640
public interface ClusterDrsAlgorithm extends Adapter {
3741

3842
/**
@@ -42,16 +46,17 @@ public interface ClusterDrsAlgorithm extends Adapter {
4246
* @param clusterId
4347
* the ID of the cluster to check
4448
* @param cpuList
45-
* a list of CPU allocated values for each host in the cluster
49+
* a list of Ternary of used, reserved & total CPU for each host in the cluster
4650
* @param memoryList
47-
* a list of memory allocated values for each host in the cluster
51+
* a list of Ternary of used, reserved & total memory values for each host in the cluster
4852
*
4953
* @return true if a DRS operation is needed, false otherwise
5054
*
5155
* @throws ConfigurationException
5256
* if there is an error in the configuration
5357
*/
54-
boolean needsDrs(long clusterId, List<Long> cpuList, List<Long> memoryList) throws ConfigurationException;
58+
boolean needsDrs(long clusterId, List<Ternary<Long, Long, Long>> cpuList,
59+
List<Ternary<Long, Long, Long>> memoryList) throws ConfigurationException;
5560

5661

5762
/**
@@ -65,18 +70,19 @@ public interface ClusterDrsAlgorithm extends Adapter {
6570
* the service offering for the virtual machine
6671
* @param destHost
6772
* the destination host for the virtual machine
68-
* @param hostCpuFreeMap
69-
* a map of host IDs to the amount of CPU free on each host
70-
* @param hostMemoryFreeMap
71-
* a map of host IDs to the amount of memory free on each host
73+
* @param hostCpuMap
74+
* a map of host IDs to the Ternary of used, reserved and total CPU on each host
75+
* @param hostMemoryMap
76+
* a map of host IDs to the Ternary of used, reserved and total memory on each host
7277
* @param requiresStorageMotion
7378
* whether storage motion is required for the virtual machine
7479
*
7580
* @return a ternary containing improvement, cost, benefit
7681
*/
7782
Ternary<Double, Double, Double> getMetrics(long clusterId, VirtualMachine vm, ServiceOffering serviceOffering,
78-
Host destHost, Map<Long, Long> hostCpuFreeMap,
79-
Map<Long, Long> hostMemoryFreeMap, Boolean requiresStorageMotion);
83+
Host destHost, Map<Long, Ternary<Long, Long, Long>> hostCpuMap,
84+
Map<Long, Ternary<Long, Long, Long>> hostMemoryMap,
85+
Boolean requiresStorageMotion) throws ConfigurationException;
8086

8187
/**
8288
* Calculates the imbalance of the cluster after a virtual machine migration.
@@ -87,62 +93,101 @@ Ternary<Double, Double, Double> getMetrics(long clusterId, VirtualMachine vm, Se
8793
* the virtual machine being migrated
8894
* @param destHost
8995
* the destination host for the virtual machine
90-
* @param hostCpuFreeMap
91-
* a map of host IDs to the amount of CPU free on each host
92-
* @param hostMemoryFreeMap
93-
* a map of host IDs to the amount of memory free on each host
96+
* @param hostCpuMap
97+
* a map of host IDs to the Ternary of used, reserved and total CPU on each host
98+
* @param hostMemoryMap
99+
* a map of host IDs to the Ternary of used, reserved and total memory on each host
94100
*
95101
* @return a pair containing the CPU and memory imbalance of the cluster after the migration
96102
*/
97-
default Pair<Double, Double> getImbalancePostMigration(ServiceOffering serviceOffering, VirtualMachine vm,
98-
Host destHost, Map<Long, Long> hostCpuFreeMap,
99-
Map<Long, Long> hostMemoryFreeMap) {
100-
List<Long> postCpuList = new ArrayList<>();
101-
List<Long> postMemoryList = new ArrayList<>();
102-
final int vmCpu = serviceOffering.getCpu() * serviceOffering.getSpeed();
103-
final long vmRam = serviceOffering.getRamSize() * 1024L * 1024L;
104-
105-
for (Long hostId : hostCpuFreeMap.keySet()) {
106-
long cpu = hostCpuFreeMap.get(hostId);
107-
long memory = hostMemoryFreeMap.get(hostId);
108-
if (hostId == destHost.getId()) {
109-
postCpuList.add(cpu - vmCpu);
110-
postMemoryList.add(memory - vmRam);
111-
} else if (hostId.equals(vm.getHostId())) {
112-
postCpuList.add(cpu + vmCpu);
113-
postMemoryList.add(memory + vmRam);
114-
} else {
115-
postCpuList.add(cpu);
116-
postMemoryList.add(memory);
117-
}
103+
default Double getImbalancePostMigration(ServiceOffering serviceOffering, VirtualMachine vm,
104+
Host destHost, Map<Long, Ternary<Long, Long, Long>> hostCpuMap,
105+
Map<Long, Ternary<Long, Long, Long>> hostMemoryMap) throws ConfigurationException {
106+
Pair<Long, Map<Long, Ternary<Long, Long, Long>>> pair = getHostMetricsMapAndType(destHost.getClusterId(), serviceOffering, hostCpuMap, hostMemoryMap);
107+
long vmMetric = pair.first();
108+
Map<Long, Ternary<Long, Long, Long>> hostMetricsMap = pair.second();
109+
110+
List<Double> list = new ArrayList<>();
111+
for (Long hostId : hostMetricsMap.keySet()) {
112+
list.add(getMetricValuePostMigration(destHost.getClusterId(), hostMetricsMap.get(hostId), vmMetric, hostId, destHost.getId(), vm.getHostId()));
118113
}
119-
return new Pair<>(getClusterImbalance(postCpuList), getClusterImbalance(postMemoryList));
114+
return getImbalance(list);
120115
}
121116

122-
/**
123-
* The cluster imbalance is defined as the percentage deviation from the mean
124-
* for a configured metric of the cluster. The standard deviation is used as a
125-
* mathematical tool to normalize the metric data for all the resource and the
126-
* percentage deviation provides an easy tool to compare a cluster’s current
127-
* state against the defined imbalance threshold. Because this is essentially a
128-
* percentage, the value is a number between 0.0 and 1.0.
129-
* Cluster Imbalance, Ic = σc / mavg , where σc is the standard deviation and
130-
* mavg is the mean metric value for the cluster.
131-
*/
132-
default Double getClusterImbalance(List<Long> metricList) {
117+
private Pair<Long, Map<Long, Ternary<Long, Long, Long>>> getHostMetricsMapAndType(Long clusterId,
118+
ServiceOffering serviceOffering, Map<Long, Ternary<Long, Long, Long>> hostCpuMap,
119+
Map<Long, Ternary<Long, Long, Long>> hostMemoryMap) throws ConfigurationException {
120+
String metric = getClusterDrsMetric(clusterId);
121+
Pair<Long, Map<Long, Ternary<Long, Long, Long>>> pair;
122+
switch (metric) {
123+
case "cpu":
124+
pair = new Pair<>((long) serviceOffering.getCpu() * serviceOffering.getSpeed(), hostCpuMap);
125+
break;
126+
case "memory":
127+
pair = new Pair<>(serviceOffering.getRamSize() * 1024L * 1024L, hostMemoryMap);
128+
break;
129+
default:
130+
throw new ConfigurationException(
131+
String.format("Invalid metric: %s for cluster: %d", metric, clusterId));
132+
}
133+
return pair;
134+
}
135+
136+
private Double getMetricValuePostMigration(Long clusterId, Ternary<Long, Long, Long> metrics, long vmMetric,
137+
long hostId, long destHostId, long vmHostId) {
138+
long used = metrics.first();
139+
long actualTotal = metrics.third() - metrics.second();
140+
long free = actualTotal - metrics.first();
141+
142+
if (hostId == destHostId) {
143+
used += vmMetric;
144+
free -= vmMetric;
145+
} else if (hostId == vmHostId) {
146+
used -= vmMetric;
147+
free += vmMetric;
148+
}
149+
return getMetricValue(clusterId, used, free, actualTotal, null);
150+
}
151+
152+
private static Double getImbalance(List<Double> metricList) {
133153
Double clusterMeanMetric = getClusterMeanMetric(metricList);
134154
Double clusterStandardDeviation = getClusterStandardDeviation(metricList, clusterMeanMetric);
135155
return clusterStandardDeviation / clusterMeanMetric;
136156
}
137157

158+
static String getClusterDrsMetric(long clusterId) {
159+
return ClusterDrsMetric.valueIn(clusterId);
160+
}
161+
162+
static Double getMetricValue(long clusterId, long used, long free, long total, Float skipThreshold) {
163+
boolean useRatio = getDrsMetricUseRatio(clusterId);
164+
switch (getDrsMetricType(clusterId)) {
165+
case "free":
166+
if (skipThreshold != null && free < skipThreshold * total) return null;
167+
if (useRatio) {
168+
return (double) free / total;
169+
} else {
170+
return (double) free;
171+
}
172+
case "used":
173+
if (skipThreshold != null && used > skipThreshold * total) return null;
174+
if (useRatio) {
175+
return (double) used / total;
176+
} else {
177+
return (double) used;
178+
}
179+
}
180+
return null;
181+
}
182+
138183
/**
139184
* Mean is the average of a collection or set of metrics. In context of a DRS
140185
* cluster, the cluster metrics defined as the average metrics value for some
141186
* metric (such as CPU, memory etc.) for every resource such as host.
142187
* Cluster Mean Metric, mavg = (∑mi) / N, where mi is a measurable metric for a
143188
* resource ‘i’ in a cluster with total N number of resources.
144189
*/
145-
default Double getClusterMeanMetric(List<Long> metricList) {
190+
static Double getClusterMeanMetric(List<Double> metricList) {
146191
return new Mean().evaluate(metricList.stream().mapToDouble(i -> i).toArray());
147192
}
148193

@@ -157,11 +202,62 @@ default Double getClusterMeanMetric(List<Long> metricList) {
157202
* mean metric value and mi is a measurable metric for some resource ‘i’ in the
158203
* cluster with total N number of resources.
159204
*/
160-
default Double getClusterStandardDeviation(List<Long> metricList, Double mean) {
205+
static Double getClusterStandardDeviation(List<Double> metricList, Double mean) {
161206
if (mean != null) {
162207
return new StandardDeviation(false).evaluate(metricList.stream().mapToDouble(i -> i).toArray(), mean);
163208
} else {
164209
return new StandardDeviation(false).evaluate(metricList.stream().mapToDouble(i -> i).toArray());
165210
}
166211
}
212+
213+
static boolean getDrsMetricUseRatio(long clusterId) {
214+
return ClusterDrsMetricUseRatio.valueIn(clusterId);
215+
}
216+
217+
static String getDrsMetricType(long clusterId) {
218+
return ClusterDrsMetricType.valueIn(clusterId);
219+
}
220+
221+
/**
222+
* The cluster imbalance is defined as the percentage deviation from the mean
223+
* for a configured metric of the cluster. The standard deviation is used as a
224+
* mathematical tool to normalize the metric data for all the resource and the
225+
* percentage deviation provides an easy tool to compare a cluster’s current
226+
* state against the defined imbalance threshold. Because this is essentially a
227+
* percentage, the value is usually between 0.0 and 1.0, although a heavily skewed host can push it above 1.0.
228+
* Cluster Imbalance, Ic = σc / mavg , where σc is the standard deviation and
229+
* mavg is the mean metric value for the cluster.
230+
*/
231+
static Double getClusterImbalance(Long clusterId, List<Ternary<Long, Long, Long>> cpuList,
232+
List<Ternary<Long, Long, Long>> memoryList, Float skipThreshold) throws ConfigurationException {
233+
String metric = getClusterDrsMetric(clusterId);
234+
List<Double> list;
235+
switch (metric) {
236+
case "cpu":
237+
list = getMetricList(clusterId, cpuList, skipThreshold);
238+
break;
239+
case "memory":
240+
list = getMetricList(clusterId, memoryList, skipThreshold);
241+
break;
242+
default:
243+
throw new ConfigurationException(
244+
String.format("Invalid metric: %s for cluster: %d", metric, clusterId));
245+
}
246+
return getImbalance(list);
247+
}
248+
249+
static List<Double> getMetricList(Long clusterId, List<Ternary<Long, Long, Long>> hostMetricsList,
250+
Float skipThreshold) {
251+
List<Double> list = new ArrayList<>();
252+
for (Ternary<Long, Long, Long> ternary : hostMetricsList) {
253+
long used = ternary.first();
254+
long actualTotal = ternary.third() - ternary.second();
255+
long free = actualTotal - ternary.first();
256+
Double metricValue = getMetricValue(clusterId, used, free, actualTotal, skipThreshold);
257+
if (metricValue != null) {
258+
list.add(metricValue);
259+
}
260+
}
261+
return list;
262+
}
167263
}

api/src/main/java/org/apache/cloudstack/cluster/ClusterDrsService.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,29 @@ public interface ClusterDrsService extends Manager, Configurable, Scheduler {
6666
true, ConfigKey.Scope.Cluster, null, "DRS metric", null, null, null, ConfigKey.Kind.Select,
6767
"memory,cpu");
6868

69+
ConfigKey<String> ClusterDrsMetricType = new ConfigKey<>(String.class, "drs.metric.type", ConfigKey.CATEGORY_ADVANCED,
70+
"used",
71+
"The metric type used to measure imbalance in a cluster. This can completely change the imbalance value. Possible values are free, used.",
72+
true, ConfigKey.Scope.Cluster, null, "DRS metric type", null, null, null, ConfigKey.Kind.Select,
73+
"free,used");
74+
75+
ConfigKey<Boolean> ClusterDrsMetricUseRatio = new ConfigKey<>(Boolean.class, "drs.metric.use.ratio", ConfigKey.CATEGORY_ADVANCED,
76+
"true",
77+
"Whether to use ratio of selected metric & total. Useful when the cluster has hosts with different capacities",
78+
true, ConfigKey.Scope.Cluster, null, "DRS metric use ratio", null, null, null, ConfigKey.Kind.Select,
79+
"true,false");
80+
81+
ConfigKey<Float> ClusterDrsImbalanceSkipThreshold = new ConfigKey<>(Float.class,
82+
"drs.imbalance.condensed.skip.threshold", ConfigKey.CATEGORY_ADVANCED, "0.95",
83+
"Threshold to ignore the metric for a host while calculating the imbalance to decide " +
84+
"whether DRS is required for a cluster.This is to avoid cases when the calculated imbalance" +
85+
" gets skewed due to a single host having a very high/low metric value resulting in imbalance" +
86+
" being higher than 1. If " + ClusterDrsMetricType.key() + " is 'free', set a lower value and if it is 'used' " +
87+
"set a higher value. The value should be between 0.0 and 1.0",
88+
true, ConfigKey.Scope.Cluster, null, "DRS imbalance skip threshold for Condensed algorithm",
89+
null, null, null);
90+
91+
6992
/**
7093
* Generate a DRS plan for a cluster and save it as per the parameters
7194
*
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.cloudstack.cluster;
21+
22+
import com.cloud.utils.Ternary;
23+
import junit.framework.TestCase;
24+
import org.junit.Test;
25+
import org.junit.runner.RunWith;
26+
import org.mockito.MockedStatic;
27+
import org.mockito.Mockito;
28+
import org.mockito.junit.MockitoJUnitRunner;
29+
30+
import java.util.List;
31+
32+
import static org.apache.cloudstack.cluster.ClusterDrsAlgorithm.getMetricValue;
33+
import static org.mockito.ArgumentMatchers.any;
34+
import static org.mockito.ArgumentMatchers.anyFloat;
35+
import static org.mockito.ArgumentMatchers.anyLong;
36+
import static org.mockito.Mockito.when;
37+
38+
@RunWith(MockitoJUnitRunner.class)
39+
public class ClusterDrsAlgorithmTest extends TestCase {
40+
41+
@Test
42+
public void testGetMetricValue() {
43+
List<Ternary<Boolean, String, Double>> testData = List.of(
44+
new Ternary<>(true, "free", 0.4),
45+
new Ternary<>(false, "free", 40.0),
46+
new Ternary<>(true, "used", 0.3),
47+
new Ternary<>(false, "used", 30.0)
48+
);
49+
50+
long used = 30;
51+
long free = 40;
52+
long total = 100;
53+
54+
for (Ternary<Boolean, String, Double> data : testData) {
55+
boolean useRatio = data.first();
56+
String metricType = data.second();
57+
double expectedValue = data.third();
58+
59+
try (MockedStatic<ClusterDrsAlgorithm> ignored = Mockito.mockStatic(ClusterDrsAlgorithm.class)) {
60+
when(ClusterDrsAlgorithm.getDrsMetricUseRatio(1L)).thenReturn(useRatio);
61+
when(ClusterDrsAlgorithm.getDrsMetricType(1L)).thenReturn(metricType);
62+
when(ClusterDrsAlgorithm.getMetricValue(anyLong(), anyLong(), anyLong(), anyLong(), any())).thenCallRealMethod();
63+
64+
assertEquals(expectedValue, getMetricValue(1, used, free, total, null));
65+
}
66+
}
67+
}
68+
69+
@Test
70+
public void testGetMetricValueWithSkipThreshold() {
71+
List<Ternary<Boolean, String, Double>> testData = List.of(
72+
new Ternary<>(true, "free", 0.15),
73+
new Ternary<>(false, "free", 15.0),
74+
new Ternary<>(true, "used", null),
75+
new Ternary<>(false, "used", null)
76+
);
77+
78+
long used = 80;
79+
long free = 15;
80+
long total = 100;
81+
82+
for (Ternary<Boolean, String, Double> data : testData) {
83+
boolean useRatio = data.first();
84+
String metricType = data.second();
85+
Double expectedValue = data.third();
86+
float skipThreshold = metricType.equals("free") ? 0.1f : 0.7f;
87+
88+
try (MockedStatic<ClusterDrsAlgorithm> ignored = Mockito.mockStatic(ClusterDrsAlgorithm.class)) {
89+
when(ClusterDrsAlgorithm.getDrsMetricUseRatio(1L)).thenReturn(useRatio);
90+
when(ClusterDrsAlgorithm.getDrsMetricType(1L)).thenReturn(metricType);
91+
when(ClusterDrsAlgorithm.getMetricValue(anyLong(), anyLong(), anyLong(), anyLong(), anyFloat())).thenCallRealMethod();
92+
93+
assertEquals(expectedValue, ClusterDrsAlgorithm.getMetricValue(1L, used, free, total, skipThreshold));
94+
}
95+
}
96+
}
97+
}

0 commit comments

Comments
 (0)