Skip to content

Commit a464a6d

Browse files
authored
Merge pull request #1028 from mbarbero/kubelimits-racecondition
JENKINS-66484 - Tentative fix of a race condition in KubernetesProvisioningLimits
2 parents d5fbf8e + 8a46a59 commit a464a6d

File tree

2 files changed

+105
-88
lines changed

2 files changed

+105
-88
lines changed

src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimits.java

Lines changed: 56 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
package org.csanchez.jenkins.plugins.kubernetes;
22

3-
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
3+
import java.util.HashMap;
4+
import java.util.Map;
5+
import java.util.logging.Level;
6+
import java.util.logging.Logger;
7+
8+
import javax.annotation.Nonnull;
9+
10+
import org.kohsuke.accmod.Restricted;
11+
import org.kohsuke.accmod.restrictions.NoExternalUse;
12+
413
import hudson.Extension;
514
import hudson.ExtensionList;
615
import hudson.init.InitMilestone;
@@ -10,16 +19,6 @@
1019
import jenkins.metrics.api.Metrics;
1120
import jenkins.model.Jenkins;
1221
import jenkins.model.NodeListener;
13-
import org.kohsuke.accmod.Restricted;
14-
import org.kohsuke.accmod.restrictions.NoExternalUse;
15-
16-
import javax.annotation.Nonnull;
17-
import java.util.Collections;
18-
import java.util.HashMap;
19-
import java.util.Map;
20-
import java.util.concurrent.atomic.AtomicInteger;
21-
import java.util.logging.Level;
22-
import java.util.logging.Logger;
2322

2423
/**
2524
* Implements provisioning limits for clouds and pod templates
@@ -31,26 +30,28 @@ public final class KubernetesProvisioningLimits {
3130
/**
3231
* Tracks current number of kubernetes agents per pod template
3332
*/
34-
private final Map<String, AtomicInteger> podTemplateCounts = Collections.synchronizedMap(new HashMap<>());
33+
private final Map<String, Integer> podTemplateCounts = new HashMap<>();
3534

3635
/**
3736
* Tracks current number of kubernetes agents per kubernetes cloud
3837
*/
39-
private final Map<String, AtomicInteger> cloudCounts = Collections.synchronizedMap(new HashMap<>());
38+
private final Map<String, Integer> cloudCounts = new HashMap<>();
4039

4140
@Initializer(after = InitMilestone.SYSTEM_CONFIG_LOADED)
4241
public static void init() {
4342
// We don't want anything to be provisioned while we do the initial count.
4443
Queue.withLock(() -> {
4544
final KubernetesProvisioningLimits instance = get();
46-
Jenkins.get().getNodes()
47-
.stream()
48-
.filter(KubernetesSlave.class::isInstance)
49-
.map(KubernetesSlave.class::cast)
50-
.forEach(node -> {
51-
instance.getGlobalCount(node.getCloudName()).addAndGet(node.getNumExecutors());
52-
instance.getPodTemplateCount(node.getTemplateId()).addAndGet(node.getNumExecutors());
53-
});
45+
synchronized(instance) {
46+
Jenkins.get().getNodes()
47+
.stream()
48+
.filter(KubernetesSlave.class::isInstance)
49+
.map(KubernetesSlave.class::cast)
50+
.forEach(node -> {
51+
instance.cloudCounts.put(node.getCloudName(), instance.getGlobalCount(node.getCloudName()) + node.getNumExecutors());
52+
instance.podTemplateCounts.put(node.getTemplateId(), instance.getPodTemplateCount(node.getTemplateId()) + node.getNumExecutors());
53+
});
54+
}
5455
});
5556
}
5657

@@ -67,26 +68,24 @@ public static KubernetesProvisioningLimits get() {
6768
* @param podTemplate the pod template used to schedule the agent
6869
* @param numExecutors the number of executors (pretty much always 1)
6970
*/
70-
@SuppressFBWarnings(value="JLM_JSR166_UTILCONCURRENT_MONITORENTER", justification = "Trust me here")
71-
public boolean register(@Nonnull KubernetesCloud cloud, @Nonnull PodTemplate podTemplate, int numExecutors) {
72-
AtomicInteger globalCount = getGlobalCount(cloud.name);
73-
AtomicInteger podTemplateCount = getPodTemplateCount(podTemplate.getId());
74-
synchronized (globalCount) {
75-
synchronized (podTemplateCount) {
76-
if (globalCount.get() + numExecutors <= cloud.getContainerCap()) {
77-
if (podTemplateCount.get() + numExecutors <= podTemplate.getInstanceCap()) {
78-
int g = globalCount.addAndGet(numExecutors);
79-
int p = podTemplateCount.addAndGet(numExecutors);
80-
LOGGER.log(Level.FINEST, () -> cloud.name + " global limit: " + g + "/" + cloud.getContainerCap());
81-
LOGGER.log(Level.FINEST, () -> podTemplate.getName() + " template limit: " + p + "/" + podTemplate.getInstanceCap());
82-
return true;
83-
} else {
84-
Metrics.metricRegistry().counter(MetricNames.REACHED_POD_CAP).inc();
85-
}
86-
} else {
87-
Metrics.metricRegistry().counter(MetricNames.REACHED_GLOBAL_CAP).inc();
88-
}
71+
public synchronized boolean register(@Nonnull KubernetesCloud cloud, @Nonnull PodTemplate podTemplate, int numExecutors) {
72+
int newGlobalCount = getGlobalCount(cloud.name) + numExecutors;
73+
if (newGlobalCount <= cloud.getContainerCap()) {
74+
int newPodTemplateCount = getPodTemplateCount(podTemplate.getId()) + numExecutors;
75+
if (newPodTemplateCount <= podTemplate.getInstanceCap()) {
76+
cloudCounts.put(cloud.name, newGlobalCount);
77+
LOGGER.log(Level.FINEST, () -> cloud.name + " global limit: " + newGlobalCount + "/" + cloud.getContainerCap());
78+
79+
podTemplateCounts.put(podTemplate.getId(), newPodTemplateCount);
80+
LOGGER.log(Level.FINEST, () -> podTemplate.getName() + " template limit: " + newPodTemplateCount + "/" + podTemplate.getInstanceCap());
81+
return true;
82+
} else {
83+
LOGGER.log(Level.FINEST, () -> podTemplate.getName() + " template limit reached: " + getPodTemplateCount(podTemplate.getId()) + "/" + podTemplate.getInstanceCap() + ". Cannot add " + numExecutors + " more!");
84+
Metrics.metricRegistry().counter(MetricNames.REACHED_POD_CAP).inc();
8985
}
86+
} else {
87+
LOGGER.log(Level.FINEST, () -> cloud.name + " global limit reached: " + getGlobalCount(cloud.name) + "/" + cloud.getContainerCap() + ". Cannot add " + numExecutors + " more!");
88+
Metrics.metricRegistry().counter(MetricNames.REACHED_GLOBAL_CAP).inc();
9089
}
9190
return false;
9291
}
@@ -97,40 +96,32 @@ public boolean register(@Nonnull KubernetesCloud cloud, @Nonnull PodTemplate pod
9796
* @param podTemplate the pod template used to schedule the agent
9897
* @param numExecutors the number of executors (pretty much always 1)
9998
*/
100-
@SuppressFBWarnings(value="JLM_JSR166_UTILCONCURRENT_MONITORENTER", justification = "Trust me here")
101-
public void unregister(@Nonnull KubernetesCloud cloud, @Nonnull PodTemplate podTemplate, int numExecutors) {
102-
AtomicInteger globalCount = getGlobalCount(cloud.name);
103-
AtomicInteger podTemplateCount = getPodTemplateCount(podTemplate.getId());
104-
synchronized (globalCount) {
105-
synchronized (podTemplateCount) {
106-
int newGlobalCount = globalCount.addAndGet(numExecutors * -1);
107-
int newPodTemplateCount = podTemplateCount.addAndGet(numExecutors * -1);
108-
if (newGlobalCount < 0) {
109-
LOGGER.log(Level.WARNING, "Global count for " + cloud.name + " went below zero. There is likely a bug in kubernetes-plugin");
110-
globalCount.set(0);
111-
} else {
112-
LOGGER.log(Level.FINEST, () -> cloud.name + " global limit: " + newGlobalCount + "/" + cloud.getContainerCap());
113-
}
114-
if (newPodTemplateCount < 0) {
115-
LOGGER.log(Level.WARNING, "Pod template count for " + podTemplate.getId() + " went below zero. There is likely a bug in kubernetes-plugin");
116-
podTemplateCount.set(0);
117-
} else {
118-
LOGGER.log(Level.FINEST, () -> podTemplate.getName() + " template limit: " + newPodTemplateCount + "/" + podTemplate.getInstanceCap());
119-
}
120-
}
99+
public synchronized void unregister(@Nonnull KubernetesCloud cloud, @Nonnull PodTemplate podTemplate, int numExecutors) {
100+
int newGlobalCount = getGlobalCount(cloud.name) - numExecutors;
101+
if (newGlobalCount < 0) {
102+
LOGGER.log(Level.WARNING, "Global count for " + cloud.name + " went below zero. There is likely a bug in kubernetes-plugin");
103+
}
104+
cloudCounts.put(cloud.name, Math.max(0, newGlobalCount));
105+
LOGGER.log(Level.FINEST, () -> cloud.name + " global limit: " + Math.max(0, newGlobalCount) + "/" + cloud.getContainerCap());
106+
107+
int newPodTemplateCount = getPodTemplateCount(podTemplate.getId()) - numExecutors;
108+
if (newPodTemplateCount < 0) {
109+
LOGGER.log(Level.WARNING, "Pod template count for " + podTemplate.getName() + " went below zero. There is likely a bug in kubernetes-plugin");
121110
}
111+
podTemplateCounts.put(podTemplate.getId(), Math.max(0, newPodTemplateCount));
112+
LOGGER.log(Level.FINEST, () -> podTemplate.getName() + " template limit: " + Math.max(0, newPodTemplateCount) + "/" + podTemplate.getInstanceCap());
122113
}
123114

124115
@Nonnull
125116
@Restricted(NoExternalUse.class)
126-
AtomicInteger getGlobalCount(String name) {
127-
return cloudCounts.computeIfAbsent(name, k -> new AtomicInteger());
117+
int getGlobalCount(String cloudName) {
118+
return cloudCounts.getOrDefault(cloudName, 0);
128119
}
129120

130121
@Nonnull
131122
@Restricted(NoExternalUse.class)
132-
AtomicInteger getPodTemplateCount(String id) {
133-
return podTemplateCounts.computeIfAbsent(id, k -> new AtomicInteger());
123+
int getPodTemplateCount(String podTemplate) {
124+
return podTemplateCounts.getOrDefault(podTemplate, 0);
134125
}
135126

136127
@Extension

src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesProvisioningLimitsTest.java

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
11
package org.csanchez.jenkins.plugins.kubernetes;
22

3-
import org.junit.Rule;
4-
import org.junit.Test;
5-
import org.jvnet.hudson.test.JenkinsRule;
6-
import org.jvnet.hudson.test.LoggerRule;
3+
import static org.junit.Assert.assertEquals;
4+
import static org.junit.Assert.assertTrue;
75

8-
import java.util.ArrayList;
96
import java.util.List;
10-
import java.util.concurrent.Callable;
7+
import java.util.concurrent.CompletionService;
8+
import java.util.concurrent.ExecutorCompletionService;
119
import java.util.concurrent.ExecutorService;
1210
import java.util.concurrent.Executors;
1311
import java.util.concurrent.ThreadLocalRandom;
12+
import java.util.concurrent.TimeUnit;
1413
import java.util.logging.Level;
1514

16-
import static org.junit.Assert.assertEquals;
15+
import org.junit.Rule;
16+
import org.junit.Test;
17+
import org.jvnet.hudson.test.JenkinsRule;
18+
import org.jvnet.hudson.test.LoggerRule;
1719

1820
public class KubernetesProvisioningLimitsTest {
21+
1922
@Rule
2023
public JenkinsRule j = new JenkinsRule();
2124

@@ -25,40 +28,63 @@ public class KubernetesProvisioningLimitsTest {
2528
@Test
2629
public void lotsOfCloudsAndTemplates() throws InterruptedException {
2730
ThreadLocalRandom testRandom = ThreadLocalRandom.current();
28-
for (int i = 1; i <= 10; i++) {
31+
for (int i = 1; i < 4; i++) {
2932
KubernetesCloud cloud = new KubernetesCloud("kubernetes-" + i);
30-
cloud.setContainerCap(testRandom.nextInt(40)+1);
31-
for (int j = 1; j < 10; j++) {
33+
cloud.setContainerCap(testRandom.nextInt(4)+1);
34+
for (int j = 1; j < 4; j++) {
3235
PodTemplate pt = new PodTemplate();
36+
pt.setName(cloud.name + "-podTemplate-" + j);
3337
pt.setInstanceCap(testRandom.nextInt(4)+1);
3438
cloud.addTemplate(pt);
3539
}
3640
j.jenkins.clouds.add(cloud);
3741
}
38-
ExecutorService EXEC = Executors.newFixedThreadPool(100);
39-
List<Callable<Void>> tasks = new ArrayList<>();
42+
43+
ExecutorService threadPool = Executors.newCachedThreadPool();
44+
System.out.println(threadPool.getClass().getName());
45+
CompletionService<Void> ecs = new ExecutorCompletionService<>(threadPool);
46+
KubernetesProvisioningLimits kubernetesProvisioningLimits = KubernetesProvisioningLimits.get();
47+
48+
List<KubernetesCloud> clouds = j.jenkins.clouds.getAll(KubernetesCloud.class);
4049
for (int k = 0 ; k < 1000; k++) {
41-
Callable<Void> c = () -> {
50+
ecs.submit(() -> {
4251
ThreadLocalRandom random = ThreadLocalRandom.current();
43-
List<KubernetesCloud> clouds = j.jenkins.clouds.getAll(KubernetesCloud.class);
4452
KubernetesCloud cloud = clouds.get(random.nextInt(clouds.size()));
4553
List<PodTemplate> templates = cloud.getTemplates();
4654
PodTemplate podTemplate = templates.get(random.nextInt(templates.size()));
47-
if (KubernetesProvisioningLimits.get().register(cloud, podTemplate, 1)) {
48-
Thread.sleep(random.nextInt(3) * 1000);
49-
KubernetesProvisioningLimits.get().unregister(cloud, podTemplate, 1);
55+
while (!kubernetesProvisioningLimits.register(cloud, podTemplate, 1)) {
56+
try {
57+
Thread.sleep(8);
58+
} catch(InterruptedException e) {
59+
Thread.currentThread().interrupt();
60+
break;
61+
}
5062
}
51-
return null;
52-
};
53-
tasks.add(c);
63+
64+
ecs.submit(() -> {
65+
kubernetesProvisioningLimits.unregister(cloud, podTemplate, 1);
66+
}, null);
67+
}, null);
5468
}
55-
EXEC.invokeAll(tasks);
69+
70+
while (ecs.poll(20, TimeUnit.SECONDS) != null) {
71+
try {
72+
Thread.sleep(8);
73+
} catch(InterruptedException e) {
74+
Thread.currentThread().interrupt();
75+
break;
76+
}
77+
}
78+
79+
threadPool.shutdown();
80+
assertTrue(threadPool.awaitTermination(60, TimeUnit.SECONDS));
81+
assertTrue(threadPool.shutdownNow().size() == 0);
5682

5783
// Check that every count is back to 0
5884
for (KubernetesCloud cloud : j.jenkins.clouds.getAll(KubernetesCloud.class)) {
59-
assertEquals(0, KubernetesProvisioningLimits.get().getGlobalCount(cloud.name).get());
85+
assertEquals(0, KubernetesProvisioningLimits.get().getGlobalCount(cloud.name));
6086
for (PodTemplate template : cloud.getTemplates()) {
61-
assertEquals(0, KubernetesProvisioningLimits.get().getPodTemplateCount(template.getId()).get());
87+
assertEquals(0, KubernetesProvisioningLimits.get().getPodTemplateCount(template.getId()));
6288
}
6389
}
6490
}

0 commit comments

Comments
 (0)