Skip to content

Commit 6e7573f

Browse files
committed
GEODE-10490: Add retry logic for port binding failures in Locator and Server
When PortKeepers are closed immediately before service binding, there's a small race window where parallel test workers can steal the port. Added retry logic (up to 3 attempts) in LocatorStarterRule.startLocator() and ServerStarterRule.startServer() to handle 'Address already in use' errors by reallocating fresh ports and retrying. This addresses intermittent binding failures seen in CI integration tests like DisabledClusterConfigTest when multiple test workers compete for ports.
1 parent cb6b51b commit 6e7573f

File tree

2 files changed

+173
-71
lines changed

2 files changed

+173
-71
lines changed

geode-dunit/src/main/java/org/apache/geode/test/junit/rules/LocatorStarterRule.java

Lines changed: 65 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.geode.distributed.DistributedSystem;
2727
import org.apache.geode.distributed.internal.DistributionConfig;
2828
import org.apache.geode.distributed.internal.InternalLocator;
29+
import org.apache.geode.internal.UniquePortSupplier;
2930
import org.apache.geode.internal.cache.InternalCache;
3031
import org.apache.geode.test.dunit.rules.ClusterStartupRule;
3132

@@ -86,28 +87,73 @@ protected void stopMember() {
8687
}
8788

8889
public void startLocator() {
89-
try {
90-
// Close all keepers immediately before starting locator
91-
// This minimizes the race window to < 1ms (time between keeper.close() and service bind)
92-
if (memberPortKeeper != null) {
93-
memberPortKeeper.close();
94-
memberPortKeeper = null;
95-
}
96-
if (jmxPortKeeper != null) {
97-
jmxPortKeeper.close();
98-
jmxPortKeeper = null;
99-
}
100-
if (httpPortKeeper != null) {
101-
httpPortKeeper.close();
102-
httpPortKeeper = null;
90+
int maxRetries = 3;
91+
IOException lastException = null;
92+
93+
for (int attempt = 1; attempt <= maxRetries; attempt++) {
94+
try {
95+
// Close all keepers immediately before starting locator
96+
// This minimizes the race window to < 1ms (time between keeper.close() and service bind)
97+
if (memberPortKeeper != null) {
98+
memberPortKeeper.close();
99+
memberPortKeeper = null;
100+
}
101+
if (jmxPortKeeper != null) {
102+
jmxPortKeeper.close();
103+
jmxPortKeeper = null;
104+
}
105+
if (httpPortKeeper != null) {
106+
httpPortKeeper.close();
107+
httpPortKeeper = null;
108+
}
109+
110+
// Start locator - it will bind to memberPort, and services will bind to JMX/HTTP ports
111+
locator = (InternalLocator) startLocatorAndDS(memberPort, null, properties);
112+
break; // Success - exit retry loop
113+
114+
} catch (IOException e) {
115+
lastException = e;
116+
// Check if this is a port binding failure
117+
if (e.getMessage() != null && e.getMessage().contains("Address already in use")
118+
&& attempt < maxRetries) {
119+
// Port was stolen - allocate new ports and retry
120+
UniquePortSupplier portSupplier = new UniquePortSupplier();
121+
memberPort = portSupplier.getAvailablePort();
122+
try {
123+
memberPortKeeper = new PortKeeper(memberPort);
124+
// Don't reallocate JMX/HTTP ports if they're user-specified (>0)
125+
if (jmxPort <= 0) {
126+
int newJmxPort = portSupplier.getAvailablePort();
127+
jmxPortKeeper = new PortKeeper(newJmxPort);
128+
// Update properties with new JMX port
129+
if (jmxPort == 0) {
130+
properties.setProperty(DistributionConfig.JMX_MANAGER_PORT_NAME,
131+
String.valueOf(newJmxPort));
132+
}
133+
}
134+
if (httpPort <= 0) {
135+
int newHttpPort = portSupplier.getAvailablePort();
136+
httpPortKeeper = new PortKeeper(newHttpPort);
137+
// Update properties with new HTTP port
138+
if (httpPort == 0) {
139+
properties.setProperty(DistributionConfig.HTTP_SERVICE_PORT_NAME,
140+
String.valueOf(newHttpPort));
141+
}
142+
}
143+
} catch (IOException retryException) {
144+
// Failed to allocate new ports - will retry with what we have
145+
}
146+
continue; // Retry with new ports
147+
}
148+
// Not a retryable error or max retries reached
149+
throw new UncheckedIOException(e);
103150
}
151+
}
104152

105-
// Start locator - it will bind to memberPort, and services will bind to JMX/HTTP ports
106-
locator = (InternalLocator) startLocatorAndDS(memberPort, null, properties);
107-
108-
} catch (IOException e) {
109-
throw new UncheckedIOException(e);
153+
if (locator == null && lastException != null) {
154+
throw new UncheckedIOException(lastException);
110155
}
156+
111157
// memberPort is by default zero, which translates to "randomly select an available port,"
112158
// which is why it is updated here after being specified above.
113159
memberPort = locator.getPort();

geode-dunit/src/main/java/org/apache/geode/test/junit/rules/ServerStarterRule.java

Lines changed: 108 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.apache.geode.distributed.internal.DistributionConfig;
3434
import org.apache.geode.distributed.internal.InternalDistributedSystem;
3535
import org.apache.geode.internal.AvailablePortHelper;
36+
import org.apache.geode.internal.UniquePortSupplier;
3637
import org.apache.geode.internal.cache.GemFireCacheImpl;
3738
import org.apache.geode.internal.cache.InternalCache;
3839
import org.apache.geode.internal.cache.tier.sockets.CacheServerHelper;
@@ -198,61 +199,116 @@ public void startServer() {
198199
servers = new ArrayList<>();
199200
}
200201

201-
// Close all keepers immediately before starting server
202-
// This minimizes the race window to < 1ms (time between keeper.close() and service bind)
203-
if (memberPortKeeper != null) {
204-
memberPortKeeper.close();
205-
memberPortKeeper = null;
206-
}
207-
if (jmxPortKeeper != null) {
208-
jmxPortKeeper.close();
209-
jmxPortKeeper = null;
210-
}
211-
if (httpPortKeeper != null) {
212-
httpPortKeeper.close();
213-
httpPortKeeper = null;
214-
}
202+
int maxRetries = 3;
203+
RuntimeException lastException = null;
215204

216-
CacheFactory cf = new CacheFactory(properties);
217-
if (pdxPersistentUserSet) {
218-
cf.setPdxPersistent(pdxPersistent);
219-
}
220-
if (pdxReadSerializedUserSet) {
221-
cf.setPdxReadSerialized(pdxReadSerialized);
222-
}
223-
if (pdxSerializer != null) {
224-
cf.setPdxSerializer(pdxSerializer);
225-
}
226-
cache = (InternalCache) cf.create();
227-
DistributionConfig config =
228-
((InternalDistributedSystem) cache.getDistributedSystem()).getConfig();
229-
jmxPort = config.getJmxManagerPort();
230-
httpPort = config.getHttpServicePort();
231-
232-
for (int i = 0; i < serverCount; i++) {
233-
CacheServer server = cache.addCacheServer();
234-
if (i == 0) {
235-
CacheServerHelper.setIsDefaultServer(server);
236-
}
237-
// memberPort is by default zero, which translates to "randomly select an available port,"
238-
// which is why it is updated after this try block
239-
if (serverCount == 1) {
240-
server.setPort(memberPort);
241-
} else {
242-
server.setPort(0);
243-
}
244-
if (maxThreads >= 0) {
245-
server.setMaxThreads(maxThreads);
246-
}
205+
for (int attempt = 1; attempt <= maxRetries; attempt++) {
247206
try {
248-
server.start();
249-
} catch (IOException e) {
250-
throw new RuntimeException("unable to start server", e);
207+
// Close all keepers immediately before starting server
208+
// This minimizes the race window to < 1ms (time between keeper.close() and service bind)
209+
if (memberPortKeeper != null) {
210+
memberPortKeeper.close();
211+
memberPortKeeper = null;
212+
}
213+
if (jmxPortKeeper != null) {
214+
jmxPortKeeper.close();
215+
jmxPortKeeper = null;
216+
}
217+
if (httpPortKeeper != null) {
218+
httpPortKeeper.close();
219+
httpPortKeeper = null;
220+
}
221+
222+
CacheFactory cf = new CacheFactory(properties);
223+
if (pdxPersistentUserSet) {
224+
cf.setPdxPersistent(pdxPersistent);
225+
}
226+
if (pdxReadSerializedUserSet) {
227+
cf.setPdxReadSerialized(pdxReadSerialized);
228+
}
229+
if (pdxSerializer != null) {
230+
cf.setPdxSerializer(pdxSerializer);
231+
}
232+
cache = (InternalCache) cf.create();
233+
DistributionConfig config =
234+
((InternalDistributedSystem) cache.getDistributedSystem()).getConfig();
235+
jmxPort = config.getJmxManagerPort();
236+
httpPort = config.getHttpServicePort();
237+
238+
for (int i = 0; i < serverCount; i++) {
239+
CacheServer server = cache.addCacheServer();
240+
if (i == 0) {
241+
CacheServerHelper.setIsDefaultServer(server);
242+
}
243+
// memberPort is by default zero, which translates to "randomly select an available port,"
244+
// which is why it is updated after this try block
245+
if (serverCount == 1) {
246+
server.setPort(memberPort);
247+
} else {
248+
server.setPort(0);
249+
}
250+
if (maxThreads >= 0) {
251+
server.setMaxThreads(maxThreads);
252+
}
253+
try {
254+
server.start();
255+
} catch (IOException e) {
256+
// Check if this is a retryable port binding failure
257+
if (e.getMessage() != null && e.getMessage().contains("Address already in use")
258+
&& attempt < maxRetries && serverCount == 1) {
259+
// Port was stolen - clean up and retry with new ports
260+
cache.close();
261+
cache = null;
262+
servers.clear();
263+
throw new RuntimeException("Port binding failed, will retry", e);
264+
}
265+
throw new RuntimeException("unable to start server", e);
266+
}
267+
// if this member has multiple cache servers, the memberPort will be the last server's
268+
// port started.
269+
memberPort = server.getPort();
270+
servers.add(server);
271+
}
272+
return; // Success - exit retry loop
273+
274+
} catch (RuntimeException e) {
275+
lastException = e;
276+
if (e.getMessage() != null && e.getMessage().contains("Port binding failed, will retry")
277+
&& attempt < maxRetries) {
278+
// Allocate new ports and retry
279+
UniquePortSupplier portSupplier = new UniquePortSupplier();
280+
memberPort = portSupplier.getAvailablePort();
281+
try {
282+
memberPortKeeper = new PortKeeper(memberPort);
283+
// Don't reallocate JMX/HTTP ports if they're user-specified (>0)
284+
if (jmxPort <= 0) {
285+
int newJmxPort = portSupplier.getAvailablePort();
286+
jmxPortKeeper = new PortKeeper(newJmxPort);
287+
if (jmxPort == 0) {
288+
properties.setProperty(DistributionConfig.JMX_MANAGER_PORT_NAME,
289+
String.valueOf(newJmxPort));
290+
}
291+
}
292+
if (httpPort <= 0) {
293+
int newHttpPort = portSupplier.getAvailablePort();
294+
httpPortKeeper = new PortKeeper(newHttpPort);
295+
if (httpPort == 0) {
296+
properties.setProperty(DistributionConfig.HTTP_SERVICE_PORT_NAME,
297+
String.valueOf(newHttpPort));
298+
}
299+
}
300+
} catch (IOException retryException) {
301+
// Failed to allocate new ports - will retry with what we have
302+
}
303+
continue; // Retry with new ports
304+
}
305+
// Not a retryable error or max retries reached
306+
throw e;
251307
}
252-
// if this member has multiple cache servers, the memberPort will be the last server's port
253-
// started.
254-
memberPort = server.getPort();
255-
servers.add(server);
308+
}
309+
310+
if (cache == null && lastException != null) {
311+
throw lastException;
256312
}
257313
}
258314

0 commit comments

Comments
 (0)