Skip to content

Commit 543bfc5

Browse files
committed
cells: use core domain endpoint only if hostname resolable
Motivation: Recently we have observe cases whee tunnel connections failed with: java.lang.NullPointerException: null at java.base/java.net.Socket.<init>(Socket.java:501) at java.base/java.net.Socket.<init>(Socket.java:319) at java.base/javax.net.DefaultSocketFactory.createSocket(SocketFactory.java:277) at dmg.cells.network.LocationManagerConnector.connect(LocationManagerConnector.java:66) at dmg.cells.network.LocationManagerConnector.run(LocationManagerConnector.java:96) at dmg.cells.nucleus.CellNucleus.lambda$wrapLoggingContext$2(CellNucleus.java:725) at java.base/java.lang.Thread.run(Thread.java:840) This is possible only if hostname can't be resolve. Such tunnels die and never re-connect. Modification: Update zookeeper node update logic to ensure that we accept endpoint only if hostname is resolvable. Result: More robust tunnel handling. Issue: #5326 Acked-by: Marina Sahakyan Target: master, 10.2 Require-book: no Require-notes: yes
1 parent 6d85e9a commit 543bfc5

File tree

1 file changed

+42
-27
lines changed

1 file changed

+42
-27
lines changed

modules/cells/src/main/java/dmg/cells/services/LocationManager.java

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import org.apache.curator.framework.recipes.cache.ChildData;
5959
import org.apache.curator.framework.recipes.cache.PathChildrenCache;
6060
import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
61+
import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent.Type;
6162
import org.apache.curator.utils.CloseableUtils;
6263
import org.apache.curator.utils.ZKPaths;
6364
import org.dcache.ssl.CanlSslSocketCreator;
@@ -176,7 +177,9 @@ public CoreDomainInfo(byte[] bytes) {
176177
switch (entry.getScheme()) {
177178
case "tls":
178179
case "tcp":
179-
endpoints.add(entry);
180+
if (canResoveHost(entry)) {
181+
endpoints.add(entry);
182+
}
180183
break;
181184
default:
182185
LOGGER.warn("Unknown Scheme for LocationManager Cores: {}", entry);
@@ -189,6 +192,26 @@ public CoreDomainInfo(byte[] bytes) {
189192
}
190193
}
191194

195+
/**
196+
* Check if hostname provided by urc can be resolved.
197+
* @param endpoint core domain endpoint
198+
* @return true it host name can be resolved. Otherwise false.
199+
*/
200+
private boolean canResoveHost(URI endpoint) {
201+
var h = endpoint.getHost();
202+
if (h == null) {
203+
LOGGER.warn("Ignoring URL without host: {}", endpoint);
204+
return false;
205+
}
206+
try {
207+
var ip = java.net.InetAddress.getByName(h);
208+
} catch (UnknownHostException e) {
209+
LOGGER.warn("Ignoring unknown host: {} : {}", endpoint, e.toString());
210+
return false;
211+
}
212+
return true;
213+
}
214+
192215
void addCore(String scheme, String host, int port) {
193216
switch (scheme) {
194217
case "tls":
@@ -385,46 +408,38 @@ public void update(PathChildrenCacheEvent event) {
385408
LOGGER.info("{}", event);
386409
String cell;
387410

411+
if (hasNoData(event.getData())) {
412+
LOGGER.warn("Ignoring empty event {}", event.getType());
413+
return;
414+
}
415+
416+
String domain = ZKPaths.getNodeFromPath(event.getData().getPath());
417+
388418
switch (event.getType()) {
389419
case CHILD_REMOVED:
390-
cell = connectors.remove(ZKPaths.getNodeFromPath(event.getData().getPath()));
420+
cell = connectors.remove(domain);
391421
if (cell != null) {
392422
killConnector(cell);
393423
}
394424
break;
395425
case CHILD_UPDATED:
396-
if (hasNoData(event.getData())) {
397-
LOGGER.warn("Ignoring empty data on UPDATED for {}", event.getData().getPath());
398-
break;
399-
}
400-
cell = connectors.remove(
401-
ZKPaths.getNodeFromPath(event.getData().getPath()));
402-
if (cell != null) {
403-
killConnector(cell);
404-
}
405-
// fall through
406426
case CHILD_ADDED:
407-
if (hasNoData(event.getData())) {
408-
LOGGER.warn("Ignoring empty data on ADDED for {}", event.getData().getPath());
427+
428+
CoreDomainInfo info = infoFromZKEvent(event);
429+
if (info.endpoints.isEmpty()) {
430+
LOGGER.warn("Ignoring invalid core URI", domain);
409431
break;
410432
}
411433

412-
//Log if the Core Domain Information received is incompatible with previous
413-
CoreDomainInfo info = infoFromZKEvent(event);
414-
String domain = ZKPaths.getNodeFromPath(event.getData().getPath());
434+
if (event.getType() == Type.CHILD_UPDATED) {
435+
cell = connectors.remove(domain);
436+
if (cell != null) {
437+
killConnector(cell);
438+
}
439+
}
415440

416441
try {
417442
if (shouldConnectTo(domain)) {
418-
cell = connectors.remove(domain);
419-
if (cell != null) {
420-
LOGGER.error(
421-
"About to create tunnel to core domain {}, but to my surprise "
422-
+
423-
"a tunnel called {} already exists. Will kill it. Please contact "
424-
+
425-
"[email protected].", domain, cell);
426-
killConnector(cell);
427-
}
428443
cell = connectors.put(domain, startConnector(domain, info));
429444
if (cell != null) {
430445
LOGGER.error(

0 commit comments

Comments
 (0)