diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index f02ad15e3dbc3..777617d7e640a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) {
public static final String NM_GPU_PATH_TO_EXEC =
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
+ /**
+ * Sets the maximum duration for executions of the discovery binary.
+ * GpuDiscoverer reads this key as a time duration and converts it to
+ * milliseconds before handing it to the binary helper.
+ */
+ @Private
+ public static final String NM_GPU_DISCOVERY_TIMEOUT =
+ NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
+
+ /**
+ * Default value of {@link #NM_GPU_DISCOVERY_TIMEOUT}, expressed as a
+ * duration string.
+ */
+ @Private
+ public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";
+
+ /**
+ * Sets the maximum number of errors allowed from the discovery binary.
+ * GpuDiscoverer only enforces the limit when the configured value is
+ * non-negative; a negative value disables the check.
+ */
+ @Private
+ public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
+ NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
+
+ /**
+ * Default value of {@link #NM_GPU_DISCOVERY_MAX_ERRORS}.
+ */
+ @Private
+ public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
+
+
/**
* Settings to control which implementation of docker plugin for GPU will be
* used.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 9013627eb7595..8c453cbeb8918 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -4650,6 +4650,34 @@
+  <property>
+    <description>
+      Sets the maximum duration for executions of the discovery binary defined in
+      yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
+      the binary takes longer than this amount of time to run, then the process
+      is aborted. Discovery may be attempted again, depending on
+      yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
+    <value>10s</value>
+  </property>
+
+  <property>
+    <description>
+      Sets the maximum number of errors allowed from the discovery binary
+      defined in
+      yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
+      the number of errors reaches this amount, then discovery is aborted, and
+      the NodeManager will not attempt discovery again. Errors may be either
+      non-zero exit codes returned from the binary or timeouts as defined by
+      yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
+      negative value to disable enforcement of max errors and retry continually
+      until successful.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
+    <value>10</value>
+  </property>
+
Enable additional discovery/isolation of resources on the NodeManager,
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 60314c38374f8..93d8451d1e750 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -45,7 +45,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
+import java.util.concurrent.TimeUnit;
@InterfaceAudience.Private
@InterfaceStability.Unstable
@@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
- private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
-
private NvidiaBinaryHelper nvidiaBinaryHelper;
private String pathOfGpuBinary = null;
+ private long discoveryTimeoutMs;
+ private int discoveryMaxErrors;
private Map environment = new HashMap<>();
private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
private String getErrorMessageOfScriptExecutionThresholdReached() {
return getFailedToExecuteScriptMessage() + " for " +
- MAX_REPEATED_ERROR_ALLOWED + " times, " +
+ discoveryMaxErrors + " times, " +
"skipping following executions!";
}
@@ -114,7 +114,8 @@ private String getFailedToParseErrorMessage(String msg) {
*/
public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
- if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+ if (discoveryMaxErrors >= 0 &&
+ numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
@@ -122,7 +123,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
try {
lastDiscoveredGpuInformation =
- nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
+ nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
+ discoveryTimeoutMs);
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
}
pathOfGpuBinary = binaryPath.getAbsolutePath();
+
+ discoveryTimeoutMs = config.getTimeDuration(
+ YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
+ YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
+ TimeUnit.MILLISECONDS);
+
+ discoveryMaxErrors = config.getInt(
+ YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
+ YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
+
}
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
index 8efc32a8b1330..2c206feaa4936 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
@@ -34,10 +34,6 @@
*
*/
public class NvidiaBinaryHelper {
- /**
- * command should not run more than 10 sec.
- */
- private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
/**
* @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
* or the output parse failed
*/
synchronized GpuDeviceInformation getGpuDeviceInformation(
- String pathOfGpuBinary) throws IOException, YarnException {
+ String pathOfGpuBinary, long discoveryTimeoutMs)
+ throws IOException, YarnException {
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
}
String output = Shell.execCommand(new HashMap<>(),
- new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
+ new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
return parser.parseXml(output);
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index ca2d5b6d3e756..6a22b070192e2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -297,6 +297,91 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
assertNotNull(discoverer.getGpusUsableByYarn());
}
+  /**
+   * Verifies that a configured discovery-max-errors value of 11 takes
+   * precedence over the default of 10: eleven consecutive failing queries
+   * report the ordinary execution error, and only the twelfth query reports
+   * that the error threshold has been reached.
+   */
+  @Test
+  public void testGetGpuDeviceInformationOverrideMaxErrors()
+      throws YarnException, IOException {
+    // One more than the default of 10.
+    final int maxErrors = 11;
+    Configuration conf = new Configuration(false);
+    conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, maxErrors);
+
+    // Creating the discoverer already runs the script once, so begin with a
+    // working script; otherwise the error counts asserted below would be off
+    // by one.
+    File script = createFakeNvidiaSmiScriptAsRunnableFile(
+        this::createNvidiaSmiScript);
+    GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
+    assertEquals(script.getAbsolutePath(), discoverer.getPathOfGpuBinary());
+    assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
+
+    LOG.debug("Replacing script with faulty version!");
+    createFaultyNvidiaSmiScript(script);
+
+    final String executionFailedMsg =
+        "Failed to execute GPU device detection script";
+    final String thresholdReachedMsg = "Failed to execute GPU device " +
+        "detection script (" + script.getAbsolutePath() + ") for 11 times";
+
+    // Failures 1 through 11 stay within the configured limit, so each one
+    // must surface the plain execution error.
+    for (int attempt = 1; attempt <= maxErrors; attempt++) {
+      try {
+        LOG.debug("Executing faulty nvidia-smi script...");
+        discoverer.getGpuDeviceInformation();
+        fail("Query of GPU device info via nvidia-smi should fail as " +
+            "script should be faulty: " + script);
+      } catch (YarnException expected) {
+        String actual = expected.getMessage();
+        assertThat(actual).contains(executionFailedMsg);
+        assertThat(actual).doesNotContain(thresholdReachedMsg);
+      }
+    }
+
+    // The twelfth query comes after the configured limit of 11 has been hit,
+    // so the threshold message is expected this time.
+    try {
+      LOG.debug("Executing faulty nvidia-smi script again..." +
+          "We should reach the error threshold now!");
+      discoverer.getGpuDeviceInformation();
+      fail("Query of GPU device info via nvidia-smi should fail as " +
+          "script should be faulty: " + script);
+    } catch (YarnException expected) {
+      assertThat(expected.getMessage()).contains(thresholdReachedMsg);
+    }
+
+    LOG.debug("Verifying if GPUs are still hold the value of " +
+        "first successful query");
+    assertNotNull(discoverer.getGpusUsableByYarn());
+  }
+
+
+  /**
+   * Verifies that a negative discovery-max-errors value disables the error
+   * threshold: every failing query reports the ordinary execution error and
+   * the threshold-reached message never appears, even after more failures
+   * than the default limit of 10 would allow.
+   */
+  @Test
+  public void testGetGpuDeviceInformationDisableMaxErrors()
+      throws YarnException, IOException {
+    Configuration conf = new Configuration(false);
+    // A negative value should disable max errors enforcement.
+    conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
+
+    File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
+        this::createFaultyNvidiaSmiScript);
+
+    GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
+    assertEquals(fakeBinary.getAbsolutePath(),
+        discoverer.getPathOfGpuBinary());
+    assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
+
+    // The threshold-reached message embeds the configured limit (-1 here, not
+    // the default of 10), so match its count-independent tail rather than a
+    // specific "for N times" fragment that could never appear.
+    final String terminateMsg = "skipping following executions";
+    final String msg = "Failed to execute GPU device detection script";
+
+    // The default max errors is 10. Verify that it keeps going for more, and we
+    // never see the termination message.
+    for (int i = 0; i < 20; ++i) {
+      YarnException exception = assertThrows(YarnException.class, () -> {
+        discoverer.getGpuDeviceInformation();
+      });
+
+      assertThat(exception.getMessage()).contains(msg);
+      assertThat(exception.getMessage()).doesNotContain(terminateMsg);
+    }
+  }
+
+
@Test
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
throws YarnException, IOException {
@@ -545,4 +630,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
"nvidia-smi", "badfile");
assertThat(yarnException.getMessage()).contains(format);
}
-}
\ No newline at end of file
+}