diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index f02ad15e3dbc3..777617d7e640a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; + /** + * Sets the maximum duration for executions of the discovery binary. + */ + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT = + NM_GPU_RESOURCE_PREFIX + "discovery-timeout"; + + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s"; + + /** + * Sets the maximum number of errors allowed from the discovery binary. + */ + @Private + public static final String NM_GPU_DISCOVERY_MAX_ERRORS = + NM_GPU_RESOURCE_PREFIX + "discovery-max-errors"; + + @Private + public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10; + /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 9013627eb7595..8c453cbeb8918 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -4650,6 +4650,34 @@ + + + Sets the maximum duration for executions of the discovery binary defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. 
If + the binary takes longer than this amount of time to run, then the process + is aborted. Discovery may be attempted again, depending on + yarn.nodemanager.resource-plugins.gpu.discovery-max-errors. + + yarn.nodemanager.resource-plugins.gpu.discovery-timeout + 10s + + + + + Sets the maximum number of errors allowed from the discovery binary + defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If + the number of errors exceeds this amount, then discovery is aborted, and + the NodeManager will not attempt discovery again. Errors may be either + non-zero exit codes returned from the binary or timeouts as defined by + yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a + negative value to disable enforcement of max errors and retry continually + until successful. + + yarn.nodemanager.resource-plugins.gpu.discovery-max-errors + 10 + + Enable additional discovery/isolation of resources on the NodeManager, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 60314c38374f8..93d8451d1e750 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -45,7 +45,7 @@ import java.util.List; import java.util.Map; import java.util.Set; - +import java.util.concurrent.TimeUnit; @InterfaceAudience.Private @InterfaceStability.Unstable @@ 
-61,10 +61,10 @@ public class GpuDiscoverer extends Configured { private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - private static final int MAX_REPEATED_ERROR_ALLOWED = 10; - private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; + private long discoveryTimeoutMs; + private int discoveryMaxErrors; private Map environment = new HashMap<>(); private int numOfErrorExecutionSinceLastSucceed = 0; @@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) { private String getErrorMessageOfScriptExecutionThresholdReached() { return getFailedToExecuteScriptMessage() + " for " + - MAX_REPEATED_ERROR_ALLOWED + " times, " + + discoveryMaxErrors + " times, " + "skipping following executions!"; } @@ -114,7 +114,8 @@ private String getFailedToParseErrorMessage(String msg) { */ public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + if (discoveryMaxErrors >= 0 && + numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) { String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); @@ -122,7 +123,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() try { lastDiscoveredGpuInformation = - nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary, + discoveryTimeoutMs); } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getErrorMessageOfScriptExecution(e.getMessage()); @@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config) } pathOfGpuBinary = binaryPath.getAbsolutePath(); + + discoveryTimeoutMs = config.getTimeDuration( + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT, + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT, + TimeUnit.MILLISECONDS); + + discoveryMaxErrors = 
config.getInt( + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT); + } private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java index 8efc32a8b1330..2c206feaa4936 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -34,10 +34,6 @@ * */ public class NvidiaBinaryHelper { - /** - * command should not run more than 10 sec. 
- */ - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; /** * @param pathOfGpuBinary The path of the binary @@ -47,7 +43,8 @@ public class NvidiaBinaryHelper { * or the output parse failed */ synchronized GpuDeviceInformation getGpuDeviceInformation( - String pathOfGpuBinary) throws IOException, YarnException { + String pathOfGpuBinary, long discoveryTimeoutMs) + throws IOException, YarnException { GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); if (pathOfGpuBinary == null) { @@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation( } String output = Shell.execCommand(new HashMap<>(), - new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS); + new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs); return parser.parseXml(output); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index ca2d5b6d3e756..6a22b070192e2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -297,6 +297,91 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun() assertNotNull(discoverer.getGpusUsableByYarn()); } + @Test + public void testGetGpuDeviceInformationOverrideMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // The default is 10 max 
errors. Override to 11. + conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11); + + // Initial creation will call the script once. Start out with a successful + // script. Otherwise, our error count assertions will be off by one later. + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + LOG.debug("Replacing script with faulty version!"); + createFaultyNvidiaSmiScript(fakeBinary); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times"; + final String msg = "Failed to execute GPU device detection script"; + + // We expect 11 attempts (not the default of 10). + for (int i = 0; i < 11; i++) { + try { + LOG.debug("Executing faulty nvidia-smi script..."); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage()).contains(msg); + assertThat(e.getMessage()).doesNotContain(terminateMsg); + } + } + + // On a 12th attempt, we've exceeded the configured max of 11, so we expect + // the termination message. + try { + LOG.debug("Executing faulty nvidia-smi script again..." 
+ + "We should reach the error threshold now!"); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage()).contains(terminateMsg); + } + + LOG.debug("Verifying if GPUs are still hold the value of " + + "first successful query"); + assertNotNull(discoverer.getGpusUsableByYarn()); + } + + @Test + public void testGetGpuDeviceInformationDisableMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // A negative value should disable max errors enforcement. + conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1); + + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createFaultyNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + final String msg = "Failed to execute GPU device detection script"; + + // The default max errors is 10. Verify that it keeps going for more, and we + // never see the termination message. + for (int i = 0; i < 20; ++i) { + YarnException exception = assertThrows(YarnException.class, () -> { + discoverer.getGpuDeviceInformation(); + }); + + assertThat(exception.getMessage()).contains(msg); + assertThat(exception.getMessage()).doesNotContain(terminateMsg); + } + } + @Test public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml() throws YarnException, IOException { @@ -545,4 +630,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException { "nvidia-smi", "badfile"); assertThat(yarnException.getMessage()).contains(format); } -} \ No newline at end of file +}