From 5db27ec6b9eff06b8b9c315de223c1be31167155 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Tue, 5 Aug 2025 17:58:56 +0000 Subject: [PATCH 1/5] YARN-11844: Support configuration of retry policy on GPU discovery Closes #7857 Co-authored-by: Jayadeep Jayaraman Reviewed-by: Ashutosh Gupta --- .../hadoop/yarn/conf/YarnConfiguration.java | 20 ++++++++++++ .../src/main/resources/yarn-default.xml | 28 ++++++++++++++++ .../resourceplugin/gpu/GpuDiscoverer.java | 24 ++++++++++---- .../gpu/NvidiaBinaryHelper.java | 9 ++---- .../resourceplugin/gpu/TestGpuDiscoverer.java | 32 ++++++++++++++++++- 5 files changed, 100 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index f02ad15e3dbc3..f107068853c20 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; + /** + * Sets the maximum duration for executions of the discovery binary. + */ + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT = + NM_GPU_RESOURCE_PREFIX + "discovery-timeout"; + + @Private + public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10000ms"; + + /** + * Sets the maximum number of errors allowed from the discovery binary. 
+ */ + @Private + public static final String NM_GPU_DISCOVERY_MAX_ERRORS = + NM_GPU_RESOURCE_PREFIX + "discovery-max-errors"; + + @Private + public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10; + /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 9013627eb7595..4ea5f28bdc58c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -4650,6 +4650,34 @@ + + + Sets the maximum duration for executions of the discovery binary defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If + the binary takes longer than this amount of time to run, then the process + is aborted. Discovery may be attempted again, depending on + yarn.nodemanager.resource-plugins.gpu.discovery-max-errors. + + yarn.nodemanager.resource-plugins.gpu.discovery-timeout + 10000ms + + + + + Sets the maximum number of errors allowed from the discovery binary + defined in + yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If + the number of errors exceeds this amount, then discovery is aborted, and + the NodeManager will never reattempt discovery again. Errors may be either + non-zero exit codes returned from the binary or timeouts as defined by + yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a + negative value to disable enforcement of max errors and retry continually + until successful. 
+ + yarn.nodemanager.resource-plugins.gpu.discovery-max-errors + 10 + + Enable additional discovery/isolation of resources on the NodeManager, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 60314c38374f8..93d8451d1e750 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -45,7 +45,7 @@ import java.util.List; import java.util.Map; import java.util.Set; - +import java.util.concurrent.TimeUnit; @InterfaceAudience.Private @InterfaceStability.Unstable @@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured { private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - private static final int MAX_REPEATED_ERROR_ALLOWED = 10; - private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; + private long discoveryTimeoutMs; + private int discoveryMaxErrors; private Map environment = new HashMap<>(); private int numOfErrorExecutionSinceLastSucceed = 0; @@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) { private String getErrorMessageOfScriptExecutionThresholdReached() { return getFailedToExecuteScriptMessage() + " for " + - MAX_REPEATED_ERROR_ALLOWED + " times, " + + discoveryMaxErrors + " times, " + "skipping following executions!"; } @@ -114,7 +114,8 
@@ private String getFailedToParseErrorMessage(String msg) { */ public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + if (discoveryMaxErrors >= 0 && + numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) { String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); @@ -122,7 +123,8 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation() try { lastDiscoveredGpuInformation = - nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary, + discoveryTimeoutMs); } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getErrorMessageOfScriptExecution(e.getMessage()); @@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config) } pathOfGpuBinary = binaryPath.getAbsolutePath(); + + discoveryTimeoutMs = config.getTimeDuration( + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT, + YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT, + TimeUnit.MILLISECONDS); + + discoveryMaxErrors = config.getInt( + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, + YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT); + } private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java index 8efc32a8b1330..2c206feaa4936 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -34,10 +34,6 @@ * */ public class NvidiaBinaryHelper { - /** - * command should not run more than 10 sec. - */ - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; /** * @param pathOfGpuBinary The path of the binary @@ -47,7 +43,8 @@ public class NvidiaBinaryHelper { * or the output parse failed */ synchronized GpuDeviceInformation getGpuDeviceInformation( - String pathOfGpuBinary) throws IOException, YarnException { + String pathOfGpuBinary, long discoveryTimeoutMs) + throws IOException, YarnException { GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); if (pathOfGpuBinary == null) { @@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation( } String output = Shell.execCommand(new HashMap<>(), - new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS); + new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs); return parser.parseXml(output); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index ca2d5b6d3e756..0f417559b1dc0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -297,6 +297,36 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun() assertNotNull(discoverer.getGpusUsableByYarn()); } + @Test + public void testGetGpuDeviceInformationDisableMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // A negative value should disable max errors enforcement. + conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1); + + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createFaultyNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + final String msg = "Failed to execute GPU device detection script"; + + // The default max errors is 10. Verify that it keeps going for an 11th try. 
+ for (int i = 0; i < 11; ++i) { + YarnException exception = assertThrows(YarnException.class, () -> { + discoverer.getGpuDeviceInformation(); + }); + + assertThat(exception.getMessage()).contains(msg); + assertThat(exception.getMessage()).doesNotContain(terminateMsg); + } + } + @Test public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml() throws YarnException, IOException { @@ -545,4 +575,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException { "nvidia-smi", "badfile"); assertThat(yarnException.getMessage()).contains(format); } -} \ No newline at end of file +} From 7608230d90d8ded10d5937ad9694c750460df070 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Thu, 7 Aug 2025 22:48:40 +0000 Subject: [PATCH 2/5] YARN-11844: code review feedback --- .../hadoop/yarn/conf/YarnConfiguration.java | 2 +- .../src/main/resources/yarn-default.xml | 2 +- .../resourceplugin/gpu/TestGpuDiscoverer.java | 60 ++++++++++++++++++- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index f107068853c20..777617d7e640a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1925,7 +1925,7 @@ public static boolean isAclEnabled(Configuration conf) { NM_GPU_RESOURCE_PREFIX + "discovery-timeout"; @Private - public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10000ms"; + public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s"; /** * Sets the maximum number of errors allowed from the discovery binary. 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 4ea5f28bdc58c..8c453cbeb8918 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -4659,7 +4659,7 @@ yarn.nodemanager.resource-plugins.gpu.discovery-max-errors. yarn.nodemanager.resource-plugins.gpu.discovery-timeout - 10000ms + 10s diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 0f417559b1dc0..78baccfee22a5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -297,6 +297,61 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun() assertNotNull(discoverer.getGpusUsableByYarn()); } + @Test + public void testGetGpuDeviceInformationOverrideMaxErrors() + throws YarnException, IOException { + Configuration conf = new Configuration(false); + // The default is 10 max errors. Override to 11. 
+ conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11); + + File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( + this::createNvidiaSmiScript); + + GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); + assertEquals(fakeBinary.getAbsolutePath(), + discoverer.getPathOfGpuBinary()); + assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + + LOG.debug("Querying nvidia-smi correctly, once..."); + discoverer.getGpuDeviceInformation(); + + LOG.debug("Replacing script with faulty version!"); + createFaultyNvidiaSmiScript(fakeBinary); + + final String terminateMsg = "Failed to execute GPU device " + + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + final String msg = "Failed to execute GPU device detection script"; + + // We expect 11 attempts (not the default of 10). + for (int i = 0; i < 11; i++) { + try { + LOG.debug("Executing faulty nvidia-smi script..."); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage()).contains(msg); + assertThat(e.getMessage()).doesNotContain(terminateMsg); + } + } + + // On a 12th attempt, we've exceeded the configured max of 11, so we expect + // the termination message. + try { + LOG.debug("Executing faulty nvidia-smi script again..."
+ + "We should reach the error threshold now!"); + discoverer.getGpuDeviceInformation(); + fail("Query of GPU device info via nvidia-smi should fail as " + + "script should be faulty: " + fakeBinary); + } catch (YarnException e) { + assertThat(e.getMessage()).contains(terminateMsg); + } + + LOG.debug("Verifying if GPUs are still hold the value of " + + "first successful query"); + assertNotNull(discoverer.getGpusUsableByYarn()); + } + @Test public void testGetGpuDeviceInformationDisableMaxErrors() throws YarnException, IOException { @@ -316,8 +371,9 @@ public void testGetGpuDeviceInformationDisableMaxErrors() "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; final String msg = "Failed to execute GPU device detection script"; - // The default max errors is 10. Verify that it keeps going for an 11th try. - for (int i = 0; i < 11; ++i) { + // The default max errors is 10. Verify that it keeps going for more, and we + // never see the termination message. + for (int i = 0; i < 20; ++i) { YarnException exception = assertThrows(YarnException.class, () -> { discoverer.getGpuDeviceInformation(); }); From 42ccf817c7dfd6966634be0af48fcb8b92fa947d Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Fri, 8 Aug 2025 16:09:39 +0000 Subject: [PATCH 3/5] YARN-11844: Checkstyle --- .../resourceplugin/gpu/TestGpuDiscoverer.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 78baccfee22a5..c0e5cfe8e23cb 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -305,19 +305,13 @@ public void testGetGpuDeviceInformationOverrideMaxErrors() conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11); File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( - this::createNvidiaSmiScript); + this::createFaultyNvidiaSmiScript); GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); assertEquals(fakeBinary.getAbsolutePath(), discoverer.getPathOfGpuBinary()); assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); - LOG.debug("Querying nvidia-smi correctly, once..."); - discoverer.getGpuDeviceInformation(); - - LOG.debug("Replacing script with faulty version!"); - createFaultyNvidiaSmiScript(fakeBinary); - final String terminateMsg = "Failed to execute GPU device " + "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; final String msg = "Failed to execute GPU device detection script"; @@ -375,8 +369,8 @@ public void testGetGpuDeviceInformationDisableMaxErrors() // never see the termination message. 
for (int i = 0; i < 20; ++i) { YarnException exception = assertThrows(YarnException.class, () -> { - discoverer.getGpuDeviceInformation(); - }); + discoverer.getGpuDeviceInformation(); + }); assertThat(exception.getMessage()).contains(msg); assertThat(exception.getMessage()).doesNotContain(terminateMsg); From 5e299fd6911a3dbcfcea226463cbbda12a8936a6 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Fri, 8 Aug 2025 20:40:09 +0000 Subject: [PATCH 4/5] YARN-11844: Fix test assertion on error message to say "11 times" --- .../containermanager/resourceplugin/gpu/TestGpuDiscoverer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index c0e5cfe8e23cb..0618ce35943fb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -313,7 +313,7 @@ public void testGetGpuDeviceInformationOverrideMaxErrors() assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); final String terminateMsg = "Failed to execute GPU device " + - "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times"; + "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times"; final String msg = "Failed to execute GPU device detection script"; // We expect 11 attempts (not the default of 10). 
From acf84060b713234d5be8dd043624d441d77f79b4 Mon Sep 17 00:00:00 2001 From: Chris Nauroth Date: Sat, 9 Aug 2025 00:43:49 +0000 Subject: [PATCH 5/5] YARN-11844: Correct off-by-one in error count assertions --- .../resourceplugin/gpu/TestGpuDiscoverer.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 0618ce35943fb..6a22b070192e2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -304,14 +304,19 @@ public void testGetGpuDeviceInformationOverrideMaxErrors() // The default is 10 max errors. Override to 11. conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11); + // Initial creation will call the script once. Start out with a successful + // script. Otherwise, our error count assertions will be off by one later. 
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile( - this::createFaultyNvidiaSmiScript); + this::createNvidiaSmiScript); GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf); assertEquals(fakeBinary.getAbsolutePath(), discoverer.getPathOfGpuBinary()); assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); + LOG.debug("Replacing script with faulty version!"); + createFaultyNvidiaSmiScript(fakeBinary); + final String terminateMsg = "Failed to execute GPU device " + "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times"; final String msg = "Failed to execute GPU device detection script";