Skip to content

Commit 9385a6a

Browse files
YARN-11844: Support configuration of retry policy on GPU discovery
Closes #7857 Co-authored-by: Jayadeep Jayaraman <[email protected]> Reviewed-by: Ashutosh Gupta <[email protected]> Signed-off-by: Ayush Saxena <[email protected]> (cherry picked from commit 0f34922) (cherry picked from commit f2b69ba)
1 parent 448bbe1 commit 9385a6a

File tree

5 files changed

+158
-13
lines changed

5 files changed

+158
-13
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1831,6 +1831,26 @@ public static boolean isAclEnabled(Configuration conf) {
18311831
public static final String NM_GPU_PATH_TO_EXEC =
18321832
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
18331833

1834+
/**
1835+
* Sets the maximum duration for executions of the discovery binary.
1836+
*/
1837+
@Private
1838+
public static final String NM_GPU_DISCOVERY_TIMEOUT =
1839+
NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
1840+
1841+
@Private
1842+
public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";
1843+
1844+
/**
1845+
* Sets the maximum number of errors allowed from the discovery binary.
1846+
*/
1847+
@Private
1848+
public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
1849+
NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
1850+
1851+
@Private
1852+
public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
1853+
18341854
/**
18351855
* Settings to control which implementation of docker plugin for GPU will be
18361856
* used.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4363,6 +4363,34 @@
43634363
<value></value>
43644364
</property>
43654365

4366+
<property>
4367+
<description>
4368+
Sets the maximum duration for executions of the discovery binary defined in
4369+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4370+
the binary takes longer than this amount of time to run, then the process
4371+
is aborted. Discovery may be attempted again, depending on
4372+
yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
4373+
</description>
4374+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
4375+
<value>10s</value>
4376+
</property>
4377+
4378+
<property>
4379+
<description>
4380+
Sets the maximum number of errors allowed from the discovery binary
4381+
defined in
4382+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4383+
the number of errors reaches this amount, then discovery is aborted, and
4384+
the NodeManager will never reattempt discovery again. Errors may be either
4385+
non-zero exit codes returned from the binary or timeouts as defined by
4386+
yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
4387+
negative value to disable enforcement of max errors and retry continually
4388+
until successful.
4389+
</description>
4390+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
4391+
<value>10</value>
4392+
</property>
4393+
43664394
<property>
43674395
<description>
43684396
Enable additional discovery/isolation of resources on the NodeManager,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import java.util.List;
4444
import java.util.Map;
4545
import java.util.Set;
46-
46+
import java.util.concurrent.TimeUnit;
4747

4848
@InterfaceAudience.Private
4949
@InterfaceStability.Unstable
@@ -59,10 +59,10 @@ public class GpuDiscoverer extends Configured {
5959
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
6060
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
6161

62-
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
63-
6462
private NvidiaBinaryHelper nvidiaBinaryHelper;
6563
private String pathOfGpuBinary = null;
64+
private long discoveryTimeoutMs;
65+
private int discoveryMaxErrors;
6666
private Map<String, String> environment = new HashMap<>();
6767

6868
private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -84,7 +84,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
8484

8585
private String getErrorMessageOfScriptExecutionThresholdReached() {
8686
return getFailedToExecuteScriptMessage() + " for " +
87-
MAX_REPEATED_ERROR_ALLOWED + " times, " +
87+
discoveryMaxErrors + " times, " +
8888
"skipping following executions!";
8989
}
9090

@@ -112,15 +112,17 @@ private String getFailedToParseErrorMessage(String msg) {
112112
*/
113113
public synchronized GpuDeviceInformation getGpuDeviceInformation()
114114
throws YarnException {
115-
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
115+
if (discoveryMaxErrors >= 0 &&
116+
numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
116117
String msg = getErrorMessageOfScriptExecutionThresholdReached();
117118
LOG.error(msg);
118119
throw new YarnException(msg);
119120
}
120121

121122
try {
122123
lastDiscoveredGpuInformation =
123-
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
124+
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
125+
discoveryTimeoutMs);
124126
} catch (IOException e) {
125127
numOfErrorExecutionSinceLastSucceed++;
126128
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -295,6 +297,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
295297
}
296298

297299
pathOfGpuBinary = binaryPath.getAbsolutePath();
300+
301+
discoveryTimeoutMs = config.getTimeDuration(
302+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
303+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
304+
TimeUnit.MILLISECONDS);
305+
306+
discoveryMaxErrors = config.getInt(
307+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
308+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
309+
298310
}
299311

300312
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@
3434
*
3535
*/
3636
public class NvidiaBinaryHelper {
37-
/**
38-
* command should not run more than 10 sec.
39-
*/
40-
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
4137

4238
/**
4339
* @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
4743
* or the output parse failed
4844
*/
4945
synchronized GpuDeviceInformation getGpuDeviceInformation(
50-
String pathOfGpuBinary) throws IOException, YarnException {
46+
String pathOfGpuBinary, long discoveryTimeoutMs)
47+
throws IOException, YarnException {
5148
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
5249

5350
if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
5754
}
5855

5956
String output = Shell.execCommand(new HashMap<>(),
60-
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
57+
new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
6158
return parser.parseXml(output);
6259
}
6360
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,94 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
291291
assertNotNull(discoverer.getGpusUsableByYarn());
292292
}
293293

294+
@Test
295+
public void testGetGpuDeviceInformationOverrideMaxErrors()
296+
throws YarnException, IOException {
297+
Configuration conf = new Configuration(false);
298+
// The default is 10 max errors. Override to 11.
299+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11);
300+
301+
// Initial creation will call the script once. Start out with a successful
302+
// script. Otherwise, our error count assertions will be off by one later.
303+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
304+
this::createNvidiaSmiScript);
305+
306+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
307+
assertEquals(fakeBinary.getAbsolutePath(),
308+
discoverer.getPathOfGpuBinary());
309+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
310+
311+
LOG.debug("Replacing script with faulty version!");
312+
createFaultyNvidiaSmiScript(fakeBinary);
313+
314+
final String terminateMsg = "Failed to execute GPU device " +
315+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times";
316+
final String msg = "Failed to execute GPU device detection script";
317+
318+
// We expect 11 attempts (not the default of 10).
319+
for (int i = 0; i < 11; i++) {
320+
try {
321+
LOG.debug("Executing faulty nvidia-smi script...");
322+
discoverer.getGpuDeviceInformation();
323+
fail("Query of GPU device info via nvidia-smi should fail as " +
324+
"script should be faulty: " + fakeBinary);
325+
} catch (YarnException e) {
326+
assertThat(e.getMessage(), containsString(msg));
327+
assertThat(e.getMessage(), not(containsString(terminateMsg)));
328+
}
329+
}
330+
331+
// On a 12th attempt, we've reached the configured max of 11, so we expect
332+
// the termination message.
333+
try {
334+
LOG.debug("Executing faulty nvidia-smi script again..." +
335+
"We should reach the error threshold now!");
336+
discoverer.getGpuDeviceInformation();
337+
fail("Query of GPU device info via nvidia-smi should fail as " +
338+
"script should be faulty: " + fakeBinary);
339+
} catch (YarnException e) {
340+
assertThat(e.getMessage(), containsString(terminateMsg));
341+
}
342+
343+
LOG.debug("Verifying if GPUs are still hold the value of " +
344+
"first successful query");
345+
assertNotNull(discoverer.getGpusUsableByYarn());
346+
}
347+
348+
@Test
349+
public void testGetGpuDeviceInformationDisableMaxErrors()
350+
throws YarnException, IOException {
351+
Configuration conf = new Configuration(false);
352+
// A negative value should disable max errors enforcement.
353+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
354+
355+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
356+
this::createFaultyNvidiaSmiScript);
357+
358+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
359+
assertEquals(fakeBinary.getAbsolutePath(),
360+
discoverer.getPathOfGpuBinary());
361+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
362+
363+
final String terminateMsg = "Failed to execute GPU device " +
364+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
365+
final String msg = "Failed to execute GPU device detection script";
366+
367+
// The default max errors is 10. Verify that it keeps going for more, and we
368+
// never see the termination message.
369+
for (int i = 0; i < 20; ++i) {
370+
try {
371+
LOG.debug("Executing faulty nvidia-smi script...");
372+
discoverer.getGpuDeviceInformation();
373+
fail("Query of GPU device info via nvidia-smi should fail as " +
374+
"script should be faulty: " + fakeBinary);
375+
} catch (YarnException e) {
376+
assertThat(e.getMessage(), containsString(msg));
377+
assertThat(e.getMessage(), not(containsString(terminateMsg)));
378+
}
379+
}
380+
}
381+
294382
@Test
295383
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
296384
throws YarnException, IOException {
@@ -513,4 +601,4 @@ public void testScriptNotCalled() throws YarnException, IOException {
513601

514602
verify(gpuSpy, never()).getGpuDeviceInformation();
515603
}
516-
}
604+
}

0 commit comments

Comments
 (0)