Skip to content

Commit 0f34922

Browse files
YARN-11844: Support configuration of retry policy on GPU discovery
Closes #7857 Co-authored-by: Jayadeep Jayaraman <[email protected]> Reviewed-by: Ashutosh Gupta <[email protected]> Signed-off-by: Ayush Saxena <[email protected]>
1 parent 4c7f9b0 commit 0f34922

File tree

5 files changed

+155
-13
lines changed

5 files changed

+155
-13
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) {
19171917
public static final String NM_GPU_PATH_TO_EXEC =
19181918
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
19191919

1920+
/**
1921+
* Sets the maximum duration for executions of the discovery binary.
1922+
*/
1923+
@Private
1924+
public static final String NM_GPU_DISCOVERY_TIMEOUT =
1925+
NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
1926+
1927+
@Private
1928+
public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";
1929+
1930+
/**
1931+
* Sets the maximum number of errors allowed from the discovery binary.
1932+
*/
1933+
@Private
1934+
public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
1935+
NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
1936+
1937+
@Private
1938+
public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
1939+
19201940
/**
19211941
* Settings to control which implementation of docker plugin for GPU will be
19221942
* used.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4650,6 +4650,34 @@
46504650
<value></value>
46514651
</property>
46524652

4653+
<property>
4654+
<description>
4655+
Sets the maximum duration for executions of the discovery binary defined in
4656+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4657+
the binary takes longer than this amount of time to run, then the process
4658+
is aborted. Discovery may be attempted again, depending on
4659+
yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
4660+
</description>
4661+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
4662+
<value>10s</value>
4663+
</property>
4664+
4665+
<property>
4666+
<description>
4667+
Sets the maximum number of errors allowed from the discovery binary
4668+
defined in
4669+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4670+
the number of errors reaches this amount, then discovery is aborted, and
4671+
the NodeManager will never attempt discovery again. Errors may be either
4672+
non-zero exit codes returned from the binary or timeouts as defined by
4673+
yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
4674+
negative value to disable enforcement of max errors and retry continually
4675+
until successful.
4676+
</description>
4677+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
4678+
<value>10</value>
4679+
</property>
4680+
46534681
<property>
46544682
<description>
46554683
Enable additional discovery/isolation of resources on the NodeManager,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import java.util.List;
4646
import java.util.Map;
4747
import java.util.Set;
48-
48+
import java.util.concurrent.TimeUnit;
4949

5050
@InterfaceAudience.Private
5151
@InterfaceStability.Unstable
@@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
6161
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
6262
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
6363

64-
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
65-
6664
private NvidiaBinaryHelper nvidiaBinaryHelper;
6765
private String pathOfGpuBinary = null;
66+
private long discoveryTimeoutMs;
67+
private int discoveryMaxErrors;
6868
private Map<String, String> environment = new HashMap<>();
6969

7070
private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
8686

8787
private String getErrorMessageOfScriptExecutionThresholdReached() {
8888
return getFailedToExecuteScriptMessage() + " for " +
89-
MAX_REPEATED_ERROR_ALLOWED + " times, " +
89+
discoveryMaxErrors + " times, " +
9090
"skipping following executions!";
9191
}
9292

@@ -114,15 +114,17 @@ private String getFailedToParseErrorMessage(String msg) {
114114
*/
115115
public synchronized GpuDeviceInformation getGpuDeviceInformation()
116116
throws YarnException {
117-
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
117+
if (discoveryMaxErrors >= 0 &&
118+
numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
118119
String msg = getErrorMessageOfScriptExecutionThresholdReached();
119120
LOG.error(msg);
120121
throw new YarnException(msg);
121122
}
122123

123124
try {
124125
lastDiscoveredGpuInformation =
125-
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
126+
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
127+
discoveryTimeoutMs);
126128
} catch (IOException e) {
127129
numOfErrorExecutionSinceLastSucceed++;
128130
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
298300
}
299301

300302
pathOfGpuBinary = binaryPath.getAbsolutePath();
303+
304+
discoveryTimeoutMs = config.getTimeDuration(
305+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
306+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
307+
TimeUnit.MILLISECONDS);
308+
309+
discoveryMaxErrors = config.getInt(
310+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
311+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
312+
301313
}
302314

303315
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@
3434
*
3535
*/
3636
public class NvidiaBinaryHelper {
37-
/**
38-
* command should not run more than 10 sec.
39-
*/
40-
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
4137

4238
/**
4339
* @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
4743
* or the output parse failed
4844
*/
4945
synchronized GpuDeviceInformation getGpuDeviceInformation(
50-
String pathOfGpuBinary) throws IOException, YarnException {
46+
String pathOfGpuBinary, long discoveryTimeoutMs)
47+
throws IOException, YarnException {
5148
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
5249

5350
if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
5754
}
5855

5956
String output = Shell.execCommand(new HashMap<>(),
60-
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
57+
new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
6158
return parser.parseXml(output);
6259
}
6360
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,91 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
297297
assertNotNull(discoverer.getGpusUsableByYarn());
298298
}
299299

300+
@Test
301+
public void testGetGpuDeviceInformationOverrideMaxErrors()
302+
throws YarnException, IOException {
303+
Configuration conf = new Configuration(false);
304+
// The default is 10 max errors. Override to 11.
305+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11);
306+
307+
// Initial creation will call the script once. Start out with a successful
308+
// script. Otherwise, our error count assertions will be off by one later.
309+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
310+
this::createNvidiaSmiScript);
311+
312+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
313+
assertEquals(fakeBinary.getAbsolutePath(),
314+
discoverer.getPathOfGpuBinary());
315+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
316+
317+
LOG.debug("Replacing script with faulty version!");
318+
createFaultyNvidiaSmiScript(fakeBinary);
319+
320+
final String terminateMsg = "Failed to execute GPU device " +
321+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times";
322+
final String msg = "Failed to execute GPU device detection script";
323+
324+
// We expect 11 attempts (not the default of 10).
325+
for (int i = 0; i < 11; i++) {
326+
try {
327+
LOG.debug("Executing faulty nvidia-smi script...");
328+
discoverer.getGpuDeviceInformation();
329+
fail("Query of GPU device info via nvidia-smi should fail as " +
330+
"script should be faulty: " + fakeBinary);
331+
} catch (YarnException e) {
332+
assertThat(e.getMessage()).contains(msg);
333+
assertThat(e.getMessage()).doesNotContain(terminateMsg);
334+
}
335+
}
336+
337+
// On a 12th attempt, we've exceeded the configured max of 11, so we expect
338+
// the termination message.
339+
try {
340+
LOG.debug("Executing faulty nvidia-smi script again..." +
341+
"We should reach the error threshold now!");
342+
discoverer.getGpuDeviceInformation();
343+
fail("Query of GPU device info via nvidia-smi should fail as " +
344+
"script should be faulty: " + fakeBinary);
345+
} catch (YarnException e) {
346+
assertThat(e.getMessage()).contains(terminateMsg);
347+
}
348+
349+
LOG.debug("Verifying if GPUs are still hold the value of " +
350+
"first successful query");
351+
assertNotNull(discoverer.getGpusUsableByYarn());
352+
}
353+
354+
@Test
355+
public void testGetGpuDeviceInformationDisableMaxErrors()
356+
throws YarnException, IOException {
357+
Configuration conf = new Configuration(false);
358+
// A negative value should disable max errors enforcement.
359+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
360+
361+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
362+
this::createFaultyNvidiaSmiScript);
363+
364+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
365+
assertEquals(fakeBinary.getAbsolutePath(),
366+
discoverer.getPathOfGpuBinary());
367+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
368+
369+
final String terminateMsg = "Failed to execute GPU device " +
370+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
371+
final String msg = "Failed to execute GPU device detection script";
372+
373+
// The default max errors is 10. Verify that it keeps going for more, and we
374+
// never see the termination message.
375+
for (int i = 0; i < 20; ++i) {
376+
YarnException exception = assertThrows(YarnException.class, () -> {
377+
discoverer.getGpuDeviceInformation();
378+
});
379+
380+
assertThat(exception.getMessage()).contains(msg);
381+
assertThat(exception.getMessage()).doesNotContain(terminateMsg);
382+
}
383+
}
384+
300385
@Test
301386
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
302387
throws YarnException, IOException {
@@ -545,4 +630,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
545630
"nvidia-smi", "badfile");
546631
assertThat(yarnException.getMessage()).contains(format);
547632
}
548-
}
633+
}

0 commit comments

Comments
 (0)