Skip to content

Commit f2b69ba

Browse files
YARN-11844: Support configuration of retry policy on GPU discovery
Closes #7857 Co-authored-by: Jayadeep Jayaraman <[email protected]> Reviewed-by: Ashutosh Gupta <[email protected]> Signed-off-by: Ayush Saxena <[email protected]> (cherry picked from commit 0f34922)
1 parent 148c4d7 commit f2b69ba

File tree

5 files changed

+158
-13
lines changed

5 files changed

+158
-13
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,6 +1901,26 @@ public static boolean isAclEnabled(Configuration conf) {
19011901
public static final String NM_GPU_PATH_TO_EXEC =
19021902
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
19031903

1904+
/**
1905+
* Sets the maximum duration for executions of the discovery binary.
1906+
*/
1907+
@Private
1908+
public static final String NM_GPU_DISCOVERY_TIMEOUT =
1909+
NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
1910+
1911+
@Private
1912+
public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";
1913+
1914+
/**
1915+
* Sets the maximum number of errors allowed from the discovery binary.
1916+
*/
1917+
@Private
1918+
public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
1919+
NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
1920+
1921+
@Private
1922+
public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
1923+
19041924
/**
19051925
* Settings to control which implementation of docker plugin for GPU will be
19061926
* used.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4593,6 +4593,34 @@
45934593
<value></value>
45944594
</property>
45954595

4596+
<property>
4597+
<description>
4598+
Sets the maximum duration for executions of the discovery binary defined in
4599+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4600+
the binary takes longer than this amount of time to run, then the process
4601+
is aborted. Discovery may be attempted again, depending on
4602+
yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
4603+
</description>
4604+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
4605+
<value>10s</value>
4606+
</property>
4607+
4608+
<property>
4609+
<description>
4610+
Sets the maximum number of errors allowed from the discovery binary
4611+
defined in
4612+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4613+
the number of errors exceeds this amount, then discovery is aborted, and
4614+
the NodeManager will not attempt discovery again. Errors may be either
4615+
non-zero exit codes returned from the binary or timeouts as defined by
4616+
yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
4617+
negative value to disable enforcement of max errors and retry continually
4618+
until successful.
4619+
</description>
4620+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
4621+
<value>10</value>
4622+
</property>
4623+
45964624
<property>
45974625
<description>
45984626
Enable additional discovery/isolation of resources on the NodeManager,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import java.util.List;
4646
import java.util.Map;
4747
import java.util.Set;
48-
48+
import java.util.concurrent.TimeUnit;
4949

5050
@InterfaceAudience.Private
5151
@InterfaceStability.Unstable
@@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
6161
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
6262
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
6363

64-
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
65-
6664
private NvidiaBinaryHelper nvidiaBinaryHelper;
6765
private String pathOfGpuBinary = null;
66+
private long discoveryTimeoutMs;
67+
private int discoveryMaxErrors;
6868
private Map<String, String> environment = new HashMap<>();
6969

7070
private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
8686

8787
private String getErrorMessageOfScriptExecutionThresholdReached() {
8888
return getFailedToExecuteScriptMessage() + " for " +
89-
MAX_REPEATED_ERROR_ALLOWED + " times, " +
89+
discoveryMaxErrors + " times, " +
9090
"skipping following executions!";
9191
}
9292

@@ -114,15 +114,17 @@ private String getFailedToParseErrorMessage(String msg) {
114114
*/
115115
public synchronized GpuDeviceInformation getGpuDeviceInformation()
116116
throws YarnException {
117-
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
117+
if (discoveryMaxErrors >= 0 &&
118+
numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
118119
String msg = getErrorMessageOfScriptExecutionThresholdReached();
119120
LOG.error(msg);
120121
throw new YarnException(msg);
121122
}
122123

123124
try {
124125
lastDiscoveredGpuInformation =
125-
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
126+
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
127+
discoveryTimeoutMs);
126128
} catch (IOException e) {
127129
numOfErrorExecutionSinceLastSucceed++;
128130
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
298300
}
299301

300302
pathOfGpuBinary = binaryPath.getAbsolutePath();
303+
304+
discoveryTimeoutMs = config.getTimeDuration(
305+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
306+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
307+
TimeUnit.MILLISECONDS);
308+
309+
discoveryMaxErrors = config.getInt(
310+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
311+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
312+
301313
}
302314

303315
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@
3434
*
3535
*/
3636
public class NvidiaBinaryHelper {
37-
/**
38-
* command should not run more than 10 sec.
39-
*/
40-
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
4137

4238
/**
4339
* @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
4743
* or the output parse failed
4844
*/
4945
synchronized GpuDeviceInformation getGpuDeviceInformation(
50-
String pathOfGpuBinary) throws IOException, YarnException {
46+
String pathOfGpuBinary, long discoveryTimeoutMs)
47+
throws IOException, YarnException {
5148
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
5249

5350
if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
5754
}
5855

5956
String output = Shell.execCommand(new HashMap<>(),
60-
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
57+
new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
6158
return parser.parseXml(output);
6259
}
6360
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,94 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
302302
assertNotNull(discoverer.getGpusUsableByYarn());
303303
}
304304

305+
@Test
306+
public void testGetGpuDeviceInformationOverrideMaxErrors()
307+
throws YarnException, IOException {
308+
Configuration conf = new Configuration(false);
309+
// The default is 10 max errors. Override to 11.
310+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, 11);
311+
312+
// Initial creation will call the script once. Start out with a successful
313+
// script. Otherwise, our error count assertions will be off by one later.
314+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
315+
this::createNvidiaSmiScript);
316+
317+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
318+
assertEquals(fakeBinary.getAbsolutePath(),
319+
discoverer.getPathOfGpuBinary());
320+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
321+
322+
LOG.debug("Replacing script with faulty version!");
323+
createFaultyNvidiaSmiScript(fakeBinary);
324+
325+
final String terminateMsg = "Failed to execute GPU device " +
326+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times";
327+
final String msg = "Failed to execute GPU device detection script";
328+
329+
// We expect 11 attempts (not the default of 10).
330+
for (int i = 0; i < 11; i++) {
331+
try {
332+
LOG.debug("Executing faulty nvidia-smi script...");
333+
discoverer.getGpuDeviceInformation();
334+
fail("Query of GPU device info via nvidia-smi should fail as " +
335+
"script should be faulty: " + fakeBinary);
336+
} catch (YarnException e) {
337+
assertThat(e.getMessage(), containsString(msg));
338+
assertThat(e.getMessage(), not(containsString(terminateMsg)));
339+
}
340+
}
341+
342+
// On the 12th attempt, we've exceeded the configured max of 11, so we expect
343+
// the termination message.
344+
try {
345+
LOG.debug("Executing faulty nvidia-smi script again..." +
346+
"We should reach the error threshold now!");
347+
discoverer.getGpuDeviceInformation();
348+
fail("Query of GPU device info via nvidia-smi should fail as " +
349+
"script should be faulty: " + fakeBinary);
350+
} catch (YarnException e) {
351+
assertThat(e.getMessage(), containsString(terminateMsg));
352+
}
353+
354+
LOG.debug("Verifying if GPUs are still hold the value of " +
355+
"first successful query");
356+
assertNotNull(discoverer.getGpusUsableByYarn());
357+
}
358+
359+
@Test
360+
public void testGetGpuDeviceInformationDisableMaxErrors()
361+
throws YarnException, IOException {
362+
Configuration conf = new Configuration(false);
363+
// A negative value should disable max errors enforcement.
364+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
365+
366+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
367+
this::createFaultyNvidiaSmiScript);
368+
369+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
370+
assertEquals(fakeBinary.getAbsolutePath(),
371+
discoverer.getPathOfGpuBinary());
372+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
373+
374+
final String terminateMsg = "Failed to execute GPU device " +
375+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
376+
final String msg = "Failed to execute GPU device detection script";
377+
378+
// The default max errors is 10. Verify that it keeps going for more, and we
379+
// never see the termination message.
380+
for (int i = 0; i < 20; ++i) {
381+
try {
382+
LOG.debug("Executing faulty nvidia-smi script...");
383+
discoverer.getGpuDeviceInformation();
384+
fail("Query of GPU device info via nvidia-smi should fail as " +
385+
"script should be faulty: " + fakeBinary);
386+
} catch (YarnException e) {
387+
assertThat(e.getMessage(), containsString(msg));
388+
assertThat(e.getMessage(), not(containsString(terminateMsg)));
389+
}
390+
}
391+
}
392+
305393
@Test
306394
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
307395
throws YarnException, IOException {
@@ -538,4 +626,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
538626
GpuDiscoverer plugin = new GpuDiscoverer();
539627
plugin.initialize(conf, binaryHelper);
540628
}
541-
}
629+
}

0 commit comments

Comments
 (0)