Skip to content

Commit 5db27ec

Browse files
YARN-11844: Support configuration of retry policy on GPU discovery
Closes #7857 Co-authored-by: Jayadeep Jayaraman <[email protected]> Reviewed-by: Ashutosh Gupta <[email protected]>
1 parent 74bb044 commit 5db27ec

File tree

5 files changed

+100
-13
lines changed

5 files changed

+100
-13
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) {
19171917
public static final String NM_GPU_PATH_TO_EXEC =
19181918
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
19191919

1920+
/**
1921+
* Sets the maximum duration for executions of the discovery binary.
1922+
*/
1923+
@Private
1924+
public static final String NM_GPU_DISCOVERY_TIMEOUT =
1925+
NM_GPU_RESOURCE_PREFIX + "discovery-timeout";
1926+
1927+
@Private
1928+
public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10000ms";
1929+
1930+
/**
1931+
* Sets the maximum number of errors allowed from the discovery binary.
1932+
*/
1933+
@Private
1934+
public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
1935+
NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";
1936+
1937+
@Private
1938+
public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;
1939+
19201940
/**
19211941
* Settings to control which implementation of docker plugin for GPU will be
19221942
* used.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4650,6 +4650,34 @@
46504650
<value></value>
46514651
</property>
46524652

4653+
<property>
4654+
<description>
4655+
Sets the maximum duration for executions of the discovery binary defined in
4656+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4657+
the binary takes longer than this amount of time to run, then the process
4658+
is aborted. Discovery may be attempted again, depending on
4659+
yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
4660+
</description>
4661+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
4662+
<value>10000ms</value>
4663+
</property>
4664+
4665+
<property>
4666+
<description>
4667+
Sets the maximum number of errors allowed from the discovery binary
4668+
defined in
4669+
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
4670+
the number of errors reaches this amount, then discovery is aborted, and
4671+
the NodeManager will not attempt discovery again. Errors may be either
4672+
non-zero exit codes returned from the binary or timeouts as defined by
4673+
yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
4674+
negative value to disable enforcement of max errors and retry continually
4675+
until successful.
4676+
</description>
4677+
<name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
4678+
<value>10</value>
4679+
</property>
4680+
46534681
<property>
46544682
<description>
46554683
Enable additional discovery/isolation of resources on the NodeManager,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import java.util.List;
4646
import java.util.Map;
4747
import java.util.Set;
48-
48+
import java.util.concurrent.TimeUnit;
4949

5050
@InterfaceAudience.Private
5151
@InterfaceStability.Unstable
@@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
6161
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
6262
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
6363

64-
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
65-
6664
private NvidiaBinaryHelper nvidiaBinaryHelper;
6765
private String pathOfGpuBinary = null;
66+
private long discoveryTimeoutMs;
67+
private int discoveryMaxErrors;
6868
private Map<String, String> environment = new HashMap<>();
6969

7070
private int numOfErrorExecutionSinceLastSucceed = 0;
@@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {
8686

8787
private String getErrorMessageOfScriptExecutionThresholdReached() {
8888
return getFailedToExecuteScriptMessage() + " for " +
89-
MAX_REPEATED_ERROR_ALLOWED + " times, " +
89+
discoveryMaxErrors + " times, " +
9090
"skipping following executions!";
9191
}
9292

@@ -114,15 +114,17 @@ private String getFailedToParseErrorMessage(String msg) {
114114
*/
115115
public synchronized GpuDeviceInformation getGpuDeviceInformation()
116116
throws YarnException {
117-
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
117+
if (discoveryMaxErrors >= 0 &&
118+
numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
118119
String msg = getErrorMessageOfScriptExecutionThresholdReached();
119120
LOG.error(msg);
120121
throw new YarnException(msg);
121122
}
122123

123124
try {
124125
lastDiscoveredGpuInformation =
125-
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
126+
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
127+
discoveryTimeoutMs);
126128
} catch (IOException e) {
127129
numOfErrorExecutionSinceLastSucceed++;
128130
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
298300
}
299301

300302
pathOfGpuBinary = binaryPath.getAbsolutePath();
303+
304+
discoveryTimeoutMs = config.getTimeDuration(
305+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
306+
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
307+
TimeUnit.MILLISECONDS);
308+
309+
discoveryMaxErrors = config.getInt(
310+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
311+
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);
312+
301313
}
302314

303315
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,6 @@
3434
*
3535
*/
3636
public class NvidiaBinaryHelper {
37-
/**
38-
* command should not run more than 10 sec.
39-
*/
40-
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
4137

4238
/**
4339
* @param pathOfGpuBinary The path of the binary
@@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
4743
* or the output parse failed
4844
*/
4945
synchronized GpuDeviceInformation getGpuDeviceInformation(
50-
String pathOfGpuBinary) throws IOException, YarnException {
46+
String pathOfGpuBinary, long discoveryTimeoutMs)
47+
throws IOException, YarnException {
5148
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
5249

5350
if (pathOfGpuBinary == null) {
@@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
5754
}
5855

5956
String output = Shell.execCommand(new HashMap<>(),
60-
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
57+
new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
6158
return parser.parseXml(output);
6259
}
6360
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,36 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
297297
assertNotNull(discoverer.getGpusUsableByYarn());
298298
}
299299

300+
@Test
301+
public void testGetGpuDeviceInformationDisableMaxErrors()
302+
throws YarnException, IOException {
303+
Configuration conf = new Configuration(false);
304+
// A negative value should disable max errors enforcement.
305+
conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);
306+
307+
File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
308+
this::createFaultyNvidiaSmiScript);
309+
310+
GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
311+
assertEquals(fakeBinary.getAbsolutePath(),
312+
discoverer.getPathOfGpuBinary());
313+
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
314+
315+
final String terminateMsg = "Failed to execute GPU device " +
316+
"detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";
317+
final String msg = "Failed to execute GPU device detection script";
318+
319+
// The default max errors is 10. Verify that it keeps going for an 11th try.
320+
for (int i = 0; i < 11; ++i) {
321+
YarnException exception = assertThrows(YarnException.class, () -> {
322+
discoverer.getGpuDeviceInformation();
323+
});
324+
325+
assertThat(exception.getMessage()).contains(msg);
326+
assertThat(exception.getMessage()).doesNotContain(terminateMsg);
327+
}
328+
}
329+
300330
@Test
301331
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
302332
throws YarnException, IOException {
@@ -545,4 +575,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
545575
"nvidia-smi", "badfile");
546576
assertThat(yarnException.getMessage()).contains(format);
547577
}
548-
}
578+
}

0 commit comments

Comments
 (0)