Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1917,6 +1917,26 @@ public static boolean isAclEnabled(Configuration conf) {
public static final String NM_GPU_PATH_TO_EXEC =
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";

/**
* Sets the maximum duration for executions of the discovery binary.
* The value is a time duration string (for example "10s"); GpuDiscoverer
* reads it with Configuration#getTimeDuration in milliseconds.
*/
@Private
public static final String NM_GPU_DISCOVERY_TIMEOUT =
NM_GPU_RESOURCE_PREFIX + "discovery-timeout";

/**
* Default for {@link #NM_GPU_DISCOVERY_TIMEOUT}: ten seconds.
*/
@Private
public static final String NM_GPU_DISCOVERY_TIMEOUT_DEFAULT = "10s";

/**
* Sets the maximum number of errors allowed from the discovery binary.
* A negative value disables enforcement of the limit.
*/
@Private
public static final String NM_GPU_DISCOVERY_MAX_ERRORS =
NM_GPU_RESOURCE_PREFIX + "discovery-max-errors";

/**
* Default for {@link #NM_GPU_DISCOVERY_MAX_ERRORS}.
*/
@Private
public static final int NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT = 10;

/**
* Settings to control which implementation of docker plugin for GPU will be
* used.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4650,6 +4650,34 @@
<value></value>
</property>

<property>
<description>
Sets the maximum duration for executions of the discovery binary defined in
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
the binary takes longer than this amount of time to run, then the process
is aborted. Discovery may be attempted again, depending on
yarn.nodemanager.resource-plugins.gpu.discovery-max-errors.
</description>
<name>yarn.nodemanager.resource-plugins.gpu.discovery-timeout</name>
<value>10s</value>
</property>

<property>
<description>
Sets the maximum number of errors allowed from the discovery binary
defined in
yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables. If
the number of errors reaches this amount, then discovery is aborted, and
the NodeManager will not attempt discovery again. Errors may be either
non-zero exit codes returned from the binary or timeouts as defined by
yarn.nodemanager.resource-plugins.gpu.discovery-timeout. Set this to a
negative value to disable enforcement of max errors and retry continually
until successful.
</description>
<name>yarn.nodemanager.resource-plugins.gpu.discovery-max-errors</name>
<value>10</value>
</property>

<property>
<description>
Enable additional discovery/isolation of resources on the NodeManager,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;

import java.util.concurrent.TimeUnit;

@InterfaceAudience.Private
@InterfaceStability.Unstable
Expand All @@ -61,10 +61,10 @@ public class GpuDiscoverer extends Configured {
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin");

private static final int MAX_REPEATED_ERROR_ALLOWED = 10;

private NvidiaBinaryHelper nvidiaBinaryHelper;
private String pathOfGpuBinary = null;
private long discoveryTimeoutMs;
private int discoveryMaxErrors;
private Map<String, String> environment = new HashMap<>();

private int numOfErrorExecutionSinceLastSucceed = 0;
Expand All @@ -86,7 +86,7 @@ private String getErrorMessageOfScriptExecution(String msg) {

/**
 * Builds the error message used when the number of failed executions of the
 * discovery binary has reached the configured limit and further executions
 * are skipped.
 *
 * Only the configured limit ({@code discoveryMaxErrors}) is reported; the
 * former hard-coded constant is no longer part of the message, so the text
 * reads "... for N times, skipping following executions!".
 *
 * @return the threshold-reached error message
 */
private String getErrorMessageOfScriptExecutionThresholdReached() {
  return getFailedToExecuteScriptMessage() + " for " +
      discoveryMaxErrors + " times, " +
      "skipping following executions!";
}

Expand Down Expand Up @@ -114,15 +114,17 @@ private String getFailedToParseErrorMessage(String msg) {
*/
public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
if (discoveryMaxErrors >= 0 &&
numOfErrorExecutionSinceLastSucceed == discoveryMaxErrors) {
String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
}

try {
lastDiscoveredGpuInformation =
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary,
discoveryTimeoutMs);
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getErrorMessageOfScriptExecution(e.getMessage());
Expand Down Expand Up @@ -298,6 +300,16 @@ private void lookUpAutoDiscoveryBinary(Configuration config)
}

pathOfGpuBinary = binaryPath.getAbsolutePath();

discoveryTimeoutMs = config.getTimeDuration(
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT,
YarnConfiguration.NM_GPU_DISCOVERY_TIMEOUT_DEFAULT,
TimeUnit.MILLISECONDS);

discoveryMaxErrors = config.getInt(
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS_DEFAULT);

}

private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@
*
*/
public class NvidiaBinaryHelper {
/**
* command should not run more than 10 sec.
*/
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;

/**
* @param pathOfGpuBinary The path of the binary
Expand All @@ -47,7 +43,8 @@ public class NvidiaBinaryHelper {
* or the output parse failed
*/
synchronized GpuDeviceInformation getGpuDeviceInformation(
String pathOfGpuBinary) throws IOException, YarnException {
String pathOfGpuBinary, long discoveryTimeoutMs)
throws IOException, YarnException {
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();

if (pathOfGpuBinary == null) {
Expand All @@ -57,7 +54,7 @@ synchronized GpuDeviceInformation getGpuDeviceInformation(
}

String output = Shell.execCommand(new HashMap<>(),
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
new String[]{pathOfGpuBinary, "-x", "-q"}, discoveryTimeoutMs);
return parser.parseXml(output);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,91 @@ public void testGetGpuDeviceInformationFaultyNvidiaSmiScriptConsecutiveRun()
assertNotNull(discoverer.getGpusUsableByYarn());
}

@Test
public void testGetGpuDeviceInformationOverrideMaxErrors()
    throws YarnException, IOException {
  // Raise the error limit from the default (10) to 11.
  final int overriddenMaxErrors = 11;
  Configuration conf = new Configuration(false);
  conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS,
      overriddenMaxErrors);

  // Creating the discoverer runs the script once, so begin with a working
  // script; a failure here would shift every error count below by one.
  File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
      this::createNvidiaSmiScript);

  GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
  assertEquals(fakeBinary.getAbsolutePath(),
      discoverer.getPathOfGpuBinary());
  assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));

  LOG.debug("Replacing script with faulty version!");
  createFaultyNvidiaSmiScript(fakeBinary);

  final String genericFailure =
      "Failed to execute GPU device detection script";
  final String thresholdFailure = "Failed to execute GPU device " +
      "detection script (" + fakeBinary.getAbsolutePath() + ") for 11 times";

  // Each of the first 11 attempts must fail with the ordinary execution
  // error and must not yet report that the threshold was reached.
  for (int attempt = 1; attempt <= overriddenMaxErrors; attempt++) {
    try {
      LOG.debug("Executing faulty nvidia-smi script...");
      discoverer.getGpuDeviceInformation();
      fail("Query of GPU device info via nvidia-smi should fail as " +
          "script should be faulty: " + fakeBinary);
    } catch (YarnException expected) {
      assertThat(expected.getMessage())
          .contains(genericFailure)
          .doesNotContain(thresholdFailure);
    }
  }

  // The 12th attempt comes after the configured maximum of 11 errors has
  // been reached, so the termination message is expected.
  try {
    LOG.debug("Executing faulty nvidia-smi script again..." +
        "We should reach the error threshold now!");
    discoverer.getGpuDeviceInformation();
    fail("Query of GPU device info via nvidia-smi should fail as " +
        "script should be faulty: " + fakeBinary);
  } catch (YarnException expected) {
    assertThat(expected.getMessage()).contains(thresholdFailure);
  }

  LOG.debug("Verifying if GPUs are still hold the value of " +
      "first successful query");
  assertNotNull(discoverer.getGpusUsableByYarn());
}

@Test
public void testGetGpuDeviceInformationDisableMaxErrors()
    throws YarnException, IOException {
  // Configure a negative limit, which is expected to switch off the
  // max-errors check altogether.
  Configuration conf = new Configuration(false);
  conf.setInt(YarnConfiguration.NM_GPU_DISCOVERY_MAX_ERRORS, -1);

  File fakeBinary = createFakeNvidiaSmiScriptAsRunnableFile(
      this::createFaultyNvidiaSmiScript);

  GpuDiscoverer discoverer = creatediscovererWithGpuPathDefined(conf);
  assertEquals(fakeBinary.getAbsolutePath(),
      discoverer.getPathOfGpuBinary());
  assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));

  final String genericFailure =
      "Failed to execute GPU device detection script";
  final String thresholdFailure = "Failed to execute GPU device " +
      "detection script (" + fakeBinary.getAbsolutePath() + ") for 10 times";

  // Run well past the default limit of 10: every attempt must fail with the
  // ordinary execution error and none may carry the termination message.
  final int attempts = 20;
  int attempt = 0;
  while (attempt < attempts) {
    YarnException thrown = assertThrows(YarnException.class,
        discoverer::getGpuDeviceInformation);

    assertThat(thrown.getMessage())
        .contains(genericFailure)
        .doesNotContain(thresholdFailure);
    attempt++;
  }
}

@Test
public void testGetGpuDeviceInformationNvidiaSmiScriptWithInvalidXml()
throws YarnException, IOException {
Expand Down Expand Up @@ -545,4 +630,4 @@ public void testBinaryIsNotNvidiaSmi() throws YarnException {
"nvidia-smi", "badfile");
assertThat(yarnException.getMessage()).contains(format);
}
}
}