Skip to content

Commit 57daf24

Browse files
deep1712sasankagrs-workDEVAID-MSFT
authored
Adding MLPerf Training (#179)
* added basics of parser and executor * Added -server in CUDAInstallation * Changed profile * Added Profile parameters * resolved comments on profile and executor * resolved comments on Executor * Resolved comments and removed unnecessary definitions * Removed redundant lines * Adding MLPerfTraining Executor * Adding Ml Perf Training * Adding Unit tests fixes * Adding Mount Disks Documentation * Adding Mount Disks Documentation * Adding Mount Disks Documentation * Adding PERF-GPU-MLPERF-TRAINING-NVIDIA * Adding PERF-GPU-MLPERF-TRAINING-NVIDIA * Adding documentation * Adding static method prefix * Fixing merge conflicts * Adding merge conflict fixes * Adding static method prefix * Fixing code for extensions * Fixing unit tests * Fixing comments * Adding ML Perf profiles doc * Moving changes of Contracts to Main * Fixing Contracts folder * Fixing Contracts * Fixing Contracts unit tests * Fixing contracts unit tests * Fixing Parsers regex * Adding unit tests for MountDisks * Minimizing the size of parser for mlperf training * Adding MLPerf Docs * Adding Bert Preprocessing data steps * Adding Bert Preprocessing documentation * Fixing unit test * Adding source link in doc * fixing merge conflict * Fixing Main * Fixing main --------- Co-authored-by: sasankagrs-work <[email protected]> Co-authored-by: deepanshu <[email protected]>
1 parent 1cf4fcd commit 57daf24

File tree

16 files changed

+5837
-13
lines changed

16 files changed

+5837
-13
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
+ : mlperf-nvidia:language_model
2+
+ : DGXA100_1x8x56x1
3+
+ : 1
4+
++ date +%y%m%d%H%M%S%N
5+
+ : 230306191015611751926
6+
++ pwd
7+
+ : /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results
8+
+ : constants.BERT
9+
+ : 5
10+
+ : ./config_DGXA100_1x8x56x1.sh
11+
+ : /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results/230306191015611751926
12+
+ : language_model
13+
+ : 0,1,2,3,4,5,6,7
14+
+ readonly docker_image=mlperf-nvidia:language_model
15+
+ docker_image=mlperf-nvidia:language_model
16+
+++ func_update_file_path_for_ci mounts.txt /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/language_model/pytorch
17+
+++ declare new_path
18+
+++ '[' -f mounts.txt ']'
19+
+++ new_path=mounts.txt
20+
+++ '[' '!' -f mounts.txt ']'
21+
+++ echo mounts.txt
22+
++ func_get_container_mounts mounts.txt
23+
+++ envsubst
24+
++++ sed '/^$/d' mounts.txt
25+
++++ sed '/^#/d'
26+
++++ sed 's/^[ ]*\(.*\)[ ]*/--volume=\1 /'
27+
++++ tr '\n' ' '
28+
++ echo --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data_phase2 --volume=/datadrive/bert/phase1:/workspace/phase1 --volume=/datadrive/bert/hdf5/eval_varlength:/workspace/evaldata --volume=/lustre/fsw/mlperf/mlperft-bert/unit_test:/workspace/unit_test_data
29+
+ CONT_MOUNTS='--volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data_phase2 --volume=/datadrive/bert/phase1:/workspace/phase1 --volume=/datadrive/bert/hdf5/eval_varlength:/workspace/evaldata --volume=/lustre/fsw/mlperf/mlperft-bert/unit_test:/workspace/unit_test_data'
30+
+ mkdir -p /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results
31+
+ mapfile -t _config_env
32+
++ env -i bash -c '. ./config_DGXA100_1x8x56x1.sh && compgen -e'
33+
++ grep -E -v '^(PWD|SHLVL)'
34+
+ _config_env+=(SEED)
35+
+ mapfile -t _config_env
36+
++ for v in "${_config_env[@]}"
37+
++ echo --env=BATCHSIZE
38+
++ for v in "${_config_env[@]}"
39+
++ echo --env=CHECKPOINTDIR
40+
++ for v in "${_config_env[@]}"
41+
++ echo --env=CHECKPOINTDIR_PHASE1
42+
++ for v in "${_config_env[@]}"
43+
++ echo --env=DATADIR
44+
++ for v in "${_config_env[@]}"
45+
++ echo --env=DATADIR_PHASE2
46+
++ for v in "${_config_env[@]}"
47+
++ echo --env=DGXHT
48+
++ for v in "${_config_env[@]}"
49+
++ echo --env=DGXNGPU
50+
++ for v in "${_config_env[@]}"
51+
++ echo --env=DGXNNODES
52+
++ for v in "${_config_env[@]}"
53+
++ echo --env=DGXNSOCKET
54+
++ for v in "${_config_env[@]}"
55+
++ echo --env=DGXSOCKETCORES
56+
++ for v in "${_config_env[@]}"
57+
++ echo --env=DGXSYSTEM
58+
++ for v in "${_config_env[@]}"
59+
++ echo --env=EVALDIR
60+
++ for v in "${_config_env[@]}"
61+
++ echo --env=EVAL_ITER_SAMPLES
62+
++ for v in "${_config_env[@]}"
63+
++ echo --env=EVAL_ITER_START_SAMPLES
64+
++ for v in "${_config_env[@]}"
65+
++ echo --env=EXTRA_PARAMS
66+
++ for v in "${_config_env[@]}"
67+
++ echo --env=GRADIENT_STEPS
68+
++ for v in "${_config_env[@]}"
69+
++ echo --env=LR
70+
++ for v in "${_config_env[@]}"
71+
++ echo --env=MAX_SAMPLES_TERMINATION
72+
++ for v in "${_config_env[@]}"
73+
++ echo --env=MAX_STEPS
74+
++ for v in "${_config_env[@]}"
75+
++ echo --env=OPT_LAMB_BETA_1
76+
++ for v in "${_config_env[@]}"
77+
++ echo --env=OPT_LAMB_BETA_2
78+
++ for v in "${_config_env[@]}"
79+
++ echo --env=PHASE
80+
++ for v in "${_config_env[@]}"
81+
++ echo --env=RESULTSDIR
82+
++ for v in "${_config_env[@]}"
83+
++ echo --env=SLURM_NTASKS
84+
++ for v in "${_config_env[@]}"
85+
++ echo --env=START_WARMUP_STEP
86+
++ for v in "${_config_env[@]}"
87+
++ echo --env=UNITTESTDIR
88+
++ for v in "${_config_env[@]}"
89+
++ echo --env=WALLTIME
90+
++ for v in "${_config_env[@]}"
91+
++ echo --env=WARMUP_PROPORTION
92+
++ for v in "${_config_env[@]}"
93+
++ echo --env=WEIGHT_DECAY_RATE
94+
++ for v in "${_config_env[@]}"
95+
++ echo --env=SEED
96+
+ cleanup_docker
97+
+ docker container rm -f language_model
98+
Error response from daemon: No such container

src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_real_output.txt

Lines changed: 3006 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"namespace": "", "time_ms": 1652749508549, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"namespace": "", "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}

src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms_multiple.txt

Lines changed: 1595 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
namespace VirtualClient.Actions
5+
{
6+
using System;
7+
using System.Collections.Generic;
8+
using System.Diagnostics;
9+
using System.IO;
10+
using System.Linq;
11+
using System.Reflection;
12+
using System.Threading;
13+
using System.Threading.Tasks;
14+
using Microsoft.Extensions.DependencyInjection;
15+
using Moq;
16+
using Newtonsoft.Json;
17+
using Newtonsoft.Json.Linq;
18+
using NUnit.Framework;
19+
using Polly;
20+
using VirtualClient.Common;
21+
using VirtualClient.Common.Contracts;
22+
using VirtualClient.Common.Telemetry;
23+
using VirtualClient.Contracts;
24+
25+
/// <summary>
/// Unit tests that verify the exact command lines issued by the MLPerf Training
/// executor during initialization and execution, using mocked process creation.
/// </summary>
[TestFixture]
[Category("Unit")]
public class MLPerfTrainingExecutorTests
{
    private DependencyFixture mockFixture;
    private DependencyPath mockPackage;
    private IEnumerable<Disk> disks;
    private string output;
    private List<string> commandsExecuted = new List<string>();

    /// <summary>
    /// Rebuilds the mock fixture and command log before every test.
    /// </summary>
    [SetUp]
    public void SetupTests()
    {
        // SetupDefaultMockBehavior creates the fixture itself, so it is not created here as well.
        this.SetupDefaultMockBehavior(PlatformID.Unix);
    }

    /// <summary>
    /// Confirms that initialization alone issues exactly the expected setup commands, in order.
    /// </summary>
    [Test]
    public async Task MLPerfTrainingExecutorInitializesWorkloadAsExpected()
    {
        IEnumerable<string> expectedCommands = MLPerfTrainingExecutorTests.GetExpectedInitializationCommands();

        using (TestMLPerfTrainingExecutor executor = new TestMLPerfTrainingExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
        {
            await executor.InitializeAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
        }

        CollectionAssert.AreEqual(expectedCommands.ToArray(), this.commandsExecuted);
    }

    /// <summary>
    /// Confirms that initialization followed by execution issues the setup commands
    /// and then the benchmark run command, in order.
    /// </summary>
    [Test]
    public async Task MLPerfTrainingExecutorExecutesAsExpected()
    {
        IEnumerable<string> expectedCommands = this.GetExpectedCommands();

        using (TestMLPerfTrainingExecutor executor = new TestMLPerfTrainingExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
        {
            await executor.InitializeAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
            await executor.ExecuteAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
        }

        CollectionAssert.AreEqual(expectedCommands.ToArray(), this.commandsExecuted);
    }

    /// <summary>
    /// Creates a fresh fixture for the given platform, registers disks, the workload
    /// package and the executor parameters, loads the example output and installs a
    /// process factory that records every command line.
    /// </summary>
    /// <param name="platformID">The platform the mock fixture and disks are set up for.</param>
    private void SetupDefaultMockBehavior(PlatformID platformID)
    {
        this.commandsExecuted = new List<string>();
        this.mockFixture = new DependencyFixture();
        this.mockFixture.Setup(platformID);
        this.mockPackage = new DependencyPath("MLPerfTraining", this.mockFixture.PlatformSpecifics.GetPackagePath("mlperf"));

        // Use the requested platform rather than a hard-coded one so the disks match the fixture.
        this.disks = this.mockFixture.CreateDisks(platformID, true);
        this.mockFixture.DiskManager.AddRange(this.disks);

        // NOTE(review): the expected file is a Windows diskspd path although the tests run
        // against a Unix fixture; it looks like a copy/paste leftover - confirm before changing.
        this.mockFixture.SetupWorkloadPackage("mlperftraining", expectedFiles: @"win-x64\diskspd.exe");

        this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
        {
            { nameof(MLPerfTrainingExecutor.Username), "anyuser" },
            { nameof(MLPerfTrainingExecutor.Model), "bert" },
            { nameof(MLPerfTrainingExecutor.BatchSize), "45" },
            { nameof(MLPerfTrainingExecutor.Implementation), "pytorch-22.09" },
            { nameof(MLPerfTrainingExecutor.ContainerName), "language_model" },
            { nameof(MLPerfTrainingExecutor.DataPath), "mlperf-training-data-bert.1.0.0" },
            { nameof(MLPerfTrainingExecutor.GPUCount), "8" },
            { nameof(MLPerfTrainingExecutor.Scenario), "training-mlperf-bert-batchsize-45-gpu-8" },
            { nameof(MLPerfTrainingExecutor.ConfigFile), "config_DGXA100_1x8x56x1.sh" },
            { nameof(MLPerfTrainingExecutor.PackageName), "mlperftraining" }
        };

        // Build the path from separate segments: a literal backslash is not a directory
        // separator on Unix, so an "Examples\MLPerfTraining\..." literal only resolves on Windows.
        string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
        string outputPath = Path.Combine(workingDirectory, "Examples", "MLPerfTraining", "Example_bert_real_output.txt");
        this.output = File.ReadAllText(outputPath);

        this.mockFixture.ProcessManager.OnCreateProcess = (command, arguments, workingDir) =>
        {
            IProcessProxy process = this.mockFixture.CreateProcess(command, arguments, workingDir);

            // Record the full command line so the tests can assert on the exact sequence.
            this.commandsExecuted.Add($"{command} {arguments}".Trim());

            // Every mocked process reports the example output as its standard output.
            process.StandardOutput.Append(this.output);

            return process;
        };
    }

    /// <summary>
    /// Returns the commands expected from initialization alone.
    /// </summary>
    private static List<string> GetExpectedInitializationCommands()
    {
        return new List<string>
        {
            "sudo usermod -aG docker anyuser",
            "sudo docker build --pull -t mlperf-training-anyuser-x86_64:language_model .",
            "sudo docker run --runtime=nvidia mlperf-training-anyuser-x86_64:language_model"
        };
    }

    /// <summary>
    /// Returns the full command sequence expected from initialization followed by execution.
    /// </summary>
    private IEnumerable<string> GetExpectedCommands()
    {
        List<string> commands = MLPerfTrainingExecutorTests.GetExpectedInitializationCommands();
        commands.Add(
            "sudo su -c \"source config_DGXA100_1x8x56x1.sh; env BATCHSIZE=45 DGXNGPU=8 CUDA_VISIBLE_DEVICES=\"0,1,2,3,4,5,6,7\" CONT=mlperf-training-anyuser-x86_64:language_model DATADIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/training-4320 DATADIR_PHASE2=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/training-4320 EVALDIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/eval_varlength CHECKPOINTDIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/phase1 CHECKPOINTDIR_PHASE1=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/phase1 ./run_with_docker.sh\"");

        return commands;
    }

    /// <summary>
    /// Test subclass offering public pass-throughs to the base executor's
    /// ExecuteAsync, InitializeAsync and GetContainerName implementations.
    /// </summary>
    protected class TestMLPerfTrainingExecutor : MLPerfTrainingExecutor
    {
        public TestMLPerfTrainingExecutor(IServiceCollection dependencies, IDictionary<string, IConvertible> parameters)
            : base(dependencies, parameters)
        {
        }

        public new Task ExecuteAsync(EventContext context, CancellationToken cancellationToken)
        {
            return base.ExecuteAsync(context, cancellationToken);
        }

        public new Task InitializeAsync(EventContext context, CancellationToken cancellationToken)
        {
            return base.InitializeAsync(context, cancellationToken);
        }

        public new string GetContainerName()
        {
            return base.GetContainerName();
        }
    }
}
150+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using System.Collections.Generic;
2+
using System.IO;
3+
using System.Linq;
4+
using System.Reflection;
5+
using NUnit.Framework;
6+
using VirtualClient.Contracts;
7+
8+
// Copyright (c) Microsoft Corporation.
9+
// Licensed under the MIT License.
10+
11+
namespace VirtualClient.Actions
12+
{
13+
/// <summary>
/// Unit tests for the MLPerf Training metrics parser, driven by example output
/// files located under Examples/MLPerfTraining next to the test assembly.
/// </summary>
[TestFixture]
[Category("Unit")]
public class MLPerfTrainingMetricsParserTests
{
    private string rawText;
    private MLPerfTrainingMetricsParser testParser;

    /// <summary>
    /// Directory that holds the MLPerf Training example files, resolved relative
    /// to the location of the executing assembly.
    /// </summary>
    private string ExamplePath
    {
        get
        {
            string assemblyDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            return Path.Combine(assemblyDirectory, "Examples", "MLPerfTraining");
        }
    }

    /// <summary>
    /// Parses a well-formed example file and checks the metric count and each expected metric.
    /// </summary>
    [Test]
    [TestCase("Example_bert_real_output.txt")]
    public void MLPerfTrainingMetricsParserParsesCorrectMetricsFromRawText(string exampleFile)
    {
        this.LoadParser(exampleFile);

        IList<Metric> parsedMetrics = this.testParser.Parse();

        Assert.AreEqual(5, parsedMetrics.Count);
        MetricAssert.Exists(parsedMetrics, "eval_mlm_accuracy", 0.71472860574722286);
        MetricAssert.Exists(parsedMetrics, "e2e_time", 596.53150777816768, "s");
        MetricAssert.Exists(parsedMetrics, "training_sequences_per_second", 1855.6555448898107);
        MetricAssert.Exists(parsedMetrics, "final_loss", 0);
        MetricAssert.Exists(parsedMetrics, "raw_train_time", 577.98435058593748, "s");
    }

    /// <summary>
    /// Parses a malformed example file and checks that a SchemaException with the
    /// expected message text is thrown.
    /// </summary>
    [Test]
    [TestCase("Example_bert_incorrect_output.txt")]
    public void MLPerfTrainingMetricsParserThrowsOnIncorrectRawText(string exampleFile)
    {
        this.LoadParser(exampleFile);

        SchemaException error = Assert.Throws<SchemaException>(() => this.testParser.Parse());

        StringAssert.Contains("The MlPerf Training output file has incorrect format for parsing", error.Message);
    }

    /// <summary>
    /// Reads the named example file into <c>rawText</c> and creates the parser under test from it.
    /// </summary>
    /// <param name="exampleFile">File name inside the example directory.</param>
    private void LoadParser(string exampleFile)
    {
        string examplePath = Path.Combine(this.ExamplePath, exampleFile);

        this.rawText = File.ReadAllText(examplePath);
        this.testParser = new MLPerfTrainingMetricsParser(this.rawText);
    }
}
57+
}

0 commit comments

Comments
 (0)