microsoft
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_incorrect_output.txt
Lines changed: 98 additions & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_incorrect_output.txt
Lines changed: 98 additions & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_real_output.txt
Lines changed: 3006 additions & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_real_output.txt
Lines changed: 3006 additions & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms.txt
Lines changed: 1 addition & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms_err.txt
Lines changed: 1 addition & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms_err.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms_multiple.txt
Lines changed: 1595 additions & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/Examples/MLPerfTraining/Example_bert_time_ms_multiple.txt
Lines changed: 1595 additions & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/MLPerfTraining/MLPerfTrainingExecutorTests.cs
Lines changed: 150 additions & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/MLPerfTraining/MLPerfTrainingExecutorTests.cs
Lines changed: 150 additions & 0 deletions
diff --git a/‎src/VirtualClient/VirtualClient.Actions.UnitTests/MLPerfTraining/MLPerfTrainingMetricsParserTests.cs
Lines changed: 57 additions & 0 deletions b/‎src/VirtualClient/VirtualClient.Actions.UnitTests/MLPerfTraining/MLPerfTrainingMetricsParserTests.cs
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,98 @@
++ : mlperf-nvidia:language_model
++ : DGXA100_1x8x56x1
++ : 1
+++ date +%y%m%d%H%M%S%N
++ : 230306191015611751926
+++ pwd
++ : /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results
++ : constants.BERT
++ : 5
++ : ./config_DGXA100_1x8x56x1.sh
++ : /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results/230306191015611751926
++ : language_model
++ : 0,1,2,3,4,5,6,7
++ readonly docker_image=mlperf-nvidia:language_model
++ docker_image=mlperf-nvidia:language_model
++++ func_update_file_path_for_ci mounts.txt /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/language_model/pytorch
++++ declare new_path
++++ '[' -f mounts.txt ']'
++++ new_path=mounts.txt
++++ '[' '!' -f mounts.txt ']'
++++ echo mounts.txt
+++ func_get_container_mounts mounts.txt
++++ envsubst
+++++ sed '/^$/d' mounts.txt
+++++ sed '/^#/d'
+++++ sed 's/^[ ]*\(.*\)[ ]*/--volume=\1 /'
+++++ tr '\n' ' '
+++ echo --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data_phase2 --volume=/datadrive/bert/phase1:/workspace/phase1 --volume=/datadrive/bert/hdf5/eval_varlength:/workspace/evaldata --volume=/lustre/fsw/mlperf/mlperft-bert/unit_test:/workspace/unit_test_data
++ CONT_MOUNTS='--volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data --volume=/datadrive/bert/hdf5/training-4320/hdf5_4320_shards_varlength:/workspace/data_phase2 --volume=/datadrive/bert/phase1:/workspace/phase1 --volume=/datadrive/bert/hdf5/eval_varlength:/workspace/evaldata --volume=/lustre/fsw/mlperf/mlperft-bert/unit_test:/workspace/unit_test_data'
++ mkdir -p /home/azureuser/mlperf/training_results_v2.0/NVIDIA/benchmarks/bert/implementations/pytorch/results
++ mapfile -t _config_env
+++ env -i bash -c '. ./config_DGXA100_1x8x56x1.sh && compgen -e'
+++ grep -E -v '^(PWD|SHLVL)'
++ _config_env+=(SEED)
++ mapfile -t _config_env
+++ for v in "${_config_env[@]}"
+++ echo --env=BATCHSIZE
+++ for v in "${_config_env[@]}"
+++ echo --env=CHECKPOINTDIR
+++ for v in "${_config_env[@]}"
+++ echo --env=CHECKPOINTDIR_PHASE1
+++ for v in "${_config_env[@]}"
+++ echo --env=DATADIR
+++ for v in "${_config_env[@]}"
+++ echo --env=DATADIR_PHASE2
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXHT
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXNGPU
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXNNODES
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXNSOCKET
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXSOCKETCORES
+++ for v in "${_config_env[@]}"
+++ echo --env=DGXSYSTEM
+++ for v in "${_config_env[@]}"
+++ echo --env=EVALDIR
+++ for v in "${_config_env[@]}"
+++ echo --env=EVAL_ITER_SAMPLES
+++ for v in "${_config_env[@]}"
+++ echo --env=EVAL_ITER_START_SAMPLES
+++ for v in "${_config_env[@]}"
+++ echo --env=EXTRA_PARAMS
+++ for v in "${_config_env[@]}"
+++ echo --env=GRADIENT_STEPS
+++ for v in "${_config_env[@]}"
+++ echo --env=LR
+++ for v in "${_config_env[@]}"
+++ echo --env=MAX_SAMPLES_TERMINATION
+++ for v in "${_config_env[@]}"
+++ echo --env=MAX_STEPS
+++ for v in "${_config_env[@]}"
+++ echo --env=OPT_LAMB_BETA_1
+++ for v in "${_config_env[@]}"
+++ echo --env=OPT_LAMB_BETA_2
+++ for v in "${_config_env[@]}"
+++ echo --env=PHASE
+++ for v in "${_config_env[@]}"
+++ echo --env=RESULTSDIR
+++ for v in "${_config_env[@]}"
+++ echo --env=SLURM_NTASKS
+++ for v in "${_config_env[@]}"
+++ echo --env=START_WARMUP_STEP
+++ for v in "${_config_env[@]}"
+++ echo --env=UNITTESTDIR
+++ for v in "${_config_env[@]}"
+++ echo --env=WALLTIME
+++ for v in "${_config_env[@]}"
+++ echo --env=WARMUP_PROPORTION
+++ for v in "${_config_env[@]}"
+++ echo --env=WEIGHT_DECAY_RATE
+++ for v in "${_config_env[@]}"
+++ echo --env=SEED
++ cleanup_docker
++ docker container rm -f language_model
+Error response from daemon: No such container
@@ -0,0 +1 @@
+{"namespace": "", "time_ms": 1652749508549, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}
@@ -0,0 +1 @@
+{"namespace": "", "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}
@@ -0,0 +1,150 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace VirtualClient.Actions
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using System.IO;
+    using System.Linq;
+    using System.Reflection;
+    using System.Threading;
+    using System.Threading.Tasks;
+    using Microsoft.Extensions.DependencyInjection;
+    using Moq;
+    using Newtonsoft.Json;
+    using Newtonsoft.Json.Linq;
+    using NUnit.Framework;
+    using Polly;
+    using VirtualClient.Common;
+    using VirtualClient.Common.Contracts;
+    using VirtualClient.Common.Telemetry;
+    using VirtualClient.Contracts;
+
+    /// <summary>
+    /// Unit tests for <see cref="MLPerfTrainingExecutor"/>. Process creation is mocked so that
+    /// the tests can verify the exact command lines the executor issues, in order.
+    /// </summary>
+    [TestFixture]
+    [Category("Unit")]
+    public class MLPerfTrainingExecutorTests
+    {
+        private DependencyFixture mockFixture;
+        private DependencyPath mockPackage;
+        private IEnumerable<Disk> disks;
+        private string output;
+        private List<string> commandsExecuted = new List<string>();
+
+        /// <summary>
+        /// Resets the fixture, parameters and recorded commands before every test.
+        /// </summary>
+        [SetUp]
+        public void SetupTests()
+        {
+            // The fixture instance is created inside SetupDefaultMockBehavior; creating
+            // another one here would only be thrown away.
+            this.SetupDefaultMockBehavior(PlatformID.Unix);
+        }
+
+        /// <summary>
+        /// Initialization alone must issue the docker group, build and run commands.
+        /// </summary>
+        [Test]
+        public async Task MLPerfTrainingExecutorInitializesWorkloadAsExpected()
+        {
+            List<string> expectedCommands = new List<string>
+            {
+                "sudo usermod -aG docker anyuser",
+                "sudo docker build --pull -t mlperf-training-anyuser-x86_64:language_model .",
+                "sudo docker run --runtime=nvidia mlperf-training-anyuser-x86_64:language_model"
+            };
+
+            using (TestMLPerfTrainingExecutor executor = new TestMLPerfTrainingExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await executor.InitializeAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
+            }
+
+            CollectionAssert.AreEqual(expectedCommands, this.commandsExecuted);
+        }
+
+        /// <summary>
+        /// Initialization followed by execution must issue the full expected command sequence.
+        /// </summary>
+        [Test]
+        public async Task MLPerfTrainingExecutorExecutesAsExpected()
+        {
+            IEnumerable<string> expectedCommands = this.GetExpectedCommands();
+
+            using (TestMLPerfTrainingExecutor executor = new TestMLPerfTrainingExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await executor.InitializeAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
+                await executor.ExecuteAsync(EventContext.None, CancellationToken.None).ConfigureAwait(false);
+            }
+
+            CollectionAssert.AreEqual(expectedCommands.ToArray(), this.commandsExecuted);
+        }
+
+        /// <summary>
+        /// Builds a fresh fixture for the given platform, registers disks and the workload
+        /// package, sets the executor parameters and installs a process factory that records
+        /// every command line and feeds the example output to standard output.
+        /// </summary>
+        /// <param name="platformID">The platform the mocked environment should represent.</param>
+        private void SetupDefaultMockBehavior(PlatformID platformID)
+        {
+            this.commandsExecuted = new List<string>();
+            this.mockFixture = new DependencyFixture();
+            this.mockFixture.Setup(platformID);
+            this.mockPackage = new DependencyPath("MLPerfTraining", this.mockFixture.PlatformSpecifics.GetPackagePath("mlperf"));
+
+            // Create the disks for the platform that was requested rather than a fixed one.
+            this.disks = this.mockFixture.CreateDisks(platformID, true);
+            this.mockFixture.DiskManager.AddRange(this.disks);
+
+            // NOTE(review): the expected file is a Windows diskspd path, which looks like a
+            // leftover from another test fixture - confirm whether any file is required here.
+            this.mockFixture.SetupWorkloadPackage("mlperftraining", expectedFiles: @"win-x64\diskspd.exe");
+
+            this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
+            {
+                { nameof(MLPerfTrainingExecutor.Username), "anyuser" },
+                { nameof(MLPerfTrainingExecutor.Model), "bert" },
+                { nameof(MLPerfTrainingExecutor.BatchSize), "45" },
+                { nameof(MLPerfTrainingExecutor.Implementation), "pytorch-22.09" },
+                { nameof(MLPerfTrainingExecutor.ContainerName), "language_model" },
+                { nameof(MLPerfTrainingExecutor.DataPath), "mlperf-training-data-bert.1.0.0" },
+                { nameof(MLPerfTrainingExecutor.GPUCount), "8" },
+                { nameof(MLPerfTrainingExecutor.Scenario), "training-mlperf-bert-batchsize-45-gpu-8" },
+                { nameof(MLPerfTrainingExecutor.ConfigFile), "config_DGXA100_1x8x56x1.sh" },
+                { nameof(MLPerfTrainingExecutor.PackageName), "mlperftraining" }
+            };
+
+            // Pass each path segment separately so the separator matches the OS running the
+            // tests; a literal '\' is not a directory separator on Linux/macOS.
+            string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
+            string outputPath = Path.Combine(workingDirectory, "Examples", "MLPerfTraining", "Example_bert_real_output.txt");
+            this.output = File.ReadAllText(outputPath);
+
+            this.mockFixture.ProcessManager.OnCreateProcess = (command, arguments, workingDir) =>
+            {
+                IProcessProxy process = this.mockFixture.CreateProcess(command, arguments, workingDir);
+                this.commandsExecuted.Add($"{command} {arguments}".Trim());
+                process.StandardOutput.Append(this.output);
+
+                return process;
+            };
+        }
+
+        /// <summary>
+        /// Returns the command lines expected from initialization plus execution, in order.
+        /// </summary>
+        private IEnumerable<string> GetExpectedCommands()
+        {
+            return new List<string>
+            {
+                "sudo usermod -aG docker anyuser",
+                "sudo docker build --pull -t mlperf-training-anyuser-x86_64:language_model .",
+                "sudo docker run --runtime=nvidia mlperf-training-anyuser-x86_64:language_model",
+                "sudo su -c \"source config_DGXA100_1x8x56x1.sh; env BATCHSIZE=45 DGXNGPU=8 CUDA_VISIBLE_DEVICES=\"0,1,2,3,4,5,6,7\" CONT=mlperf-training-anyuser-x86_64:language_model DATADIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/training-4320 DATADIR_PHASE2=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/training-4320 EVALDIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/hdf5/eval_varlength CHECKPOINTDIR=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/phase1 CHECKPOINTDIR_PHASE1=/mlperftraining0/mlperf-training-data-bert.1.0.0/mlperf-training-package/phase1 ./run_with_docker.sh\""
+            };
+        }
+
+        /// <summary>
+        /// Test subclass that re-exposes members of <see cref="MLPerfTrainingExecutor"/>
+        /// publicly so the tests can call them directly.
+        /// </summary>
+        protected class TestMLPerfTrainingExecutor : MLPerfTrainingExecutor
+        {
+            public TestMLPerfTrainingExecutor(IServiceCollection dependencies, IDictionary<string, IConvertible> parameters)
+                : base(dependencies, parameters)
+            {
+            }
+
+            public new Task ExecuteAsync(EventContext context, CancellationToken cancellationToken)
+            {
+                return base.ExecuteAsync(context, cancellationToken);
+            }
+
+            public new Task InitializeAsync(EventContext context, CancellationToken cancellationToken)
+            {
+                return base.InitializeAsync(context, cancellationToken);
+            }
+
+            public new string GetContainerName()
+            {
+                return base.GetContainerName();
+            }
+        }
+    }
+}
@@ -0,0 +1,57 @@
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Reflection;
+using NUnit.Framework;
+using VirtualClient.Contracts;
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace VirtualClient.Actions
+{
+    /// <summary>
+    /// Unit tests for <see cref="MLPerfTrainingMetricsParser"/> driven by the example
+    /// output files that are copied next to the test assembly.
+    /// </summary>
+    [TestFixture]
+    [Category("Unit")]
+    public class MLPerfTrainingMetricsParserTests
+    {
+        private string rawText;
+        private MLPerfTrainingMetricsParser testParser;
+
+        /// <summary>
+        /// Directory holding the MLPerf Training example files, relative to the test assembly.
+        /// </summary>
+        private string ExamplePath =>
+            Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "Examples", "MLPerfTraining");
+
+        /// <summary>
+        /// A real BERT run must yield exactly the five expected metrics.
+        /// </summary>
+        [Test]
+        [TestCase("Example_bert_real_output.txt")]
+        public void MLPerfTrainingMetricsParserParsesCorrectMetricsFromRawText(string exampleFile)
+        {
+            this.LoadParser(exampleFile);
+
+            IList<Metric> parsed = this.testParser.Parse();
+
+            Assert.AreEqual(5, parsed.Count);
+            MetricAssert.Exists(parsed, "eval_mlm_accuracy", 0.71472860574722286);
+            MetricAssert.Exists(parsed, "e2e_time", 596.53150777816768, "s");
+            MetricAssert.Exists(parsed, "training_sequences_per_second", 1855.6555448898107);
+            MetricAssert.Exists(parsed, "final_loss", 0);
+            MetricAssert.Exists(parsed, "raw_train_time", 577.98435058593748, "s");
+        }
+
+        /// <summary>
+        /// Output without the expected results must be rejected with a schema error.
+        /// </summary>
+        [Test]
+        [TestCase("Example_bert_incorrect_output.txt")]
+        public void MLPerfTrainingMetricsParserThrowsOnIncorrectRawText(string exampleFile)
+        {
+            this.LoadParser(exampleFile);
+
+            SchemaException error = Assert.Throws<SchemaException>(() => this.testParser.Parse());
+
+            StringAssert.Contains("The MlPerf Training output file has incorrect format for parsing", error.Message);
+        }
+
+        /// <summary>
+        /// Reads the named example file and creates the parser under test from its text.
+        /// </summary>
+        /// <param name="exampleFile">File name inside the example directory.</param>
+        private void LoadParser(string exampleFile)
+        {
+            string examplePath = Path.Combine(this.ExamplePath, exampleFile);
+
+            this.rawText = File.ReadAllText(examplePath);
+            this.testParser = new MLPerfTrainingMetricsParser(this.rawText);
+        }
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"namespace": "", "time_ms": 1652749508549, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"namespace": "", "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "<string>", "lineno": 3}}`