Commit c274b56

Author: Ashwin Ramesh

Added metrics/monitor stability tests (#211)

* Added metrics/monitor stability tests
* Use only 1 GPU for test

1 parent c19d6ae commit c274b56

File tree

6 files changed: +308 -3 lines changed

qa/L0_stability_metrics/check_results.py
qa/L0_stability_metrics/test.sh
qa/L0_stability_metrics/test_config_generator.py
qa/L0_stability_perf/test_config_generator.py
qa/L0_stability_result/test_config_generator.py
qa/L0_stability_steps/test_config_generator.py
qa/L0_stability_metrics/check_results.py

Lines changed: 118 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import os
import sys

import yaml


class TestOutputValidator:
    """
    Functions that validate the output
    of the test
    """

    def __init__(self, config, test_name, results_path, tolerance):
        self._config = config
        self._models = list(config['profile_models'])
        self._result_path = results_path
        self._tolerance = tolerance

        check_function = self.__getattribute__(f'check_{test_name}')

        if check_function():
            sys.exit(0)
        else:
            sys.exit(1)

    def check_metrics_stability(self):
        """
        Makes sure that the GPU metrics recorded
        for each model stay within tolerance
        across iterations
        """

        # There should be one result CSV per iteration in the results path
        pathname = os.path.join(self._result_path, 'result_*.csv')
        csv_contents = []
        for filename in glob.glob(pathname):
            with open(filename, 'r') as f:
                csv_contents.append(f.read())

        # Collect the metric values for each model from every csv
        metric_values = {}
        for csv in csv_contents:
            csv_lines = csv.split('\n')
            # Skip the header row and the trailing blank lines
            for line in csv_lines[1:-2]:
                model, _, _, gpu_memory, gpu_utilization = line.split(',')
                if model in metric_values:
                    metric_values[model]['gpu_memory'].append(
                        float(gpu_memory))
                    metric_values[model]['gpu_utilization'].append(
                        float(gpu_utilization))
                else:
                    metric_values[model] = {
                        'gpu_memory': [float(gpu_memory)],
                        'gpu_utilization': [float(gpu_utilization)]
                    }

        # Compare each metric against its value from the first iteration
        for model in metric_values:
            for metric, values in metric_values[model].items():
                start_value = values[0]
                for value in values[1:]:
                    deviation_percent = abs(
                        (value - start_value) / start_value) * 100
                    if deviation_percent > self._tolerance:
                        print(
                            f"\n***"
                            f"\n*** For model {model}, value for metric {metric}"
                            "\n*** is unstable.\n***\n"
                            f"\n***\n*** Expected: {start_value} +/- {self._tolerance * start_value / 100}."
                            f"\n*** Found: {values[1:]}.\n***")
                        return False
        return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--config-file',
                        type=str,
                        required=True,
                        help='The path to the config yaml file.')
    parser.add_argument('-r',
                        '--inference-results-path',
                        type=str,
                        required=True,
                        help='The path to the directory containing the result CSVs.')
    parser.add_argument('-t',
                        '--test-name',
                        type=str,
                        required=True,
                        help='The name of the test to be run.')
    parser.add_argument(
        '--tolerance',
        type=int,
        default=10,
        help='The percent tolerance allowed for the metrics to vary.')

    args = parser.parse_args()

    with open(args.config_file, 'r') as f:
        config = yaml.safe_load(f)

    TestOutputValidator(config, args.test_name, args.inference_results_path,
                        args.tolerance)
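
As a sanity check on the tolerance logic, here is a minimal sketch of the same deviation computation applied to hand-made gpu_memory readings; the numbers are hypothetical, and the 10 percent tolerance matches the script's default:

# Minimal sketch of the deviation check above, on hypothetical values.
# Against a first-iteration reading of 2048 MB, a reading of 1900 MB
# deviates by ~7.2% and passes, while 1700 MB deviates by ~17% and fails.
tolerance = 10
gpu_memory_readings = [2048.0, 1900.0, 1700.0]

start_value = gpu_memory_readings[0]
for value in gpu_memory_readings[1:]:
    deviation_percent = abs((value - start_value) / start_value) * 100
    print(f"{value}: deviation {deviation_percent:.1f}%, "
          f"{'OK' if deviation_percent <= tolerance else 'UNSTABLE'}")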

qa/L0_stability_metrics/test.sh

Lines changed: 106 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ANALYZER_LOG="test.log"
source ../common/util.sh

rm -f *.log

# Set test parameters
MODEL_ANALYZER="`which model-analyzer`"
REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
MODEL_REPOSITORY=${MODEL_REPOSITORY:="/mnt/dldata/inferenceserver/model_analyzer_benchmark_models"}
CHECKPOINT_REPOSITORY=${CHECKPOINT_REPOSITORY:="/mnt/dldata/inferenceserver/model_analyzer_checkpoints"}
TRITON_LAUNCH_MODE=${TRITON_LAUNCH_MODE:="local"}
CLIENT_PROTOCOL="grpc"
PORTS=(`find_available_ports 3`)
GPUS=(`get_all_gpus_uuids`)
OUTPUT_MODEL_REPOSITORY=${OUTPUT_MODEL_REPOSITORY:=`get_output_directory`}
CONFIG_FILE="config.yaml"
NUM_ITERATIONS=${NUM_ITERATIONS:=5}
BENCHMARK_MODELS="`ls ${MODEL_REPOSITORY}`"
MODEL_NAMES="$(echo $BENCHMARK_MODELS | sed 's/ /,/g')"
EXPORT_PATH="`pwd`/results"
FILENAME_SERVER_ONLY="server-metrics.csv"
FILENAME_INFERENCE_MODEL="model-metrics-inference.csv"
FILENAME_GPU_MODEL="model-metrics-gpu.csv"
CHECKPOINT_DIRECTORY="./checkpoints"
CSV_PATH='.'

# Set up the export directory
mkdir $EXPORT_PATH

# Generate test configs
python3 test_config_generator.py --models $MODEL_NAMES

RET=0

set +e

# Run the analyzer and check the results
for (( i=1; i<=$NUM_ITERATIONS; i++ )); do
    # First profile
    MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $CONFIG_FILE"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint localhost:${PORTS[0]} --triton-grpc-endpoint localhost:${PORTS[1]}"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${PORTS[2]}/metrics"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --gpus 0"
    MODEL_ANALYZER_SUBCOMMAND="profile"

    run_analyzer
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Test Failed. model-analyzer $MODEL_ANALYZER_SUBCOMMAND exited with non-zero exit code. \n***"
        cat $ANALYZER_LOG
        RET=1
    fi

    # Then generate results
    MODEL_ANALYZER_ARGS="-e $EXPORT_PATH -f $CONFIG_FILE --filename-server-only=$FILENAME_SERVER_ONLY"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --filename-model-inference=$FILENAME_INFERENCE_MODEL --filename-model-gpu=$FILENAME_GPU_MODEL"
    MODEL_ANALYZER_SUBCOMMAND="analyze"

    run_analyzer
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Test Failed. model-analyzer $MODEL_ANALYZER_SUBCOMMAND exited with non-zero exit code. \n***"
        cat $ANALYZER_LOG
        RET=1
    fi

    # Keep this iteration's GPU metrics and start the next one from a clean slate
    mv $EXPORT_PATH/results/$FILENAME_GPU_MODEL $CSV_PATH/result_${i}.csv
    rm $CHECKPOINT_DIRECTORY/*
done

# Verify that the metrics are stable across iterations
TEST_NAME='metrics_stability'
python3 check_results.py -f $CONFIG_FILE -t $TEST_NAME -r $CSV_PATH
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** Test Output Verification Failed for $TEST_NAME test.\n***"
    cat $ANALYZER_LOG
    RET=1
fi
set -e

rm -rf $EXPORT_PATH
rm -rf $OUTPUT_MODEL_REPOSITORY
rm -rf $CHECKPOINT_DIRECTORY
rm *.csv

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test PASSED\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
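
The analyze step writes one model-metrics-gpu.csv per iteration, whose columns follow the gpu_output_fields list set by the config generator below. A rough sketch of how one such row is unpacked, using a hypothetical model name and made-up values:

# Hypothetical row from a result_<i>.csv, in gpu_output_fields order:
# model_name, batch_size, concurrency, gpu_used_memory, gpu_utilization
line = "resnet50_libtorch,8,16,2048.0,43.5"

# Unpacked the same way check_results.py does; batch size and
# concurrency are discarded since only the GPU metrics are compared
model, _, _, gpu_memory, gpu_utilization = line.split(',')
print(model, float(gpu_memory), float(gpu_utilization))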
qa/L0_stability_metrics/test_config_generator.py

Lines changed: 81 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import yaml


class TestConfigGenerator:
    """
    This class contains functions that
    create configs for various test scenarios.

    The `setup` function does the work common to all tests.

    TO ADD A TEST: Simply add a member function whose name starts
    with 'generate'.
    """

    def __init__(self):
        test_functions = [
            self.__getattribute__(name)
            for name in dir(self)
            if name.startswith('generate')
        ]

        for test_function in test_functions:
            self.setup()
            test_function()

    def setup(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('-m',
                            '--models',
                            type=str,
                            required=True,
                            help='The models used for this test')

        self.args = parser.parse_args()
        self.models = sorted(self.args.models.split(','))

        self.config = {}

        # Profile config
        self.config['run_config_search_disable'] = True
        self.config['concurrency'] = 16
        self.config['batch-size'] = 8
        self.config['profile_models'] = self.models

        # Analyze config
        self.config['summarize'] = False
        self.config['collect_cpu_metrics'] = True
        self.config['gpu_output_fields'] = [
            'model_name', 'batch_size', 'concurrency', 'gpu_used_memory',
            'gpu_utilization'
        ]
        self.config['analysis_models'] = {}
        for model in self.models:
            self.config['analysis_models'][model] = {
                'objectives': {
                    'perf_throughput': 10
                }
            }

    def generate_configs(self):
        with open('config.yaml', 'w+') as f:
            yaml.dump(self.config, f)


if __name__ == '__main__':
    TestConfigGenerator()
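
For illustration, this sketch reproduces the config that generate_configs would dump for two hypothetical models, model_a and model_b (yaml.dump sorts keys alphabetically by default, so the emitted file's key order will differ from the dict below):

# Sketch of the config.yaml the generator above would emit for two
# hypothetical models, model_a and model_b
import yaml

config = {
    'run_config_search_disable': True,
    'concurrency': 16,
    'batch-size': 8,
    'profile_models': ['model_a', 'model_b'],
    'summarize': False,
    'collect_cpu_metrics': True,
    'gpu_output_fields': [
        'model_name', 'batch_size', 'concurrency', 'gpu_used_memory',
        'gpu_utilization'
    ],
    'analysis_models': {
        'model_a': {'objectives': {'perf_throughput': 10}},
        'model_b': {'objectives': {'perf_throughput': 10}},
    },
}
print(yaml.dump(config))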

qa/L0_stability_perf/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--profile-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')
         parser.add_argument(
             '-r',
             '--request-count',

qa/L0_stability_result/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--analysis-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')

         self.args = parser.parse_args()
         self.config = {}

qa/L0_stability_steps/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--profile-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')

         self.args = parser.parse_args()
         self.profile_models = sorted(self.args.profile_models.split(','))
