Commit c274b56

Author: Ashwin Ramesh

Added metrics/monitor stability tests (#211)

* Added metrics/monitor stability tests
* Use only 1 GPU for test

1 parent c19d6ae commit c274b56

File tree

6 files changed: +308 -3 lines changed

qa/L0_stability_metrics/check_results.py
qa/L0_stability_metrics/test.sh
qa/L0_stability_metrics/test_config_generator.py
qa/L0_stability_perf/test_config_generator.py
qa/L0_stability_result/test_config_generator.py
qa/L0_stability_steps/test_config_generator.py
qa/L0_stability_metrics/check_results.py

Lines changed: 118 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import os
import sys

import yaml


class TestOutputValidator:
    """
    Functions that validate the output
    of the test
    """

    def __init__(self, config, test_name, results_path, tolerance):
        self._config = config
        self._models = list(config['profile_models'])
        self._result_path = results_path
        self._tolerance = tolerance

        check_function = self.__getattribute__(f'check_{test_name}')

        if check_function():
            sys.exit(0)
        else:
            sys.exit(1)

    def check_metrics_stability(self):
        """
        Makes sure that the GPU metrics recorded
        for each model stay within tolerance
        across iterations
        """

        # There should be one result CSV per iteration in the results path
        pathname = os.path.join(self._result_path, 'result_*.csv')
        csv_contents = []
        for filename in glob.glob(pathname):
            with open(filename, 'r') as f:
                csv_contents.append(f.read())

        # Collect the metric values for each model from every csv
        metric_values = {}
        for csv in csv_contents:
            csv_lines = csv.split('\n')
            # Skip the header row and the trailing blank lines
            for line in csv_lines[1:-2]:
                model, _, _, gpu_memory, gpu_utilization = line.split(',')
                if model in metric_values:
                    metric_values[model]['gpu_memory'].append(
                        float(gpu_memory))
                    metric_values[model]['gpu_utilization'].append(
                        float(gpu_utilization))
                else:
                    metric_values[model] = {
                        'gpu_memory': [float(gpu_memory)],
                        'gpu_utilization': [float(gpu_utilization)]
                    }

        # Compare each metric against its value from the first iteration
        for model in metric_values:
            for metric, values in metric_values[model].items():
                start_value = values[0]
                for value in values[1:]:
                    deviation_percent = abs(
                        (value - start_value) / start_value) * 100
                    if deviation_percent > self._tolerance:
                        print(
                            f"\n***"
                            f"\n*** For model {model}, value for metric {metric}"
                            "\n*** is unstable.\n***\n"
                            f"\n***\n*** Expected: {start_value} +/- {self._tolerance * start_value / 100}."
                            f"\n*** Found: {values[1:]}.\n***")
                        return False
        return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--config-file',
                        type=str,
                        required=True,
                        help='The path to the config yaml file.')
    parser.add_argument('-r',
                        '--inference-results-path',
                        type=str,
                        required=True,
                        help='The path to the directory containing the result CSVs.')
    parser.add_argument('-t',
                        '--test-name',
                        type=str,
                        required=True,
                        help='The name of the test to be run.')
    parser.add_argument(
        '--tolerance',
        type=int,
        default=10,
        help='The percent tolerance allowed for the metrics to vary.')

    args = parser.parse_args()

    with open(args.config_file, 'r') as f:
        config = yaml.safe_load(f)

    TestOutputValidator(config, args.test_name, args.inference_results_path,
                        args.tolerance)
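
As a sanity check on the tolerance logic, here is a minimal sketch of the same deviation computation applied to hand-made gpu_memory readings; the numbers are hypothetical, and the 10 percent tolerance matches the script's default:

# Minimal sketch of the deviation check above, on hypothetical values.
# Against a first-iteration reading of 2048 MB, a reading of 1900 MB
# deviates by ~7.2% and passes, while 1700 MB deviates by ~17% and fails.
tolerance = 10
gpu_memory_readings = [2048.0, 1900.0, 1700.0]

start_value = gpu_memory_readings[0]
for value in gpu_memory_readings[1:]:
    deviation_percent = abs((value - start_value) / start_value) * 100
    print(f"{value}: deviation {deviation_percent:.1f}%, "
          f"{'OK' if deviation_percent <= tolerance else 'UNSTABLE'}")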

qa/L0_stability_metrics/test.sh

Lines changed: 106 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ANALYZER_LOG="test.log"
source ../common/util.sh

rm -f *.log

# Set test parameters
MODEL_ANALYZER="`which model-analyzer`"
REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
MODEL_REPOSITORY=${MODEL_REPOSITORY:="/mnt/dldata/inferenceserver/model_analyzer_benchmark_models"}
CHECKPOINT_REPOSITORY=${CHECKPOINT_REPOSITORY:="/mnt/dldata/inferenceserver/model_analyzer_checkpoints"}
TRITON_LAUNCH_MODE=${TRITON_LAUNCH_MODE:="local"}
CLIENT_PROTOCOL="grpc"
PORTS=(`find_available_ports 3`)
GPUS=(`get_all_gpus_uuids`)
OUTPUT_MODEL_REPOSITORY=${OUTPUT_MODEL_REPOSITORY:=`get_output_directory`}
CONFIG_FILE="config.yaml"
NUM_ITERATIONS=${NUM_ITERATIONS:=5}
BENCHMARK_MODELS="`ls ${MODEL_REPOSITORY}`"
MODEL_NAMES="$(echo $BENCHMARK_MODELS | sed 's/ /,/g')"
EXPORT_PATH="`pwd`/results"
FILENAME_SERVER_ONLY="server-metrics.csv"
FILENAME_INFERENCE_MODEL="model-metrics-inference.csv"
FILENAME_GPU_MODEL="model-metrics-gpu.csv"
CHECKPOINT_DIRECTORY="./checkpoints"
CSV_PATH='.'

# Set up the export directory
mkdir $EXPORT_PATH

# Generate test configs
python3 test_config_generator.py --models $MODEL_NAMES

RET=0

set +e

# Run the analyzer and check the results
for (( i=1; i<=$NUM_ITERATIONS; i++ )); do
    # First profile
    MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $CONFIG_FILE"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint localhost:${PORTS[0]} --triton-grpc-endpoint localhost:${PORTS[1]}"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${PORTS[2]}/metrics"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --gpus 0"
    MODEL_ANALYZER_SUBCOMMAND="profile"

    run_analyzer
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Test Failed. model-analyzer $MODEL_ANALYZER_SUBCOMMAND exited with non-zero exit code. \n***"
        cat $ANALYZER_LOG
        RET=1
    fi

    # Then generate results
    MODEL_ANALYZER_ARGS="-e $EXPORT_PATH -f $CONFIG_FILE --filename-server-only=$FILENAME_SERVER_ONLY"
    MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --filename-model-inference=$FILENAME_INFERENCE_MODEL --filename-model-gpu=$FILENAME_GPU_MODEL"
    MODEL_ANALYZER_SUBCOMMAND="analyze"

    run_analyzer
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Test Failed. model-analyzer $MODEL_ANALYZER_SUBCOMMAND exited with non-zero exit code. \n***"
        cat $ANALYZER_LOG
        RET=1
    fi

    # Keep this iteration's GPU metrics and start the next one from a clean slate
    mv $EXPORT_PATH/results/$FILENAME_GPU_MODEL $CSV_PATH/result_${i}.csv
    rm $CHECKPOINT_DIRECTORY/*
done

# Verify that the metrics are stable across iterations
TEST_NAME='metrics_stability'
python3 check_results.py -f $CONFIG_FILE -t $TEST_NAME -r $CSV_PATH
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** Test Output Verification Failed for $TEST_NAME test.\n***"
    cat $ANALYZER_LOG
    RET=1
fi
set -e

rm -rf $EXPORT_PATH
rm -rf $OUTPUT_MODEL_REPOSITORY
rm -rf $CHECKPOINT_DIRECTORY
rm *.csv

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test PASSED\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
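
The analyze step writes one model-metrics-gpu.csv per iteration, whose columns follow the gpu_output_fields list set by the config generator below. A rough sketch of how one such row is unpacked, using a hypothetical model name and made-up values:

# Hypothetical row from a result_<i>.csv, in gpu_output_fields order:
# model_name, batch_size, concurrency, gpu_used_memory, gpu_utilization
line = "resnet50_libtorch,8,16,2048.0,43.5"

# Unpacked the same way check_results.py does; batch size and
# concurrency are discarded since only the GPU metrics are compared
model, _, _, gpu_memory, gpu_utilization = line.split(',')
print(model, float(gpu_memory), float(gpu_utilization))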
qa/L0_stability_metrics/test_config_generator.py

Lines changed: 81 additions & 0 deletions
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import yaml


class TestConfigGenerator:
    """
    This class contains functions that
    create configs for various test scenarios.

    The `setup` function does the work common to all tests.

    TO ADD A TEST: Simply add a member function whose name starts
    with 'generate'.
    """

    def __init__(self):
        test_functions = [
            self.__getattribute__(name)
            for name in dir(self)
            if name.startswith('generate')
        ]

        for test_function in test_functions:
            self.setup()
            test_function()

    def setup(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('-m',
                            '--models',
                            type=str,
                            required=True,
                            help='The models used for this test')

        self.args = parser.parse_args()
        self.models = sorted(self.args.models.split(','))

        self.config = {}

        # Profile config
        self.config['run_config_search_disable'] = True
        self.config['concurrency'] = 16
        self.config['batch-size'] = 8
        self.config['profile_models'] = self.models

        # Analyze config
        self.config['summarize'] = False
        self.config['collect_cpu_metrics'] = True
        self.config['gpu_output_fields'] = [
            'model_name', 'batch_size', 'concurrency', 'gpu_used_memory',
            'gpu_utilization'
        ]
        self.config['analysis_models'] = {}
        for model in self.models:
            self.config['analysis_models'][model] = {
                'objectives': {
                    'perf_throughput': 10
                }
            }

    def generate_configs(self):
        with open('config.yaml', 'w+') as f:
            yaml.dump(self.config, f)


if __name__ == '__main__':
    TestConfigGenerator()
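
For illustration, this sketch reproduces the config that generate_configs would dump for two hypothetical models, model_a and model_b (yaml.dump sorts keys alphabetically by default, so the emitted file's key order will differ from the dict below):

# Sketch of the config.yaml the generator above would emit for two
# hypothetical models, model_a and model_b
import yaml

config = {
    'run_config_search_disable': True,
    'concurrency': 16,
    'batch-size': 8,
    'profile_models': ['model_a', 'model_b'],
    'summarize': False,
    'collect_cpu_metrics': True,
    'gpu_output_fields': [
        'model_name', 'batch_size', 'concurrency', 'gpu_used_memory',
        'gpu_utilization'
    ],
    'analysis_models': {
        'model_a': {'objectives': {'perf_throughput': 10}},
        'model_b': {'objectives': {'perf_throughput': 10}},
    },
}
print(yaml.dump(config))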

qa/L0_stability_perf/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--profile-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')
         parser.add_argument(
             '-r',
             '--request-count',

qa/L0_stability_result/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--analysis-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')

         self.args = parser.parse_args()
         self.config = {}

qa/L0_stability_steps/test_config_generator.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def setup(self):
                             '--profile-models',
                             type=str,
                             required=True,
-                            help='The config file for this test')
+                            help='The models used for this test')

         self.args = parser.parse_args()
         self.profile_models = sorted(self.args.profile_models.split(','))
