Skip to content

Commit c4ae533

Browse files
authored
Stop MA if failures are detected on initial profiles (#697)
* Detecting if no measurements are returned at the beginning of a profile
* Removing timeout PA test
* Adding missing newlines
* Fixes based on review comments
1 parent b30e061 commit c4ae533

File tree

6 files changed

+62
-28
lines changed

6 files changed

+62
-28
lines changed

model_analyzer/cli/cli.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222

2323
class CLI:
2424
"""
25-
CLI class to parse the commandline arguments
25+
CLI class to parse the command line arguments
2626
"""
2727

2828
def __init__(self):
@@ -49,7 +49,7 @@ def _add_global_options(self):
4949
'-v',
5050
'--verbose',
5151
action='store_true',
52-
help='Show detailed logs, messags and status.')
52+
help='Show detailed logs, messages and status.')
5353
self._parser.add_argument('-m',
5454
'--mode',
5555
type=str,

model_analyzer/constants.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,3 +62,6 @@
6262

6363
# Constraints
6464
GLOBAL_CONSTRAINTS_KEY = "__default__"
65+
66+
# Measurement constants
67+
INVALID_MEASUREMENT_THRESHOLD = 2

model_analyzer/model_manager.py

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,9 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from typing import List
15+
from typing import List, Optional
1616

17-
from model_analyzer.constants import LOGGER_NAME
17+
from model_analyzer.constants import LOGGER_NAME, INVALID_MEASUREMENT_THRESHOLD
1818
from model_analyzer.config.generate.run_config_generator_factory import RunConfigGeneratorFactory
1919
from .model_analyzer_exceptions import TritonModelAnalyzerException
2020
from model_analyzer.config.generate.model_variant_name_manager import ModelVariantNameManager
@@ -29,6 +29,7 @@
2929
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
3030
from model_analyzer.triton.model.model_config import ModelConfig
3131
from model_analyzer.config.input.objects.config_model_profile_spec import ConfigModelProfileSpec
32+
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
3233

3334
import logging
3435

@@ -79,6 +80,9 @@ def __init__(self, config: ConfigCommandProfile, gpus: List[GPUDevice],
7980
if state_manager.starting_fresh_run():
8081
self._init_state()
8182

83+
self._failed_measurement_attempts = 0
84+
self._received_measurement_values_from_pa = False
85+
8286
self._model_variant_name_manager = ModelVariantNameManager.from_dict(
8387
self._state_manager.get_state_variable(
8488
'ModelManager.model_variant_name_manager'))
@@ -121,6 +125,9 @@ def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
121125
if run_config.is_legal_combination():
122126
measurement = self._metrics_manager.execute_run_config(
123127
run_config)
128+
129+
self._check_for_valid_measurement(measurement)
130+
self._stop_ma_if_no_valid_measurement_threshold_reached()
124131
else:
125132
logger.info("Skipping illegal run configuration")
126133
measurement = None
@@ -188,9 +195,27 @@ def _check_for_ensemble_model_incompatibility(
188195
def _init_state(self):
189196
"""
190197
Sets ModelManager object managed
191-
state variables in AnalyerState
198+
state variables in AnalyzerState
192199
"""
193200

194201
self._state_manager.set_state_variable(
195202
'ModelManager.model_variant_name_manager',
196203
self._state_manager.default_encode(ModelVariantNameManager()))
204+
205+
def _check_for_valid_measurement(
206+
self, measurement: Optional[RunConfigMeasurement]) -> None:
207+
if measurement:
208+
self._received_measurement_values_from_pa = True
209+
else:
210+
self._failed_measurement_attempts += 1
211+
212+
def _stop_ma_if_no_valid_measurement_threshold_reached(self) -> None:
213+
if self._received_measurement_values_from_pa:
214+
return
215+
216+
if self._failed_measurement_attempts >= INVALID_MEASUREMENT_THRESHOLD:
217+
raise TritonModelAnalyzerException(
218+
f'The first {INVALID_MEASUREMENT_THRESHOLD} attempts to acquire measurements ' \
219+
'have failed. Please examine the Tritonserver/PA error logs ' \
220+
'to determine what has gone wrong.'
221+
)

qa/L0_perf_analyzer/test.sh

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,6 @@ for CONFIG_FILE in ${LIST_OF_CONFIG_FILES[@]}; do
6363
TEST_NAME="count_window"
6464
elif [[ "$CONFIG_FILE" == "config-additive-args-count-no-adjust.yml" ]]; then
6565
TEST_NAME="count_window"
66-
elif [[ "$CONFIG_FILE" == "config-perf-output-timeout.yml" ]]; then
67-
TEST_NAME="perf_output_timeout"
6866
fi
6967

7068
echo $TEST_NAME

qa/L0_perf_analyzer/test_config_generator.py

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -87,20 +87,6 @@ def generate_additive_args(self):
8787
with open('./config-additive-args-count-no-adjust.yml', 'w') as f:
8888
yaml.dump(model_config, f)
8989

90-
def generate_perf_output_timeout(self):
91-
model_config = {
92-
'profile_models': ['vgg19_libtorch'],
93-
'skip_summary_reports': True, # Don't fail analyzing no results
94-
'perf_output': True,
95-
'perf_analyzer_timeout': 2,
96-
'perf_analyzer_flags': {
97-
'measurement-mode': 'time_windows',
98-
'measurement-interval': 2,
99-
}
100-
}
101-
with open('./config-perf-output-timeout.yml', 'w') as f:
102-
yaml.dump(model_config, f)
103-
10490

10591
if __name__ == '__main__':
10692
TestConfigGenerator()

tests/test_model_manager.py

Lines changed: 29 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -691,7 +691,7 @@ def test_default_config_always_run_automatic_search(self):
691691

692692
def test_throughput_early_exit_minimum_runs(self):
693693
"""
694-
Test that there is an early backoff when sweeping concurrency
694+
Test that there is an early back off when sweeping concurrency
695695
696696
The behavior is that MA will try at least 4 concurrencies. If
697697
at that point none of the last 3 attempts have had satisfactory
@@ -738,7 +738,7 @@ def test_throughput_early_exit_minimum_runs(self):
738738

739739
def test_no_early_exit_if_not_auto_search(self):
740740
"""
741-
Test that there is NOT an early backoff when sweeping concurrency if not in auto sweep mode
741+
Test that there is NOT an early back off when sweeping concurrency if not in auto sweep mode
742742
743743
This test hardcodes the 'throughput' to 1, so for all model
744744
configs the gain will be invalid. However, it should still sweep
@@ -780,7 +780,7 @@ def test_no_early_exit_if_not_auto_search(self):
780780

781781
def test_throughput_early_exit(self):
782782
"""
783-
Test that there is an early backoff when sweeping concurrency
783+
Test that there is an early back off when sweeping concurrency
784784
785785
The behavior is that MA stop if it had 4 concurrencies in a row
786786
without any valid gain amongst any of them
@@ -822,9 +822,10 @@ def test_throughput_early_exit(self):
822822
]
823823
self._test_model_manager(yaml_str, expected_ranges)
824824

825+
@patch('model_analyzer.model_manager.INVALID_MEASUREMENT_THRESHOLD', 999)
825826
def test_bad_result_early_PA_exit(self):
826827
"""
827-
Test that there is an early backoff for bad result (out of memory)
828+
Test that there is an early back off for bad result (out of memory)
828829
829830
If no measurements are returned in an attempt, no further concurrencies
830831
should be tried.
@@ -863,9 +864,30 @@ def test_bad_result_early_PA_exit(self):
863864
mock_method.return_value = None
864865
self._test_model_manager(yaml_str, expected_ranges)
865866

867+
def test_report_failure_no_measurements(self):
868+
"""
869+
Test that MA takes an exception if we detect no measurements returned from
870+
PA at the start of profile
871+
"""
872+
873+
yaml_str = ("""
874+
profile_models: test_model
875+
run_config_search_max_concurrency: 128
876+
run_config_search_max_instance_count: 2
877+
run_config_search_min_model_batch_size: 8
878+
run_config_search_max_model_batch_size: 8
879+
run_config_search_disable: False
880+
""")
881+
882+
with patch.object(MetricsManagerSubclass,
883+
"_get_next_perf_throughput_value") as mock_method:
884+
mock_method.return_value = None
885+
with self.assertRaises(TritonModelAnalyzerException):
886+
self._test_model_manager(yaml_str, None)
887+
866888
def test_lower_throughput_early_batch_size_exit(self):
867889
"""
868-
Test that there is an early backoff for throughput decreasing
890+
Test that there is an early back off for throughput decreasing
869891
when sweeping max_batch_size
870892
871893
If a list of measurements is provided with a lower max throughput than the previous
@@ -919,10 +941,10 @@ def test_lower_throughput_early_batch_size_exit(self):
919941
#yapf: disable
920942
mock_method.side_effect = [
921943
1, 2, 4, # Default config, concurrency 1,2,4
922-
1, 2, 4, # 1 Instance, Batch size 8, concurency 1,2,4
944+
1, 2, 4, # 1 Instance, Batch size 8, concurrency 1,2,4
923945
2, 4, 8, # 1 Instance, Batch size 16, concurrency 1,2,4
924946
2, 4, 8, # 1 Instance, Batch size 32, concurrency 1,2,4
925-
1, 2, 4, # 1 Instance, Batch size 8, concurency 1,2,4
947+
1, 2, 4, # 1 Instance, Batch size 8, concurrency 1,2,4
926948
8, 4, 2, # 1 Instance, Batch size 16, concurrency 1,2,4
927949
4, 8, 16, # 1 Instance, Batch size 32, concurrency 1,2,4
928950
4, 8, 16 # 1 Instance, Batch size 64, concurrency 1,2,4

0 commit comments

Comments (0)