
Commit 6ec8d3e

fix: ci tests (#970)
1 parent 7574c4c commit 6ec8d3e

File tree

37 files changed: +410 additions, -100 deletions


.github/workflows/pre-commit.yml

Lines changed: 20 additions & 4 deletions

@@ -31,8 +31,24 @@ on:
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/[email protected]
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Fetch base commit
+        run: |
+          git remote add base "${{ github.event.pull_request.base.repo.clone_url }}"
+          git fetch --no-tags --depth=1 base ${{ github.event.pull_request.base.sha }}
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+
+      - uses: pre-commit/[email protected]
+        with:
+          extra_args: >-
+            --from-ref ${{ github.event.pull_request.base.sha }}
+            --to-ref ${{ github.sha }}
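
With this change the job lints only the files touched by the pull request, via pre-commit's `--from-ref`/`--to-ref` arguments. A minimal local equivalent, sketched under the assumptions that `git` and `pre-commit` are on PATH and that `origin/main` is the base branch (neither is taken from this commit):

```python
# Sketch: lint only files changed relative to the merge base, mirroring the
# extra_args passed in the workflow above. "origin/main" is an assumed base ref.
import subprocess

base = subprocess.check_output(
    ["git", "merge-base", "origin/main", "HEAD"], text=True
).strip()

# pre-commit's --from-ref/--to-ref restrict hooks to the changed files.
subprocess.run(["pre-commit", "run", "--from-ref", base, "--to-ref", "HEAD"])
```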

.github/workflows/python-package.yaml

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ["ubuntu-22.04"]
+        os: ["ubuntu-latest"]
         python-version: ["3.11"]
     env:
       SKIP_GPU_TESTS: 1

.pre-commit-config.yaml

Lines changed: 6 additions & 2 deletions

@@ -69,8 +69,10 @@ repos:
   # More details about these pre-commit hooks here:
   # https://pre-commit.com/hooks.html
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
+      - id: check-added-large-files
+        args: ['--maxkb=5000']
       - id: check-case-conflict
       - id: check-executables-have-shebangs
       - id: check-merge-conflict
@@ -80,8 +82,10 @@ repos:
         exclude: ^helm-chart/templates\/.*$
       - id: check-shebang-scripts-are-executable
       - id: end-of-file-fixer
-        types_or: [c, c++, cuda, proto, textproto, java, python]
+        types_or: [c, c++, cuda, proto, textproto, java, python, shell, bash]
       - id: mixed-line-ending
+      - id: no-commit-to-branch
+        args: [--branch, main]
       - id: requirements-txt-fixer
       - id: trailing-whitespace

docs/cli.md

Lines changed: 0 additions & 3 deletions

@@ -23,9 +23,6 @@ command line interface.
 $ model-analyzer -h
 ```
 
-Options like `-q`, `--quiet` and `-v`, `--verbose` are global and apply to all
-model analyzer subcommands.
-
 ## Model Analyzer Modes
 
 The `-m` or `--mode` flag is global and is accessible to all subcommands. It tells the model analyzer the context

model_analyzer/cli/cli.py

Lines changed: 22 additions & 12 deletions

@@ -50,18 +50,6 @@ def _add_global_options(self):
         to the parser
         """
 
-        self._parser.add_argument(
-            "-q",
-            "--quiet",
-            action="store_true",
-            help="Suppress all output except for error messages.",
-        )
-        self._parser.add_argument(
-            "-v",
-            "--verbose",
-            action="store_true",
-            help="Show detailed logs, messages and status.",
-        )
         self._parser.add_argument(
             "-m",
             "--mode",
@@ -74,6 +62,25 @@ def _add_global_options(self):
             "--version", action="store_true", help="Show the Model Analyzer version."
         )
 
+    def _add_global_options_to_subparser(self, subparser):
+        """
+        Adds global options to a subparser so they can be
+        used after the subcommand (e.g., 'model-analyzer profile -v')
+        """
+
+        subparser.add_argument(
+            "-q",
+            "--quiet",
+            action="store_true",
+            help="Suppress all output except for error messages.",
+        )
+        subparser.add_argument(
+            "-v",
+            "--verbose",
+            action="store_true",
+            help="Show detailed logs, messages and status.",
+        )
+
     def add_subcommand(self, cmd, help, config=None):
         """
         Adds a subparser to the main parser representing
@@ -92,6 +99,9 @@ def add_subcommand(self, cmd, help, config=None):
         """
 
         subparser = self._subparsers.add_parser(cmd, help=help)
+
+        self._add_global_options_to_subparser(subparser)
+
         if config:
            self._add_config_arguments(subparser, config)
        self._subcommand_configs[cmd] = config
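
For readers unfamiliar with the argparse pattern this change adopts: flags registered only on the top-level parser must appear before the subcommand, so registering them on each subparser is what makes `model-analyzer profile -v` parse. A minimal sketch of the idea (the parser and subcommand names here are illustrative, not Model Analyzer's actual CLI):

```python
# Sketch of per-subparser global options; "tool", "profile", "report" are
# made-up names for illustration.
import argparse


def add_global_options(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("-q", "--quiet", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")


parser = argparse.ArgumentParser(prog="tool")
subparsers = parser.add_subparsers(dest="cmd")
for cmd in ("profile", "report"):
    sub = subparsers.add_parser(cmd)
    add_global_options(sub)  # now "tool profile -v" parses

print(parser.parse_args(["profile", "-v"]))
# Namespace(quiet=False, verbose=True, cmd='profile')
```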

model_analyzer/perf_analyzer/perf_analyzer.py

Lines changed: 57 additions & 4 deletions

@@ -380,6 +380,37 @@ def _create_process(self, cmd, perf_analyzer_env):
             raise TritonModelAnalyzerException(f"perf_analyzer binary not found : {e}")
         return process
 
+    def _verify_output_files_exist(self):
+        """
+        Verify that perf_analyzer created the expected output files.
+        Waits briefly to handle filesystem buffering delays.
+        Returns True if all expected files exist, False otherwise.
+        """
+        import time
+
+        max_wait_time = 2.0  # seconds
+        wait_interval = 0.1  # seconds
+        max_attempts = int(max_wait_time / wait_interval)
+
+        for perf_config in [
+            mrc.perf_config() for mrc in self._config.model_run_configs()
+        ]:
+            latency_file = perf_config["latency-report-file"]
+
+            file_found = False
+            for attempt in range(max_attempts):
+                if os.path.isfile(latency_file):
+                    file_found = True
+                    break
+                if attempt < max_attempts - 1:  # Don't sleep on last attempt
+                    time.sleep(wait_interval)
+
+            if not file_found:
+                logger.error(f"Expected output file not found: {latency_file}")
+                return False
+
+        return True
+
     def _resolve_process(self, process):
         if self._poll_perf_analyzer(process) == 1:
             return self.PA_FAIL
@@ -396,6 +427,21 @@ def _resolve_process(self, process):
             )
             return self.PA_FAIL
 
+        if not self._verify_output_files_exist():
+            logger.error(
+                "perf_analyzer returned success but did not create expected output files"
+            )
+            logger.error("perf_analyzer output:")
+            if self._output:
+                logger.error(self._output)
+            else:
+                logger.error("(no output captured)")
+            # Check if this is due to measurement window being too small
+            if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                return self.PA_FAIL
+            else:
+                return self.PA_RETRY
+
         return self.PA_SUCCESS
 
     def _poll_perf_analyzer(self, process):
@@ -452,10 +498,14 @@ def _auto_adjust_parameters(self, process):
         """
         Attempt to update PA parameters based on the output
         """
+        logger.debug(
+            f"_auto_adjust_parameters called. returncode={process.returncode}, output_length={len(self._output)}, has_failed_msg={'Failed to obtain stable measurement' in self._output}, has_larger_window_msg={'Please use a larger time window' in self._output}"
+        )
         if (
             self._output.find("Failed to obtain stable measurement") != -1
             or self._output.find("Please use a larger time window") != -1
         ):
+            logger.debug("Found error message, will adjust parameters")
             per_rank_logs = self._split_output_per_rank()
 
             for index, log in enumerate(per_rank_logs):
@@ -476,6 +526,9 @@ def _auto_adjust_parameters_for_perf_config(self, perf_config, log):
             log.find("Failed to obtain stable measurement") != -1
             or log.find("Please use a larger time window") != -1
         ):
+            logger.debug(
+                f"Found measurement error in log, will adjust parameters. measurement-mode={perf_config['measurement-mode']}, current measurement-interval={perf_config['measurement-interval']}"
+            )
             if perf_config["measurement-mode"] == "time_windows":
                 if perf_config["measurement-interval"] is None:
                     perf_config["measurement-interval"] = (
@@ -545,10 +598,10 @@ def _parse_generic_outputs(self, metrics):
         for perf_config in [
             mrc.perf_config() for mrc in self._config.model_run_configs()
         ]:
-            logger.debug(
-                f"Reading PA results from {perf_config['latency-report-file']}"
-            )
-            with open(perf_config["latency-report-file"], mode="r") as f:
+            latency_file = perf_config["latency-report-file"]
+            logger.debug(f"Reading PA results from {latency_file}")
+
+            with open(latency_file, mode="r") as f:
                 csv_reader = csv.DictReader(f, delimiter=",")
 
                 for row in csv_reader:
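
The core of `_verify_output_files_exist` is a bounded poll: check for the file, sleep, retry, and give up after a fixed time budget rather than blocking forever. A self-contained sketch of that pattern (the path below is illustrative):

```python
# Sketch: wait up to max_wait seconds for a file to appear, polling at a
# fixed interval. Mirrors the bounded loop above; the path is made up.
import os
import time


def wait_for_file(path: str, max_wait: float = 2.0, interval: float = 0.1) -> bool:
    attempts = int(max_wait / interval)
    for attempt in range(attempts):
        if os.path.isfile(path):
            return True
        if attempt < attempts - 1:  # no point sleeping after the final check
            time.sleep(interval)
    return False


print(wait_for_file("/tmp/latency_report.csv"))
```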

model_analyzer/triton/client/client.py

Lines changed: 5 additions & 0 deletions

@@ -65,6 +65,11 @@ def wait_for_server_ready(
                 time.sleep(sleep_time)
                 retries -= 1
             except Exception as e:
+                # Log connection failures with more detail for debugging
+                if retries == num_retries or retries % 10 == 0:
+                    logger.debug(
+                        f"Failed to connect to Triton server (attempt {num_retries - retries + 1}/{num_retries}): {e}"
+                    )
                 self._check_for_triton_log_errors(log_file)
                 time.sleep(sleep_time)
                 retries -= 1
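
The `retries == num_retries or retries % 10 == 0` guard is a simple log-throttling pattern: record the first failure and then every tenth retry, so a long server startup does not flood the log. A standalone sketch (the failing connection attempt is simulated):

```python
# Sketch of throttled retry logging: first failure plus every tenth retry.
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("retry")

num_retries = 25
for retries in range(num_retries, 0, -1):
    # stand-in for a connection attempt that raised an exception
    if retries == num_retries or retries % 10 == 0:
        logger.debug(
            "Failed to connect (attempt %d/%d)", num_retries - retries + 1, num_retries
        )
```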

model_analyzer/triton/client/grpc_client.py

Lines changed: 12 additions & 0 deletions

@@ -49,12 +49,24 @@ def __init__(self, server_url, ssl_options={}):
         if "ssl-grpc-certificate-chain-file" in ssl_options:
             certificate_chain = ssl_options["ssl-grpc-certificate-chain-file"]
 
+        # Fix for gRPC 1.60.0+: Force IPv4 resolution for localhost connections
+        # gRPC 1.60.0+ prefers IPv6, causing "localhost" to resolve to [::1]
+        # On systems where IPv6 is not properly configured, this causes connection failures
+        # Force IPv4 by using 127.0.0.1, which is more reliable across environments
+        channel_args = None
+        if "localhost" in server_url:
+            server_url = server_url.replace("localhost", "127.0.0.1")
+            # For SSL connections, override target name to match certificate
+            if ssl:
+                channel_args = [("grpc.ssl_target_name_override", "localhost")]
+
         self._client = grpcclient.InferenceServerClient(
             url=server_url,
             ssl=ssl,
             root_certificates=root_certificates,
             private_key=private_key,
             certificate_chain=certificate_chain,
+            channel_args=channel_args,
         )
 
     def get_model_config(self, model_name, num_retries):
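
To see why this rewrite matters, inspect how the resolver orders loopback addresses: on hosts where `localhost` lists `::1` first, a gRPC 1.60+ client tries IPv6 before IPv4. This sketch only prints the resolution order and makes no gRPC call (the port is an arbitrary placeholder):

```python
# Sketch: show whether "localhost" resolves to IPv6 ([::1]) before IPv4
# (127.0.0.1). Port 8001 is just a placeholder.
import socket

for family, _type, _proto, _name, sockaddr in socket.getaddrinfo("localhost", 8001):
    label = "IPv6" if family == socket.AF_INET6 else "IPv4"
    print(label, sockaddr[0])  # "::1" printed first => IPv6 preferred
```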

nvidia_entrypoint.sh

Lines changed: 50 additions & 19 deletions

@@ -13,47 +13,78 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -e
+set -euo pipefail
 
-if [[ "$(find -L /usr -name libcuda.so.1 | grep -v "compat") " == " " || "$(ls /dev/nvidiactl 2>/dev/null) " == " " ]]; then
+# --- Helpers ---------------------------------------------------------------
+
+first_lib() {
+  # Prefer ldconfig (fast/quiet), fall back to a bounded dir scan
+  local pat="$1"
+  local p
+  p="$(ldconfig -p 2>/dev/null | awk -v re="$pat" '$0 ~ re {print $4; exit}')" || true
+  if [[ -z "${p:-}" ]]; then
+    for d in /usr/lib /usr/lib64 /lib /lib64 /usr/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu; do
+      [[ -d "$d" ]] || continue
+      p="$(printf '%s\n' "$d"/lib*.so* 2>/dev/null | awk -v re="$pat" '$0 ~ re {print; exit}')" || true
+      [[ -n "${p:-}" ]] && break
+    done
+  fi
+  [[ -n "${p:-}" ]] && printf '%s\n' "$p"
+}
+
+has_nvidia_driver() {
+  [[ -e /dev/nvidiactl ]] || return 1
+  [[ -n "$(first_lib 'libcuda\.so\.1($| )')" ]]
+}
+
+# --- GPU presence / compatibility -----------------------------------------
+
+if ! has_nvidia_driver; then
   echo
-  echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available."
-  echo "   Use Docker with NVIDIA Container Toolkit to start this container; see"
-  echo "   https://github.com/NVIDIA/nvidia-docker."
-  ln -s `find / -name libnvidia-ml.so -print -quit` /opt/tritonserver/lib/libnvidia-ml.so.1
+  echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available."
+  echo "         Use Docker with NVIDIA Container Toolkit: https://github.com/NVIDIA/nvidia-docker"
+  # Some Triton paths expect libnvidia-ml.so.1; create a symlink if we can find any variant.
+  if ml="$(first_lib 'libnvidia-ml\.so(\.|$)')" ; then
+    install -d /opt/tritonserver/lib
+    ln -sf "$ml" /opt/tritonserver/lib/libnvidia-ml.so.1 || true
+  fi
   export TRITON_SERVER_CPU_ONLY=1
 else
-  DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
-  if [[ ! "$DRIVER_VERSION" =~ ^[0-9]*.[0-9]*(.[0-9]*)?$ ]]; then
+  DRIVER_VERSION="$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)"
+  if [[ -z "${DRIVER_VERSION}" || ! "$DRIVER_VERSION" =~ ^[0-9]+(\.[0-9]+){0,2}$ ]]; then
     echo "Failed to detect NVIDIA driver version."
-  elif [[ "${DRIVER_VERSION%%.*}" -lt "${CUDA_DRIVER_VERSION%%.*}" ]]; then
-    if [[ "${_CUDA_COMPAT_STATUS}" == "CUDA Driver OK" ]]; then
+  elif [[ -n "${CUDA_DRIVER_VERSION:-}" ]] && [[ "${DRIVER_VERSION%%.*}" -lt "${CUDA_DRIVER_VERSION%%.*}" ]]; then
+    if [[ "${_CUDA_COMPAT_STATUS:-}" == "CUDA Driver OK" ]]; then
       echo
-      echo "NOTE: Legacy NVIDIA Driver detected.  Compatibility mode ENABLED."
+      echo "NOTE: Legacy NVIDIA Driver detected. Compatibility mode ENABLED."
     else
       echo
-      echo "ERROR: This container was built for NVIDIA Driver Release ${CUDA_DRIVER_VERSION%.*} or later, but"
-      echo "       version ${DRIVER_VERSION} was detected and compatibility mode is UNAVAILABLE."
+      echo "ERROR: This container was built for NVIDIA Driver Release ${CUDA_DRIVER_VERSION%.*} or later,"
+      echo "       but version ${DRIVER_VERSION} was detected and compatibility mode is UNAVAILABLE."
       echo
-      echo "       [[${_CUDA_COMPAT_STATUS}]]"
+      echo "       [[${_CUDA_COMPAT_STATUS:-unset}]]"
       sleep 2
     fi
   fi
 fi
 
-if ! cat /proc/cpuinfo | grep flags | sort -u | grep avx >& /dev/null; then
+# --- CPU AVX advisory ------------------------------------------------------
+
+if ! grep -qm1 ' avx' /proc/cpuinfo; then
   echo
-  echo "ERROR: This container was built for CPUs supporting at least the AVX instruction set, but"
-  echo "       the CPU detected was $(cat /proc/cpuinfo |grep "model name" | sed 's/^.*: //' | sort -u), which does not report"
-  echo "       support for AVX. An Illegal Instruction exception at runtime is likely to result."
+  echo "ERROR: This container was built for CPUs supporting at least the AVX instruction set,"
+  echo "       but the detected CPU ($(grep -m1 'model name' /proc/cpuinfo | sed 's/^.*: //')) does not report AVX."
+  echo "       An Illegal Instruction exception at runtime is likely to result."
   echo "       See https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#CPUs_with_AVX ."
   sleep 2
 fi
 
 echo
 
+# --- Hand off --------------------------------------------------------------
+
 if [[ $# -eq 0 ]]; then
-  exec "/bin/bash"
+  exec /bin/bash
 else
   exec "$@"
 fi
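
For readers who want to experiment with the library lookup outside the container, here is a rough Python rendering of the `first_lib` helper: ask `ldconfig -p` for a cached path first, then fall back to scanning a few common library directories (the directory list is an assumption modeled on the script's):

```python
# Sketch of first_lib(): prefer ldconfig's cache, fall back to a bounded scan.
import re
import subprocess
from pathlib import Path


def first_lib(pattern):
    try:
        out = subprocess.check_output(["ldconfig", "-p"], text=True)
        for line in out.splitlines():
            if re.search(pattern, line):
                # ldconfig -p lines look like "libcuda.so.1 (...) => /path"
                return line.rsplit("=> ", 1)[-1].strip()
    except (OSError, subprocess.CalledProcessError):
        pass  # no usable ldconfig; fall through to the directory scan
    for d in ("/usr/lib", "/usr/lib64", "/lib/x86_64-linux-gnu"):
        base = Path(d)
        if not base.is_dir():
            continue
        for p in sorted(base.glob("lib*.so*")):
            if re.search(pattern, p.name):
                return str(p)
    return None


print(first_lib(r"libcuda\.so\.1"))
```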

qa/L0_bls_model/check_results.py

Lines changed: 16 additions & 0 deletions

@@ -49,6 +49,22 @@ def check_profile_logs(self):
         with open(self._analyzer_log, "r") as f:
             log_contents = f.read()
 
+        # Quick search (hill-climbing algorithm) with default search space
+        # Model - bls (Business Logic Scripting):
+        #   concurrency: 1 to 1024 (11) [default max_concurrency]
+        #   max_batch_size: model-dependent (~8)
+        #   instance_group: 1 to 5 (5) [default max_instance_count]
+        #   Composing models also have instance_group configurations
+        #
+        # Quick search explores the space using hill-climbing, starting from
+        # a default configuration and moving to better neighbors until convergence.
+        # BLS models have additional composing models that are profiled together.
+        #
+        # With default max values, the search space is large, resulting in
+        # more measurements as the algorithm explores different configurations.
+        #
+        # Minimum number of measurements: 20
+        # Maximum number of measurements: 80
         expected_min_num_measurements = 20
         expected_max_num_measurements = 80
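
The comment block describes quick search as hill-climbing: start from a default configuration and repeatedly move to the best-scoring neighbor until none improves. A toy sketch of that loop, with an invented objective and neighbor moves purely for illustration (not Model Analyzer's real scoring):

```python
# Toy hill-climbing over (concurrency, instance_count); the score function
# is a stand-in, not Model Analyzer's throughput/latency objective.
def score(cfg):
    concurrency, instances = cfg
    return -(abs(concurrency - 64) + 10 * abs(instances - 2))


def neighbors(cfg):
    c, i = cfg
    return [
        (max(1, c // 2), i), (min(1024, c * 2), i),  # halve/double concurrency
        (c, max(1, i - 1)), (c, min(5, i + 1)),      # step instance count
    ]


cfg, measurements = (1, 1), 1
while True:
    best = max(neighbors(cfg), key=score)
    if score(best) <= score(cfg):
        break  # converged: no neighbor improves
    cfg, measurements = best, measurements + 1

print(cfg, measurements)  # e.g. (64, 2) after a handful of moves
```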
