
Commit 6ec8d3e

fix: ci tests (#970)
1 parent 7574c4c commit 6ec8d3e

File tree

37 files changed: +410 additions, -100 deletions


.github/workflows/pre-commit.yml

Lines changed: 20 additions & 4 deletions

@@ -31,8 +31,24 @@ on:
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/[email protected]
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Fetch base commit
+        run: |
+          git remote add base "${{ github.event.pull_request.base.repo.clone_url }}"
+          git fetch --no-tags --depth=1 base ${{ github.event.pull_request.base.sha }}
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+
+      - uses: pre-commit/[email protected]
+        with:
+          extra_args: >-
+            --from-ref ${{ github.event.pull_request.base.sha }}
+            --to-ref ${{ github.sha }}
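
With this change the job lints only the files touched by the pull request, via pre-commit's `--from-ref`/`--to-ref` arguments. A minimal local equivalent, sketched under the assumptions that `git` and `pre-commit` are on PATH and that `origin/main` is the base branch (neither is taken from this commit):

```python
# Sketch: lint only files changed relative to the merge base, mirroring the
# extra_args passed in the workflow above. "origin/main" is an assumed base ref.
import subprocess

base = subprocess.check_output(
    ["git", "merge-base", "origin/main", "HEAD"], text=True
).strip()

# pre-commit's --from-ref/--to-ref restrict hooks to the changed files.
subprocess.run(["pre-commit", "run", "--from-ref", base, "--to-ref", "HEAD"])
```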

.github/workflows/python-package.yaml

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ["ubuntu-22.04"]
+        os: ["ubuntu-latest"]
         python-version: ["3.11"]
     env:
       SKIP_GPU_TESTS: 1

.pre-commit-config.yaml

Lines changed: 6 additions & 2 deletions

@@ -69,8 +69,10 @@ repos:
   # More details about these pre-commit hooks here:
   # https://pre-commit.com/hooks.html
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
+      - id: check-added-large-files
+        args: ['--maxkb=5000']
       - id: check-case-conflict
       - id: check-executables-have-shebangs
       - id: check-merge-conflict
@@ -80,8 +82,10 @@ repos:
         exclude: ^helm-chart/templates\/.*$
       - id: check-shebang-scripts-are-executable
       - id: end-of-file-fixer
-        types_or: [c, c++, cuda, proto, textproto, java, python]
+        types_or: [c, c++, cuda, proto, textproto, java, python, shell, bash]
       - id: mixed-line-ending
+      - id: no-commit-to-branch
+        args: [--branch, main]
       - id: requirements-txt-fixer
       - id: trailing-whitespace

docs/cli.md

Lines changed: 0 additions & 3 deletions

@@ -23,9 +23,6 @@ command line interface.
 $ model-analyzer -h
 ```
 
-Options like `-q`, `--quiet` and `-v`, `--verbose` are global and apply to all
-model analyzer subcommands.
-
 ## Model Analyzer Modes
 
 The `-m` or `--mode` flag is global and is accessible to all subcommands. It tells the model analyzer the context

model_analyzer/cli/cli.py

Lines changed: 22 additions & 12 deletions

@@ -50,18 +50,6 @@ def _add_global_options(self):
         to the parser
         """
 
-        self._parser.add_argument(
-            "-q",
-            "--quiet",
-            action="store_true",
-            help="Suppress all output except for error messages.",
-        )
-        self._parser.add_argument(
-            "-v",
-            "--verbose",
-            action="store_true",
-            help="Show detailed logs, messages and status.",
-        )
         self._parser.add_argument(
             "-m",
             "--mode",
@@ -74,6 +62,25 @@ def _add_global_options(self):
             "--version", action="store_true", help="Show the Model Analyzer version."
         )
 
+    def _add_global_options_to_subparser(self, subparser):
+        """
+        Adds global options to a subparser so they can be
+        used after the subcommand (e.g., 'model-analyzer profile -v')
+        """
+
+        subparser.add_argument(
+            "-q",
+            "--quiet",
+            action="store_true",
+            help="Suppress all output except for error messages.",
+        )
+        subparser.add_argument(
+            "-v",
+            "--verbose",
+            action="store_true",
+            help="Show detailed logs, messages and status.",
+        )
+
     def add_subcommand(self, cmd, help, config=None):
         """
         Adds a subparser to the main parser representing
@@ -92,6 +99,9 @@ def add_subcommand(self, cmd, help, config=None):
         """
 
         subparser = self._subparsers.add_parser(cmd, help=help)
+
+        self._add_global_options_to_subparser(subparser)
+
         if config:
            self._add_config_arguments(subparser, config)
        self._subcommand_configs[cmd] = config
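
For readers unfamiliar with the argparse pattern this change adopts: flags registered only on the top-level parser must appear before the subcommand, so registering them on each subparser is what makes `model-analyzer profile -v` parse. A minimal sketch of the idea (the parser and subcommand names here are illustrative, not Model Analyzer's actual CLI):

```python
# Sketch of per-subparser global options; "tool", "profile", "report" are
# made-up names for illustration.
import argparse


def add_global_options(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("-q", "--quiet", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")


parser = argparse.ArgumentParser(prog="tool")
subparsers = parser.add_subparsers(dest="cmd")
for cmd in ("profile", "report"):
    sub = subparsers.add_parser(cmd)
    add_global_options(sub)  # now "tool profile -v" parses

print(parser.parse_args(["profile", "-v"]))
# Namespace(quiet=False, verbose=True, cmd='profile')
```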

model_analyzer/perf_analyzer/perf_analyzer.py

Lines changed: 57 additions & 4 deletions

@@ -380,6 +380,37 @@ def _create_process(self, cmd, perf_analyzer_env):
             raise TritonModelAnalyzerException(f"perf_analyzer binary not found : {e}")
         return process
 
+    def _verify_output_files_exist(self):
+        """
+        Verify that perf_analyzer created the expected output files.
+        Waits briefly to handle filesystem buffering delays.
+        Returns True if all expected files exist, False otherwise.
+        """
+        import time
+
+        max_wait_time = 2.0  # seconds
+        wait_interval = 0.1  # seconds
+        max_attempts = int(max_wait_time / wait_interval)
+
+        for perf_config in [
+            mrc.perf_config() for mrc in self._config.model_run_configs()
+        ]:
+            latency_file = perf_config["latency-report-file"]
+
+            file_found = False
+            for attempt in range(max_attempts):
+                if os.path.isfile(latency_file):
+                    file_found = True
+                    break
+                if attempt < max_attempts - 1:  # Don't sleep on last attempt
+                    time.sleep(wait_interval)
+
+            if not file_found:
+                logger.error(f"Expected output file not found: {latency_file}")
+                return False
+
+        return True
+
     def _resolve_process(self, process):
         if self._poll_perf_analyzer(process) == 1:
             return self.PA_FAIL
@@ -396,6 +427,21 @@ def _resolve_process(self, process):
             )
             return self.PA_FAIL
 
+        if not self._verify_output_files_exist():
+            logger.error(
+                "perf_analyzer returned success but did not create expected output files"
+            )
+            logger.error("perf_analyzer output:")
+            if self._output:
+                logger.error(self._output)
+            else:
+                logger.error("(no output captured)")
+            # Check if this is due to measurement window being too small
+            if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                return self.PA_FAIL
+            else:
+                return self.PA_RETRY
+
         return self.PA_SUCCESS
 
     def _poll_perf_analyzer(self, process):
@@ -452,10 +498,14 @@ def _auto_adjust_parameters(self, process):
         """
         Attempt to update PA parameters based on the output
         """
+        logger.debug(
+            f"_auto_adjust_parameters called. returncode={process.returncode}, output_length={len(self._output)}, has_failed_msg={'Failed to obtain stable measurement' in self._output}, has_larger_window_msg={'Please use a larger time window' in self._output}"
+        )
         if (
             self._output.find("Failed to obtain stable measurement") != -1
             or self._output.find("Please use a larger time window") != -1
         ):
+            logger.debug("Found error message, will adjust parameters")
             per_rank_logs = self._split_output_per_rank()
 
             for index, log in enumerate(per_rank_logs):
@@ -476,6 +526,9 @@ def _auto_adjust_parameters_for_perf_config(self, perf_config, log):
             log.find("Failed to obtain stable measurement") != -1
             or log.find("Please use a larger time window") != -1
         ):
+            logger.debug(
+                f"Found measurement error in log, will adjust parameters. measurement-mode={perf_config['measurement-mode']}, current measurement-interval={perf_config['measurement-interval']}"
+            )
             if perf_config["measurement-mode"] == "time_windows":
                 if perf_config["measurement-interval"] is None:
                     perf_config["measurement-interval"] = (
@@ -545,10 +598,10 @@ def _parse_generic_outputs(self, metrics):
         for perf_config in [
             mrc.perf_config() for mrc in self._config.model_run_configs()
         ]:
-            logger.debug(
-                f"Reading PA results from {perf_config['latency-report-file']}"
-            )
-            with open(perf_config["latency-report-file"], mode="r") as f:
+            latency_file = perf_config["latency-report-file"]
+            logger.debug(f"Reading PA results from {latency_file}")
+
+            with open(latency_file, mode="r") as f:
                 csv_reader = csv.DictReader(f, delimiter=",")
 
                 for row in csv_reader:
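
The core of `_verify_output_files_exist` is a bounded poll: check for the file, sleep, retry, and give up after a fixed time budget rather than blocking forever. A self-contained sketch of that pattern (the path below is illustrative):

```python
# Sketch: wait up to max_wait seconds for a file to appear, polling at a
# fixed interval. Mirrors the bounded loop above; the path is made up.
import os
import time


def wait_for_file(path: str, max_wait: float = 2.0, interval: float = 0.1) -> bool:
    attempts = int(max_wait / interval)
    for attempt in range(attempts):
        if os.path.isfile(path):
            return True
        if attempt < attempts - 1:  # no point sleeping after the final check
            time.sleep(interval)
    return False


print(wait_for_file("/tmp/latency_report.csv"))
```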

model_analyzer/triton/client/client.py

Lines changed: 5 additions & 0 deletions

@@ -65,6 +65,11 @@ def wait_for_server_ready(
                 time.sleep(sleep_time)
                 retries -= 1
             except Exception as e:
+                # Log connection failures with more detail for debugging
+                if retries == num_retries or retries % 10 == 0:
+                    logger.debug(
+                        f"Failed to connect to Triton server (attempt {num_retries - retries + 1}/{num_retries}): {e}"
+                    )
                 self._check_for_triton_log_errors(log_file)
                 time.sleep(sleep_time)
                 retries -= 1
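
The `retries == num_retries or retries % 10 == 0` guard is a simple log-throttling pattern: record the first failure and then every tenth retry, so a long server startup does not flood the log. A standalone sketch (the failing connection attempt is simulated):

```python
# Sketch of throttled retry logging: first failure plus every tenth retry.
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("retry")

num_retries = 25
for retries in range(num_retries, 0, -1):
    # stand-in for a connection attempt that raised an exception
    if retries == num_retries or retries % 10 == 0:
        logger.debug(
            "Failed to connect (attempt %d/%d)", num_retries - retries + 1, num_retries
        )
```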

model_analyzer/triton/client/grpc_client.py

Lines changed: 12 additions & 0 deletions

@@ -49,12 +49,24 @@ def __init__(self, server_url, ssl_options={}):
         if "ssl-grpc-certificate-chain-file" in ssl_options:
             certificate_chain = ssl_options["ssl-grpc-certificate-chain-file"]
 
+        # Fix for gRPC 1.60.0+: Force IPv4 resolution for localhost connections
+        # gRPC 1.60.0+ prefers IPv6, causing "localhost" to resolve to [::1]
+        # On systems where IPv6 is not properly configured, this causes connection failures
+        # Force IPv4 by using 127.0.0.1, which is more reliable across environments
+        channel_args = None
+        if "localhost" in server_url:
+            server_url = server_url.replace("localhost", "127.0.0.1")
+            # For SSL connections, override target name to match certificate
+            if ssl:
+                channel_args = [("grpc.ssl_target_name_override", "localhost")]
+
         self._client = grpcclient.InferenceServerClient(
             url=server_url,
             ssl=ssl,
             root_certificates=root_certificates,
             private_key=private_key,
             certificate_chain=certificate_chain,
+            channel_args=channel_args,
         )
 
     def get_model_config(self, model_name, num_retries):
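
To see why this rewrite matters, inspect how the resolver orders loopback addresses: on hosts where `localhost` lists `::1` first, a gRPC 1.60+ client tries IPv6 before IPv4. This sketch only prints the resolution order and makes no gRPC call (the port is an arbitrary placeholder):

```python
# Sketch: show whether "localhost" resolves to IPv6 ([::1]) before IPv4
# (127.0.0.1). Port 8001 is just a placeholder.
import socket

for family, _type, _proto, _name, sockaddr in socket.getaddrinfo("localhost", 8001):
    label = "IPv6" if family == socket.AF_INET6 else "IPv4"
    print(label, sockaddr[0])  # "::1" printed first => IPv6 preferred
```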

nvidia_entrypoint.sh

Lines changed: 50 additions & 19 deletions

@@ -13,47 +13,78 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -e
+set -euo pipefail
 
-if [[ "$(find -L /usr -name libcuda.so.1 | grep -v "compat") " == " " || "$(ls /dev/nvidiactl 2>/dev/null) " == " " ]]; then
+# --- Helpers ---------------------------------------------------------------
+
+first_lib() {
+  # Prefer ldconfig (fast/quiet), fall back to a bounded dir scan
+  local pat="$1"
+  local p
+  p="$(ldconfig -p 2>/dev/null | awk -v re="$pat" '$0 ~ re {print $4; exit}')" || true
+  if [[ -z "${p:-}" ]]; then
+    for d in /usr/lib /usr/lib64 /lib /lib64 /usr/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu; do
+      [[ -d "$d" ]] || continue
+      p="$(printf '%s\n' "$d"/lib*.so* 2>/dev/null | awk -v re="$pat" '$0 ~ re {print; exit}')" || true
+      [[ -n "${p:-}" ]] && break
+    done
+  fi
+  [[ -n "${p:-}" ]] && printf '%s\n' "$p"
+}
+
+has_nvidia_driver() {
+  [[ -e /dev/nvidiactl ]] || return 1
+  [[ -n "$(first_lib 'libcuda\.so\.1($| )')" ]]
+}
+
+# --- GPU presence / compatibility -----------------------------------------
+
+if ! has_nvidia_driver; then
   echo
-  echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available."
-  echo "   Use Docker with NVIDIA Container Toolkit to start this container; see"
-  echo "   https://github.com/NVIDIA/nvidia-docker."
-  ln -s `find / -name libnvidia-ml.so -print -quit` /opt/tritonserver/lib/libnvidia-ml.so.1
+  echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available."
+  echo "         Use Docker with NVIDIA Container Toolkit: https://github.com/NVIDIA/nvidia-docker"
+  # Some Triton paths expect libnvidia-ml.so.1; create a symlink if we can find any variant.
+  if ml="$(first_lib 'libnvidia-ml\.so(\.|$)')" ; then
+    install -d /opt/tritonserver/lib
+    ln -sf "$ml" /opt/tritonserver/lib/libnvidia-ml.so.1 || true
+  fi
   export TRITON_SERVER_CPU_ONLY=1
 else
-  DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
-  if [[ ! "$DRIVER_VERSION" =~ ^[0-9]*.[0-9]*(.[0-9]*)?$ ]]; then
+  DRIVER_VERSION="$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)"
+  if [[ -z "${DRIVER_VERSION}" || ! "$DRIVER_VERSION" =~ ^[0-9]+(\.[0-9]+){0,2}$ ]]; then
     echo "Failed to detect NVIDIA driver version."
-  elif [[ "${DRIVER_VERSION%%.*}" -lt "${CUDA_DRIVER_VERSION%%.*}" ]]; then
-    if [[ "${_CUDA_COMPAT_STATUS}" == "CUDA Driver OK" ]]; then
+  elif [[ -n "${CUDA_DRIVER_VERSION:-}" ]] && [[ "${DRIVER_VERSION%%.*}" -lt "${CUDA_DRIVER_VERSION%%.*}" ]]; then
+    if [[ "${_CUDA_COMPAT_STATUS:-}" == "CUDA Driver OK" ]]; then
       echo
-      echo "NOTE: Legacy NVIDIA Driver detected.  Compatibility mode ENABLED."
+      echo "NOTE: Legacy NVIDIA Driver detected. Compatibility mode ENABLED."
     else
       echo
-      echo "ERROR: This container was built for NVIDIA Driver Release ${CUDA_DRIVER_VERSION%.*} or later, but"
-      echo "       version ${DRIVER_VERSION} was detected and compatibility mode is UNAVAILABLE."
+      echo "ERROR: This container was built for NVIDIA Driver Release ${CUDA_DRIVER_VERSION%.*} or later,"
+      echo "       but version ${DRIVER_VERSION} was detected and compatibility mode is UNAVAILABLE."
       echo
-      echo "       [[${_CUDA_COMPAT_STATUS}]]"
+      echo "       [[${_CUDA_COMPAT_STATUS:-unset}]]"
       sleep 2
     fi
   fi
 fi
 
-if ! cat /proc/cpuinfo | grep flags | sort -u | grep avx >& /dev/null; then
+# --- CPU AVX advisory ------------------------------------------------------
+
+if ! grep -qm1 ' avx' /proc/cpuinfo; then
   echo
-  echo "ERROR: This container was built for CPUs supporting at least the AVX instruction set, but"
-  echo "       the CPU detected was $(cat /proc/cpuinfo |grep "model name" | sed 's/^.*: //' | sort -u), which does not report"
-  echo "       support for AVX. An Illegal Instruction exception at runtime is likely to result."
+  echo "ERROR: This container was built for CPUs supporting at least the AVX instruction set,"
+  echo "       but the detected CPU ($(grep -m1 'model name' /proc/cpuinfo | sed 's/^.*: //')) does not report AVX."
+  echo "       An Illegal Instruction exception at runtime is likely to result."
   echo "       See https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#CPUs_with_AVX ."
   sleep 2
 fi
 
 echo
 
+# --- Hand off --------------------------------------------------------------
+
 if [[ $# -eq 0 ]]; then
-  exec "/bin/bash"
+  exec /bin/bash
 else
   exec "$@"
 fi
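
For readers who want to experiment with the library lookup outside the container, here is a rough Python rendering of the `first_lib` helper: ask `ldconfig -p` for a cached path first, then fall back to scanning a few common library directories (the directory list is an assumption modeled on the script's):

```python
# Sketch of first_lib(): prefer ldconfig's cache, fall back to a bounded scan.
import re
import subprocess
from pathlib import Path


def first_lib(pattern):
    try:
        out = subprocess.check_output(["ldconfig", "-p"], text=True)
        for line in out.splitlines():
            if re.search(pattern, line):
                # ldconfig -p lines look like "libcuda.so.1 (...) => /path"
                return line.rsplit("=> ", 1)[-1].strip()
    except (OSError, subprocess.CalledProcessError):
        pass  # no usable ldconfig; fall through to the directory scan
    for d in ("/usr/lib", "/usr/lib64", "/lib/x86_64-linux-gnu"):
        base = Path(d)
        if not base.is_dir():
            continue
        for p in sorted(base.glob("lib*.so*")):
            if re.search(pattern, p.name):
                return str(p)
    return None


print(first_lib(r"libcuda\.so\.1"))
```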

qa/L0_bls_model/check_results.py

Lines changed: 16 additions & 0 deletions

@@ -49,6 +49,22 @@ def check_profile_logs(self):
         with open(self._analyzer_log, "r") as f:
             log_contents = f.read()
 
+        # Quick search (hill-climbing algorithm) with default search space
+        # Model - bls (Business Logic Scripting):
+        #   concurrency: 1 to 1024 (11) [default max_concurrency]
+        #   max_batch_size: model-dependent (~8)
+        #   instance_group: 1 to 5 (5) [default max_instance_count]
+        #   Composing models also have instance_group configurations
+        #
+        # Quick search explores the space using hill-climbing, starting from
+        # a default configuration and moving to better neighbors until convergence.
+        # BLS models have additional composing models that are profiled together.
+        #
+        # With default max values, the search space is large, resulting in
+        # more measurements as the algorithm explores different configurations.
+        #
+        # Minimum number of measurements: 20
+        # Maximum number of measurements: 80
         expected_min_num_measurements = 20
         expected_max_num_measurements = 80
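
The comment block describes quick search as hill-climbing: start from a default configuration and repeatedly move to the best-scoring neighbor until none improves. A toy sketch of that loop, with an invented objective and neighbor moves purely for illustration (not Model Analyzer's real scoring):

```python
# Toy hill-climbing over (concurrency, instance_count); the score function
# is a stand-in, not Model Analyzer's throughput/latency objective.
def score(cfg):
    concurrency, instances = cfg
    return -(abs(concurrency - 64) + 10 * abs(instances - 2))


def neighbors(cfg):
    c, i = cfg
    return [
        (max(1, c // 2), i), (min(1024, c * 2), i),  # halve/double concurrency
        (c, max(1, i - 1)), (c, min(5, i + 1)),      # step instance count
    ]


cfg, measurements = (1, 1), 1
while True:
    best = max(neighbors(cfg), key=score)
    if score(best) <= score(cfg):
        break  # converged: no neighbor improves
    cfg, measurements = best, measurements + 1

print(cfg, measurements)  # e.g. (64, 2) after a handful of moves
```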
