[ML] Replace shell test runner with portable CMake/CTest infrastructure (#2900)

edsavage · web-flow · commit a4b2634c3c46 · 2026-03-16T11:55:37.000+13:00
Replace run_tests_as_seperate_processes.sh with cmake/run-tests-individually.cmake
for portable cross-platform parallel test execution. Enable CTest integration and
rename the 'test' target to 'ml_test' to avoid conflicts.
diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py
@@ -40,6 +40,10 @@ def main():
         build_linux = pipeline_steps.generate_step_template("Linux", "build", config.build_aarch64, config.build_x86_64)
         pipeline_steps.append(build_linux)
 
+    # Analyse build timings after all build+test steps complete
+    pipeline_steps.append(pipeline_steps.generate_step("Analyse build timings",
+                                                       ".buildkite/pipelines/analyze_build_timings.yml.sh"))
+
     # Build the DRA artifacts and upload to S3 and GCS
     pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",
                                                        ".buildkite/pipelines/create_dra.yml.sh"))
diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
@@ -27,20 +27,27 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export BUILDKITE_ANALYTICS_TOKEN=$(vault read secret/ci/elastic-ml-cpp/buildkite/test_analytics/windows_x86_64 | awk '/^token/ {print $2;}')
   fi
 
+  # Buildkite REST API read token, needed only by the timing-analysis step
+  # (.buildkite/scripts/steps/analyze_build_timings.py).  A failed Vault
+  # lookup is deliberately non-fatal: the token is left empty and the
+  # analysis script skips itself when the variable is empty.
+  if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_timings" ]]; then
+    export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
+  fi
+
+  # GCS service account — inject credentials for build and Java IT steps.
+  # Build steps use it for sccache; Java IT steps use it for the Gradle
+  # build cache.  The key is stored in Vault.  A failed Vault lookup is
+  # tolerated: the key stays empty and none of the variables below are set.
+  # NOTE(review): no cleanup of the mktemp key file is visible in this hunk —
+  # confirm it is removed elsewhere once the step has finished.
+  if [[ "$BUILDKITE_STEP_KEY" == build_test_* || "$BUILDKITE_STEP_KEY" == java_integration_tests_* ]]; then
+    SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "")
+    if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then
+      export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache"
+      export SCCACHE_GCS_KEY_FILE=$(mktemp)
+      echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE"
+      export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE"
+      export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE"
+    fi
+  fi
+
   if [[ "$BUILDKITE_STEP_KEY" == "build_pytorch_docker_image" ]]; then
     export DOCKER_REGISTRY_USERNAME=$(vault read --field=username  secret/ci/elastic-ml-cpp/prod_docker_registry_credentials)
     export DOCKER_REGISTRY_PASSWORD=$(vault read --field=password  secret/ci/elastic-ml-cpp/prod_docker_registry_credentials)
   fi
 
-  # Retrieve GCS service account key for sccache (compiler caching).
-  if [[ "$BUILDKITE_STEP_KEY" == build_test_* ]]; then
-      SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "")
-      if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then
-          export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache"
-          export SCCACHE_GCS_KEY_FILE=$(mktemp)
-          echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE"
-          export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE"
-          export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE"
-      fi
-  fi
 fi
diff --git a/.buildkite/pipelines/analyze_build_timings.yml.sh b/.buildkite/pipelines/analyze_build_timings.yml.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+cat <<EOL
+steps:
+  - label: "Analyse build timings :chart_with_upwards_trend:"
+    key: "analyze_build_timings"
+    command:
+        - "python3 .buildkite/scripts/steps/analyze_build_timings.py"
+    depends_on:
+        - "test_linux-aarch64-RelWithDebInfo"
+        - "test_linux-x86_64-RelWithDebInfo"
+        - "test_macos-aarch64-RelWithDebInfo"
+        - "test_Windows-x86_64-RelWithDebInfo"
+    allow_dependency_failure: true
+    soft_fail: true
+    agents:
+      image: "python:3-slim"
+EOL
diff --git a/.buildkite/scripts/steps/analyze_build_timings.py b/.buildkite/scripts/steps/analyze_build_timings.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+"""
+Analyse build+test timings for the current snapshot build and compare
+against recent history.  Produces a Buildkite annotation with a summary
+table and flags any regressions.
+"""
+
+import json
+import math
+import os
+import subprocess
+import sys
+import urllib.request
+import urllib.error
+
+# Pipeline whose builds are analysed; API_BASE is the Buildkite REST API
+# root that api_get() prefixes to every request path.
+PIPELINE_SLUG = "ml-cpp-snapshot-builds"
+ORG_SLUG = "elastic"
+API_BASE = f"https://api.buildkite.com/v2/organizations/{ORG_SLUG}/pipelines/{PIPELINE_SLUG}"
+# Maximum number of earlier builds used as the comparison baseline.
+HISTORY_COUNT = 14
+
+# Substring looked for in a "Build & test" job name -> platform key used in
+# the timing tables.  Matching is case-sensitive and the first match wins.
+# NOTE(review): confirm these substrings match the real job labels (e.g.
+# "MacOS" vs "macOS").
+PLATFORM_MAP = {
+    "Windows": "windows_x86_64",
+    "MacOS": "macos_aarch64",
+    "linux-x86_64": "linux_x86_64",
+    "linux-aarch64": "linux_aarch64",
+}
+
+
+def api_get(path, token):
+    url = f"{API_BASE}{path}"
+    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read())
+    except urllib.error.HTTPError as e:
+        print(f"API error {e.code} for {url}: {e.read().decode()}", file=sys.stderr)
+        sys.exit(1)
+
+
+def extract_timings(build_data):
+    """Extract per-platform build+test timings from a build's jobs."""
+    timings = {}
+    for job in build_data.get("jobs", []):
+        name = job.get("name") or ""
+        if "Build & test" not in name:
+            continue
+        if "debug" in name.lower():
+            continue
+        started = job.get("started_at")
+        finished = job.get("finished_at")
+        if not started or not finished:
+            continue
+
+        for pattern, key in PLATFORM_MAP.items():
+            if pattern in name:
+                from datetime import datetime, timezone
+                fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
+                t_start = datetime.strptime(started, fmt).replace(tzinfo=timezone.utc)
+                t_end = datetime.strptime(finished, fmt).replace(tzinfo=timezone.utc)
+                mins = (t_end - t_start).total_seconds() / 60.0
+                timings[key] = round(mins, 1)
+                break
+    return timings
+
+
+def mean_stddev(values):
+    if not values:
+        return 0.0, 0.0
+    n = len(values)
+    m = sum(values) / n
+    if n < 2:
+        return m, 0.0
+    variance = sum((x - m) ** 2 for x in values) / (n - 1)
+    return m, math.sqrt(variance)
+
+
+def annotate(markdown, style="info"):
+    """Create a Buildkite annotation."""
+    cmd = ["buildkite-agent", "annotate", "--style", style, "--context", "build-timings"]
+    proc = subprocess.run(cmd, input=markdown.encode(), capture_output=True)
+    if proc.returncode != 0:
+        print(f"buildkite-agent annotate failed: {proc.stderr.decode()}", file=sys.stderr)
+
+
+def main():
+    token = os.environ.get("BUILDKITE_API_READ_TOKEN", "")
+    if not token:
+        print("BUILDKITE_API_READ_TOKEN not set, skipping timing analysis", file=sys.stderr)
+        sys.exit(0)
+
+    build_number = os.environ.get("BUILDKITE_BUILD_NUMBER", "")
+    branch = os.environ.get("BUILDKITE_BRANCH", "main")
+
+    # Fetch current build
+    current = api_get(f"/builds/{build_number}", token)
+    current_timings = extract_timings(current)
+    current_date = current.get("created_at", "")[:10]
+
+    if not current_timings:
+        print("No build+test timings found for current build")
+        sys.exit(0)
+
+    # Fetch historical builds for the same branch
+    history_data = api_get(
+        f"/builds?branch={branch}&state=passed&per_page={HISTORY_COUNT + 1}", token
+    )
+
+    # Exclude the current build from history
+    history_builds = [
+        b for b in history_data if str(b.get("number")) != str(build_number)
+    ][:HISTORY_COUNT]
+
+    # Collect historical timings per platform
+    history = {key: [] for key in PLATFORM_MAP.values()}
+    for build in history_builds:
+        full_build = api_get(f"/builds/{build['number']}", token)
+        timings = extract_timings(full_build)
+        for key, val in timings.items():
+            history[key].append(val)
+
+    # Build the summary table
+    platforms = ["linux_x86_64", "linux_aarch64", "macos_aarch64", "windows_x86_64"]
+    platform_labels = {
+        "linux_x86_64": "Linux x86_64",
+        "linux_aarch64": "Linux aarch64",
+        "macos_aarch64": "macOS aarch64",
+        "windows_x86_64": "Windows x86_64",
+    }
+
+    lines = []
+    lines.append(f"### Build Timing Analysis — {current_date} (build #{build_number})")
+    lines.append("")
+    lines.append("| Platform | Current (min) | Avg (min) | Std Dev | Delta | Status |")
+    lines.append("|----------|:------------:|:---------:|:-------:|:-----:|:------:|")
+
+    has_regression = False
+    for plat in platforms:
+        cur = current_timings.get(plat)
+        hist = history.get(plat, [])
+        avg, sd = mean_stddev(hist)
+
+        if cur is None:
+            lines.append(f"| {platform_labels[plat]} | — | {avg:.1f} | {sd:.1f} | — | — |")
+            continue
+
+        delta = cur - avg
+        delta_pct = (delta / avg * 100) if avg > 0 else 0
+        sign = "+" if delta >= 0 else ""
+
+        if avg > 0 and sd > 0 and cur > avg + 2 * sd:
+            status = ":rotating_light: Regression"
+            has_regression = True
+        elif avg > 0 and cur < avg - sd:
+            status = ":rocket: Faster"
+        else:
+            status = ":white_check_mark: Normal"
+
+        lines.append(
+            f"| {platform_labels[plat]} | **{cur:.1f}** | {avg:.1f} | {sd:.1f} "
+            f"| {sign}{delta:.1f} ({sign}{delta_pct:.0f}%) | {status} |"
+        )
+
+    n_hist = len(history_builds)
+    lines.append("")
+    lines.append(f"_Compared against {n_hist} recent `{branch}` builds._")
+
+    markdown = "\n".join(lines)
+    print(markdown)
+
+    style = "warning" if has_regression else "info"
+    annotate(markdown, style)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.buildkite/scripts/steps/run_es_tests.sh b/.buildkite/scripts/steps/run_es_tests.sh
@@ -24,6 +24,18 @@ export PR_AUTHOR=$(expr "$BUILDKITE_BRANCH" : '\(.*\):.*')
 export PR_SOURCE_BRANCH=$(expr "$BUILDKITE_BRANCH" : '.*:\(.*\)')
 export PR_TARGET_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH}
 
+# Set up GCS credentials for Gradle build cache persistence (if available).
+# The post-checkout hook writes the GCS service account key for sccache;
+# reuse the same credentials for the Gradle cache bucket.
+if [ -n "${SCCACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then
+    export GRADLE_BUILD_CACHE_GCS_BUCKET="${SCCACHE_GCS_BUCKET}"
+    # Install gsutil if not already present
+    if ! command -v gsutil &>/dev/null; then
+        echo "--- Installing gsutil"
+        pip3 install --quiet gsutil 2>/dev/null || pip install --quiet gsutil 2>/dev/null || echo "Warning: failed to install gsutil"
+    fi
+fi
+
 mkdir -p "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION"
 cp "build/distributions/ml-cpp-$VERSION-linux-$HARDWARE_ARCH.zip" "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION/ml-cpp-$VERSION.zip"
 # Since this is all local, for simplicity, cheat with the dependencies/no-dependencies split
diff --git a/dev-tools/gradle-build-cache-init.gradle b/dev-tools/gradle-build-cache-init.gradle
@@ -0,0 +1,17 @@
+/*
+ * Gradle init script to enable the local build cache for ES integration test
+ * builds.  Injected via --init-script so that we don't need to modify the
+ * cloned Elasticsearch repository.
+ *
+ * The local build cache stores task outputs keyed on their inputs.  When the
+ * cache directory is persisted between CI runs (e.g. via GCS), subsequent
+ * builds with the same ES commit get near-instant compilation.
+ */
+
+// Switch on the local build cache once the settings have been evaluated.
+settingsEvaluated { evaluatedSettings ->
+    evaluatedSettings.buildCache.local.enabled = true
+}
diff --git a/dev-tools/run_es_tests.sh b/dev-tools/run_es_tests.sh
@@ -24,6 +24,9 @@
 
 set -e
 
+# Resolve the ml-cpp repo root before we cd away.
+ML_CPP_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
 function isCloneTargetValid {
     FORK_TO_CHECK="$1"
     BRANCH_TO_CHECK="$2"
@@ -113,6 +116,57 @@ export GIT_COMMIT="$(git rev-parse HEAD)"
 export GIT_PREVIOUS_COMMIT="$GIT_COMMIT"
 
 IVY_REPO_URL="file://$2"
-./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS
-./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS
+
+# Extra Gradle arguments that turn on the build cache; only used when the
+# init script shipped with ml-cpp is actually present.
+GRADLE_CACHE_DIR="$HOME/.gradle/caches/build-cache-1"
+INIT_SCRIPT="$ML_CPP_ROOT/dev-tools/gradle-build-cache-init.gradle"
+if [ -f "$INIT_SCRIPT" ]; then
+    CACHE_ARGS="--build-cache --init-script $INIT_SCRIPT"
+else
+    CACHE_ARGS=""
+fi
+
+# Restore Gradle build cache from GCS if credentials are available.
+# This lets ephemeral CI agents reuse compilation outputs from prior builds.
+# The restore is best effort: a failed download or extraction only prints a
+# warning and the build carries on without the cache.
+CACHE_KEY="gradle-build-cache-$(uname -m)"
+GCS_CACHE_PATH=""
+if [ -n "${GRADLE_BUILD_CACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then
+    GCS_CACHE_PATH="gs://${GRADLE_BUILD_CACHE_GCS_BUCKET}/${CACHE_KEY}.tar.gz"
+    if command -v gsutil &>/dev/null; then
+        # The gcloud SDK gsutil needs explicit service account activation;
+        # GOOGLE_APPLICATION_CREDENTIALS alone is not sufficient.
+        if command -v gcloud &>/dev/null; then
+            gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || true
+        fi
+        echo "--- Restoring Gradle build cache from $GCS_CACHE_PATH"
+        mkdir -p "$GRADLE_CACHE_DIR"
+        if gsutil -q stat "$GCS_CACHE_PATH" 2>/dev/null; then
+            # Download to a unique temporary file instead of a fixed /tmp name
+            # (falling back to the fixed name if mktemp is unavailable), and
+            # remove it whether or not the restore succeeded.
+            RESTORE_ARCHIVE="$(mktemp "${TMPDIR:-/tmp}/gradle-cache.XXXXXX" 2>/dev/null)" || RESTORE_ARCHIVE="/tmp/gradle-cache.tar.gz"
+            if gsutil cp "$GCS_CACHE_PATH" "$RESTORE_ARCHIVE" \
+                    && tar xzf "$RESTORE_ARCHIVE" -C "$HOME/.gradle/caches/"; then
+                echo "Gradle build cache restored ($(du -sh "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1))"
+            else
+                echo "Warning: failed to restore Gradle build cache, continuing without it"
+            fi
+            rm -f "$RESTORE_ARCHIVE" || true
+        else
+            echo "No cached Gradle build cache found, will build from scratch"
+        fi
+    else
+        echo "gsutil not found, skipping Gradle build cache restore"
+    fi
+fi
+
+./gradlew $GRADLE_JVM_OPTS $CACHE_ARGS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS
+./gradlew $GRADLE_JVM_OPTS $CACHE_ARGS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS
+
+# Upload Gradle build cache to GCS for future builds.
+# Best effort: a failed archive or upload only prints a warning.  The upload
+# is skipped when the cache directory is empty or 4096 MB and larger.
+if [ -n "$GCS_CACHE_PATH" ] && [ -d "$GRADLE_CACHE_DIR" ] && command -v gsutil &>/dev/null; then
+    echo "--- Uploading Gradle build cache to $GCS_CACHE_PATH"
+    CACHE_SIZE=$(du -sm "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1)
+    if [ "${CACHE_SIZE:-0}" -gt 0 ] && [ "${CACHE_SIZE:-0}" -lt 4096 ]; then
+        # Archive into a unique temporary file instead of a fixed /tmp name
+        # (falling back to the fixed name if mktemp is unavailable), and
+        # remove it whether or not the upload succeeded.
+        UPLOAD_ARCHIVE="$(mktemp "${TMPDIR:-/tmp}/gradle-cache.XXXXXX" 2>/dev/null)" || UPLOAD_ARCHIVE="/tmp/gradle-cache.tar.gz"
+        if tar czf "$UPLOAD_ARCHIVE" -C "$HOME/.gradle/caches/" build-cache-1 \
+                && gsutil -o "GSUtil:parallel_composite_upload_threshold=50M" cp "$UPLOAD_ARCHIVE" "$GCS_CACHE_PATH"; then
+            echo "Gradle build cache uploaded (${CACHE_SIZE}M)"
+        else
+            echo "Warning: failed to upload Gradle build cache"
+        fi
+        rm -f "$UPLOAD_ARCHIVE" || true
+    else
+        echo "Skipping cache upload (size=${CACHE_SIZE:-0}M, expected 1-4095M)"
+    fi
+fi