Skip to content

Commit a4b2634

Browse files
authored
[ML] Replace shell test runner with portable CMake/CTest infrastructure (#2900)
Replace run_tests_as_seperate_processes.sh with cmake/run-tests-individually.cmake for portable cross-platform parallel test execution. Enable CTest integration and rename the 'test' target to 'ml_test' to avoid conflicts.
1 parent 4f1ec3e commit a4b2634

File tree

7 files changed

+318
-13
lines changed

7 files changed

+318
-13
lines changed

.buildkite/branch.json.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ def main():
4040
build_linux = pipeline_steps.generate_step_template("Linux", "build", config.build_aarch64, config.build_x86_64)
4141
pipeline_steps.append(build_linux)
4242

43+
# Analyse build timings after all build+test steps complete
44+
pipeline_steps.append(pipeline_steps.generate_step("Analyse build timings",
45+
".buildkite/pipelines/analyze_build_timings.yml.sh"))
46+
4347
# Build the DRA artifacts and upload to S3 and GCS
4448
pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",
4549
".buildkite/pipelines/create_dra.yml.sh"))

.buildkite/hooks/post-checkout

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,27 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
2727
export BUILDKITE_ANALYTICS_TOKEN=$(vault read secret/ci/elastic-ml-cpp/buildkite/test_analytics/windows_x86_64 | awk '/^token/ {print $2;}')
2828
fi
2929

30+
if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_timings" ]]; then
31+
export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
32+
fi
33+
34+
# GCS service account — inject credentials for build and Java IT steps.
35+
# Build steps use it for sccache; Java IT steps use it for the Gradle
36+
# build cache. The key is stored in Vault.
37+
if [[ "$BUILDKITE_STEP_KEY" == build_test_* || "$BUILDKITE_STEP_KEY" == java_integration_tests_* ]]; then
38+
SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "")
39+
if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then
40+
export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache"
41+
export SCCACHE_GCS_KEY_FILE=$(mktemp)
42+
echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE"
43+
export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE"
44+
export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE"
45+
fi
46+
fi
47+
3048
if [[ "$BUILDKITE_STEP_KEY" == "build_pytorch_docker_image" ]]; then
3149
export DOCKER_REGISTRY_USERNAME=$(vault read --field=username secret/ci/elastic-ml-cpp/prod_docker_registry_credentials)
3250
export DOCKER_REGISTRY_PASSWORD=$(vault read --field=password secret/ci/elastic-ml-cpp/prod_docker_registry_credentials)
3351
fi
3452

35-
# Retrieve GCS service account key for sccache (compiler caching).
36-
if [[ "$BUILDKITE_STEP_KEY" == build_test_* ]]; then
37-
SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "")
38-
if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then
39-
export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache"
40-
export SCCACHE_GCS_KEY_FILE=$(mktemp)
41-
echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE"
42-
export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE"
43-
export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE"
44-
fi
45-
fi
4653
fi
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
# or more contributor license agreements. Licensed under the Elastic License
4+
# 2.0 and the following additional limitation. Functionality enabled by the
5+
# files subject to the Elastic License 2.0 may only be used in production when
6+
# invoked by an Elasticsearch process with a license key installed that permits
7+
# use of machine learning features. You may not use this file except in
8+
# compliance with the Elastic License 2.0 and the foregoing additional
9+
# limitation.
10+
11+
# Write the pipeline step definition to stdout. The heredoc delimiter is
# quoted because the YAML below contains nothing for the shell to expand.
cat <<'EOL'
steps:
  - label: "Analyse build timings :chart_with_upwards_trend:"
    key: "analyze_build_timings"
    command:
      - "python3 .buildkite/scripts/steps/analyze_build_timings.py"
    depends_on:
      - "test_linux-aarch64-RelWithDebInfo"
      - "test_linux-x86_64-RelWithDebInfo"
      - "test_macos-aarch64-RelWithDebInfo"
      - "test_Windows-x86_64-RelWithDebInfo"
    allow_dependency_failure: true
    soft_fail: true
    agents:
      image: "python:3-slim"
EOL
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
4+
# or more contributor license agreements. Licensed under the Elastic License
5+
# 2.0 and the following additional limitation. Functionality enabled by the
6+
# files subject to the Elastic License 2.0 may only be used in production when
7+
# invoked by an Elasticsearch process with a license key installed that permits
8+
# use of machine learning features. You may not use this file except in
9+
# compliance with the Elastic License 2.0 and the foregoing additional
10+
# limitation.
11+
12+
"""
13+
Analyse build+test timings for the current snapshot build and compare
14+
against recent history. Produces a Buildkite annotation with a summary
15+
table and flags any regressions.
16+
"""
17+
18+
import json
19+
import math
20+
import os
21+
import subprocess
22+
import sys
23+
import urllib.request
24+
import urllib.error
25+
26+
PIPELINE_SLUG = "ml-cpp-snapshot-builds"
27+
ORG_SLUG = "elastic"
28+
API_BASE = f"https://api.buildkite.com/v2/organizations/{ORG_SLUG}/pipelines/{PIPELINE_SLUG}"
29+
HISTORY_COUNT = 14
30+
31+
PLATFORM_MAP = {
32+
"Windows": "windows_x86_64",
33+
"MacOS": "macos_aarch64",
34+
"linux-x86_64": "linux_x86_64",
35+
"linux-aarch64": "linux_aarch64",
36+
}
37+
38+
39+
def api_get(path, token):
    """Fetch ``API_BASE + path`` and return the decoded JSON response.

    Args:
        path: Path, including any query string, appended verbatim to
            ``API_BASE``.
        token: Access token sent in a ``Bearer`` authorization header.

    Returns:
        The JSON document returned by the server, decoded with ``json.loads``.

    The process exits with status 1 when the request fails, whether the
    server answered with an HTTP error status or could not be reached.
    """
    url = f"{API_BASE}{path}"
    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # Must precede the OSError handler: HTTPError is an OSError subclass.
        body = e.read().decode(errors="replace")
        print(f"API error {e.code} for {url}: {body}", file=sys.stderr)
        sys.exit(1)
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses with no HTTP
        # status; report them the same way instead of dumping a traceback.
        print(f"API request failed for {url}: {e}", file=sys.stderr)
        sys.exit(1)
48+
49+
50+
def extract_timings(build_data, platform_map=None):
    """Extract per-platform build+test timings from a build's jobs.

    A job is considered when its name contains ``"Build & test"``, does not
    contain ``"debug"`` (case-insensitive), contains one of the patterns in
    the platform map, and has both a start and a finish timestamp.

    Args:
        build_data: Mapping with an optional ``"jobs"`` list of job mappings
            carrying ``"name"``, ``"started_at"`` and ``"finished_at"``.
        platform_map: Optional mapping of job-name substring to platform key.
            Defaults to the module-level ``PLATFORM_MAP``.

    Returns:
        Dict mapping platform key to the job duration in minutes, rounded to
        one decimal place. If several jobs map to the same key, the last one
        in the list wins.
    """
    if platform_map is None:
        platform_map = PLATFORM_MAP

    timings = {}
    for job in build_data.get("jobs", []):
        name = job.get("name") or ""
        if "Build & test" not in name or "debug" in name.lower():
            continue

        # The first pattern found in the job name decides the platform.
        key = next(
            (plat for pattern, plat in platform_map.items() if pattern in name),
            None,
        )
        if key is None:
            continue

        t_start = _parse_timestamp(job.get("started_at"))
        t_end = _parse_timestamp(job.get("finished_at"))
        if t_start is None or t_end is None:
            # Job has not run to completion, or its timestamps are unusable.
            continue

        mins = (t_end - t_start).total_seconds() / 60.0
        timings[key] = round(mins, 1)
    return timings


def _parse_timestamp(value):
    """Parse a ``...Z`` timestamp into an aware UTC datetime, or return None.

    Accepts the form with fractional seconds and the form without. A value
    that is empty or matches neither form yields ``None`` so that one odd job
    cannot abort the whole analysis.
    """
    from datetime import datetime, timezone

    if not value:
        return None
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            return datetime.strptime(value, fmt).replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            continue
    print(f"Ignoring unparsable timestamp {value!r}", file=sys.stderr)
    return None
74+
75+
76+
def mean_stddev(values):
    """Return ``(mean, sample standard deviation)`` for *values*.

    An empty input yields ``(0.0, 0.0)``. A single value yields its mean and
    a standard deviation of ``0.0``. Otherwise the deviation uses the
    ``n - 1`` denominator.
    """
    if not values:
        return 0.0, 0.0

    count = len(values)
    mean = sum(values) / count
    if count == 1:
        return mean, 0.0

    squared_deviations = ((value - mean) ** 2 for value in values)
    return mean, math.sqrt(sum(squared_deviations) / (count - 1))
85+
86+
87+
def annotate(markdown, style="info"):
    """Create a Buildkite annotation from *markdown*.

    Runs ``buildkite-agent annotate`` with the given ``--style`` and the
    ``build-timings`` context, feeding the markdown on stdin.

    Args:
        markdown: Annotation body as a string.
        style: Value passed to ``--style``.

    Annotation is best-effort: a command that cannot be launched or exits
    non-zero is reported on stderr and otherwise ignored.
    """
    cmd = ["buildkite-agent", "annotate", "--style", style, "--context", "build-timings"]
    try:
        proc = subprocess.run(cmd, input=markdown.encode(), capture_output=True)
    except OSError as e:
        # Covers a missing or non-executable buildkite-agent binary; handle it
        # like a non-zero exit rather than crashing the analysis.
        print(f"buildkite-agent annotate failed: {e}", file=sys.stderr)
        return
    if proc.returncode != 0:
        stderr_text = proc.stderr.decode(errors="replace")
        print(f"buildkite-agent annotate failed: {stderr_text}", file=sys.stderr)
93+
94+
95+
def main():
    """Compare the current build's build+test timings against recent history.

    Reads ``BUILDKITE_API_READ_TOKEN``, ``BUILDKITE_BUILD_NUMBER`` and
    ``BUILDKITE_BRANCH`` (default ``"main"``) from the environment, fetches
    the current build and up to ``HISTORY_COUNT`` passed builds of the same
    branch, and renders a markdown table that is printed and posted as an
    annotation.

    A platform is flagged as a regression when its current time exceeds the
    historical mean by more than two standard deviations, and as faster when
    it is more than one standard deviation below the mean. The annotation
    style is ``"warning"`` if any platform regressed, otherwise ``"info"``.

    Exits with status 0 without analysing when the token or build number is
    missing, or when the current build has no usable timings.
    """
    # Local import so this function does not rely on a module-level import
    # of urllib.parse.
    from urllib.parse import urlencode

    token = os.environ.get("BUILDKITE_API_READ_TOKEN", "")
    if not token:
        print("BUILDKITE_API_READ_TOKEN not set, skipping timing analysis", file=sys.stderr)
        sys.exit(0)

    build_number = os.environ.get("BUILDKITE_BUILD_NUMBER", "")
    if not build_number:
        # An empty number would turn the request below into "/builds/",
        # which does not address a single build.
        print("BUILDKITE_BUILD_NUMBER not set, skipping timing analysis", file=sys.stderr)
        sys.exit(0)
    branch = os.environ.get("BUILDKITE_BRANCH", "main")

    # Fetch current build
    current = api_get(f"/builds/{build_number}", token)
    current_timings = extract_timings(current)
    current_date = (current.get("created_at") or "")[:10]

    if not current_timings:
        print("No build+test timings found for current build")
        sys.exit(0)

    # Fetch historical builds for the same branch. The branch name is
    # URL-encoded so characters such as "&", "#" or "+" cannot corrupt the
    # query string. One extra build is requested because the current build
    # may be part of the result.
    query = urlencode(
        {"branch": branch, "state": "passed", "per_page": HISTORY_COUNT + 1}
    )
    history_data = api_get(f"/builds?{query}", token)

    # Exclude the current build from history
    history_builds = [
        b for b in history_data if str(b.get("number")) != str(build_number)
    ][:HISTORY_COUNT]

    # Collect historical timings per platform
    history = {key: [] for key in PLATFORM_MAP.values()}
    for build in history_builds:
        full_build = api_get(f"/builds/{build['number']}", token)
        for key, val in extract_timings(full_build).items():
            history[key].append(val)

    # Build the summary table
    platforms = ["linux_x86_64", "linux_aarch64", "macos_aarch64", "windows_x86_64"]
    platform_labels = {
        "linux_x86_64": "Linux x86_64",
        "linux_aarch64": "Linux aarch64",
        "macos_aarch64": "macOS aarch64",
        "windows_x86_64": "Windows x86_64",
    }

    lines = [
        f"### Build Timing Analysis — {current_date} (build #{build_number})",
        "",
        "| Platform | Current (min) | Avg (min) | Std Dev | Delta | Status |",
        "|----------|:------------:|:---------:|:-------:|:-----:|:------:|",
    ]

    has_regression = False
    for plat in platforms:
        label = platform_labels[plat]
        cur = current_timings.get(plat)
        hist = history.get(plat, [])
        avg, sd = mean_stddev(hist)

        # With no historical samples there is no baseline, so show dashes
        # instead of a misleading 0.0 average and a full-duration delta.
        if hist:
            avg_cell, sd_cell = f"{avg:.1f}", f"{sd:.1f}"
        else:
            avg_cell, sd_cell = "—", "—"

        if cur is None:
            lines.append(f"| {label} | — | {avg_cell} | {sd_cell} | — | — |")
            continue

        if not hist:
            lines.append(f"| {label} | **{cur:.1f}** | — | — | — | No history |")
            continue

        delta = cur - avg
        delta_pct = (delta / avg * 100) if avg > 0 else 0
        sign = "+" if delta >= 0 else ""

        if avg > 0 and sd > 0 and cur > avg + 2 * sd:
            status = ":rotating_light: Regression"
            has_regression = True
        elif avg > 0 and cur < avg - sd:
            status = ":rocket: Faster"
        else:
            status = ":white_check_mark: Normal"

        lines.append(
            f"| {label} | **{cur:.1f}** | {avg_cell} | {sd_cell} "
            f"| {sign}{delta:.1f} ({sign}{delta_pct:.0f}%) | {status} |"
        )

    n_hist = len(history_builds)
    lines.append("")
    lines.append(f"_Compared against {n_hist} recent `{branch}` builds._")

    markdown = "\n".join(lines)
    print(markdown)

    style = "warning" if has_regression else "info"
    annotate(markdown, style)
182+
183+
184+
if __name__ == "__main__":
185+
main()

.buildkite/scripts/steps/run_es_tests.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@ export PR_AUTHOR=$(expr "$BUILDKITE_BRANCH" : '\(.*\):.*')
2424
export PR_SOURCE_BRANCH=$(expr "$BUILDKITE_BRANCH" : '.*:\(.*\)')
2525
export PR_TARGET_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH}
2626

27+
# Set up GCS credentials for Gradle build cache persistence (if available).
28+
# The post-checkout hook writes the GCS service account key for sccache;
29+
# reuse the same credentials for the Gradle cache bucket.
30+
if [ -n "${SCCACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then
31+
export GRADLE_BUILD_CACHE_GCS_BUCKET="${SCCACHE_GCS_BUCKET}"
32+
# Install gsutil if not already present
33+
if ! command -v gsutil &>/dev/null; then
34+
echo "--- Installing gsutil"
35+
pip3 install --quiet gsutil 2>/dev/null || pip install --quiet gsutil 2>/dev/null || echo "Warning: failed to install gsutil"
36+
fi
37+
fi
38+
2739
mkdir -p "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION"
2840
cp "build/distributions/ml-cpp-$VERSION-linux-$HARDWARE_ARCH.zip" "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION/ml-cpp-$VERSION.zip"
2941
# Since this is all local, for simplicity, cheat with the dependencies/no-dependencies split
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Gradle init script to enable the local build cache for ES integration test
3+
* builds. Injected via --init-script so that we don't need to modify the
4+
* cloned Elasticsearch repository.
5+
*
6+
* The local build cache stores task outputs keyed on their inputs. When the
7+
* cache directory is persisted between CI runs (e.g. via GCS), subsequent
8+
* builds with the same ES commit get near-instant compilation.
9+
*/
10+
11+
// Registered on the Gradle instance that runs this init script; the closure
// receives the settings object and switches its local build cache on.
gradle.settingsEvaluated { evaluatedSettings ->
    evaluatedSettings.buildCache {
        local {
            enabled = true
        }
    }
}

dev-tools/run_es_tests.sh

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424

2525
set -e
2626

27+
# Resolve the ml-cpp repo root before we cd away.
28+
ML_CPP_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
29+
2730
function isCloneTargetValid {
2831
FORK_TO_CHECK="$1"
2932
BRANCH_TO_CHECK="$2"
@@ -113,6 +116,57 @@ export GIT_COMMIT="$(git rev-parse HEAD)"
113116
export GIT_PREVIOUS_COMMIT="$GIT_COMMIT"
114117

115118
IVY_REPO_URL="file://$2"
116-
./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS
117-
./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS
119+
120+
# Gradle init script shipped in this repository; when present it is passed to
# both Gradle invocations below together with --build-cache.
INIT_SCRIPT="$ML_CPP_ROOT/dev-tools/gradle-build-cache-init.gradle"
GRADLE_CACHE_DIR="$HOME/.gradle/caches/build-cache-1"
# An array keeps the init-script path a single argument even when the
# repository lives under a path that contains whitespace.
CACHE_ARGS=()
if [ -f "$INIT_SCRIPT" ]; then
    CACHE_ARGS=(--build-cache --init-script "$INIT_SCRIPT")
fi

# Restore Gradle build cache from GCS if credentials are available.
# This lets ephemeral CI agents reuse compilation outputs from prior builds.
CACHE_KEY="gradle-build-cache-$(uname -m)"
GCS_CACHE_PATH=""
if [ -n "${GRADLE_BUILD_CACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then
    GCS_CACHE_PATH="gs://${GRADLE_BUILD_CACHE_GCS_BUCKET}/${CACHE_KEY}.tar.gz"
    if command -v gsutil &>/dev/null; then
        # The gcloud SDK gsutil needs explicit service account activation;
        # GOOGLE_APPLICATION_CREDENTIALS alone is not sufficient.
        if command -v gcloud &>/dev/null; then
            gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || true
        fi
        echo "--- Restoring Gradle build cache from $GCS_CACHE_PATH"
        mkdir -p "$GRADLE_CACHE_DIR"
        if gsutil -q stat "$GCS_CACHE_PATH" 2>/dev/null; then
            gsutil cp "$GCS_CACHE_PATH" /tmp/gradle-cache.tar.gz \
                && tar xzf /tmp/gradle-cache.tar.gz -C "$HOME/.gradle/caches/" \
                && echo "Gradle build cache restored ($(du -sh "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1))" \
                || echo "Warning: failed to restore Gradle build cache, continuing without it"
            # Remove the download whether or not the restore succeeded.
            rm -f /tmp/gradle-cache.tar.gz
        else
            echo "No cached Gradle build cache found, will build from scratch"
        fi
    else
        echo "gsutil not found, skipping Gradle build cache restore"
    fi
fi

# ${CACHE_ARGS[@]+...} expands to nothing for an empty array, which also keeps
# older bash versions happy should nounset be enabled.
./gradlew $GRADLE_JVM_OPTS ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS
./gradlew $GRADLE_JVM_OPTS ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS

# Upload Gradle build cache to GCS for future builds.
if [ -n "$GCS_CACHE_PATH" ] && [ -d "$GRADLE_CACHE_DIR" ] && command -v gsutil &>/dev/null; then
    echo "--- Uploading Gradle build cache to $GCS_CACHE_PATH"
    CACHE_SIZE=$(du -sm "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1)
    # Only upload a non-empty cache smaller than 4096 MB.
    if [ "${CACHE_SIZE:-0}" -gt 0 ] && [ "${CACHE_SIZE:-0}" -lt 4096 ]; then
        tar czf /tmp/gradle-cache.tar.gz -C "$HOME/.gradle/caches/" build-cache-1 \
            && gsutil -o "GSUtil:parallel_composite_upload_threshold=50M" cp /tmp/gradle-cache.tar.gz "$GCS_CACHE_PATH" \
            && echo "Gradle build cache uploaded (${CACHE_SIZE}M)" \
            || echo "Warning: failed to upload Gradle build cache"
        # Remove the archive whether or not the upload succeeded.
        rm -f /tmp/gradle-cache.tar.gz
    else
        echo "Skipping cache upload (size=${CACHE_SIZE:-0}M, expected 1-4095M)"
    fi
fi
118172

0 commit comments

Comments
 (0)