Skip to content

Commit 1a85f26

Browse files
authored
test: add OTEL-to-ClickHouse ingestion assertions to integration smoke test (#200)
## Summary - extend `scripts/smoke-test.sh` with ClickHouse query helpers, port-forward lifecycle cleanup, and ingestion polling - generate OTLP traces/logs with `telemetrygen` and send them to the in-cluster collector (`4318`) - assert `default.otel_traces` and `default.otel_logs` row counts increase after telemetry is sent - run smoke tests with `TIMEOUT=300` in the integration workflow so failures surface sooner - set `hyperdx.config.OPAMP_SERVER_URL: ""` in CI test values so the collector runs in standalone mode with built-in ClickHouse export pipelines ## Test - [x] `bash -n scripts/smoke-test.sh` - [x] `Helm Chart Integration Test` passed on this PR (`test-helm-chart`) - [x] `helm-unittest` passed on this PR ## New Test Assertions - fetch `CLICKHOUSE_APP_PASSWORD` from `clickstack-secret` and query ClickHouse over `8123` as `app` - capture baseline counts from `default.otel_traces` and `default.otel_logs` - send synthetic OTLP traces/logs to the collector - poll until each table count is greater than baseline; fail on timeout/query/auth errors ## Why This Would Catch `6f29e730856cb5bcc30138dd168794bfdb17441d` That commit fixed a grant-shape issue where split `app` grants could result in missing `SELECT` privileges (`SELECT ON default.*` / `SELECT ON system.*`) because only the first grant was effectively applied. The new integration assertions actively query `default.otel_traces` and `default.otel_logs` as the `app` user before and after telemetry ingestion. If the old broken grant shape were reintroduced, those `SELECT` checks would fail (or remain unreadable), causing the smoke test and PR checks to fail automatically. Made with [Cursor](https://cursor.com)
1 parent 00d69b9 commit 1a85f26

File tree

5 files changed

+289
-11
lines changed

5 files changed

+289
-11
lines changed

.github/workflows/chart-test.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,12 @@ jobs:
8686
- name: Deploy ClickStack chart (Phase 2)
8787
run: |
8888
# Create test values for faster deployment
89-
cat > test-values.yaml << EOF
89+
cat > test-values.yaml << 'EOF'
9090
hyperdx:
9191
secrets:
9292
HYPERDX_API_KEY: "test-api-key-for-ci"
93+
config:
94+
OPAMP_SERVER_URL: ""
9395
deployment:
9496
replicas: 1
9597
service:
@@ -139,11 +141,16 @@ jobs:
139141
140142
kubectl get services
141143
144+
- name: Set up Node.js
145+
uses: actions/setup-node@v4
146+
with:
147+
node-version: '24'
148+
142149
- name: Run comprehensive smoke tests
143150
run: |
144151
chmod +x ./scripts/smoke-test.sh
145152
146-
RELEASE_NAME=hyperdx-test NAMESPACE=default ./scripts/smoke-test.sh
153+
TIMEOUT=300 RELEASE_NAME=hyperdx-test NAMESPACE=default ./scripts/smoke-test.sh
147154
148155
- name: Collect logs on failure
149156
if: failure()

scripts/e2e/package.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"private": true,
3+
"devDependencies": {
4+
"@playwright/test": "^1.58.2"
5+
}
6+
}

scripts/e2e/playwright.config.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import { defineConfig, devices } from '@playwright/test';
2+
3+
export default defineConfig({
4+
testDir: '.',
5+
timeout: 120_000,
6+
retries: 1,
7+
use: {
8+
baseURL: 'http://localhost:3000',
9+
trace: 'on-first-retry',
10+
screenshot: 'only-on-failure',
11+
},
12+
projects: [
13+
{ name: 'chromium', use: { ...devices['Desktop Chrome'] } },
14+
],
15+
});

scripts/e2e/search-log.spec.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import { test, expect } from '@playwright/test';
2+
3+
const TEST_EMAIL = 'smoke@test.local';
4+
const TEST_PASSWORD = 'SmokeTest1234!';
5+
const SEARCH_TERM = 'clickstack smoke test log';
6+
7+
test('register user and verify log appears on search page', async ({ page }) => {
8+
await page.goto('/register');
9+
10+
await page.getByRole('textbox', { name: /email/i }).fill(TEST_EMAIL);
11+
await page.locator('input[name="password"]').fill(TEST_PASSWORD);
12+
await page.locator('input[name="confirmPassword"]').fill(TEST_PASSWORD);
13+
await page.getByRole('button', { name: 'Create' }).click();
14+
15+
await page.waitForURL('**/search**', { timeout: 60_000 });
16+
17+
const searchInput = page.getByTestId('search-input');
18+
await expect(searchInput).toBeVisible({ timeout: 30_000 });
19+
await searchInput.fill(SEARCH_TERM);
20+
await page.getByTestId('search-submit-button').click();
21+
await page.waitForLoadState('networkidle');
22+
23+
const resultsTable = page.getByTestId('search-results-table');
24+
await expect(resultsTable).toBeVisible({ timeout: 30_000 });
25+
await expect(resultsTable).toContainText(SEARCH_TERM, { timeout: 30_000 });
26+
});

scripts/smoke-test.sh

Lines changed: 233 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,49 @@
11
#!/bin/bash
22
set -e
3+
set -o pipefail
34

45
# Test script for HyperDX deployment
56
NAMESPACE=${NAMESPACE:-default}
67
RELEASE_NAME=${RELEASE_NAME:-hyperdx-test}
78
CHART_NAME=${CHART_NAME:-clickstack}
89
TIMEOUT=${TIMEOUT:-300}
10+
CLICKHOUSE_SERVICE=${CLICKHOUSE_SERVICE:-$RELEASE_NAME-$CHART_NAME-clickhouse-clickhouse-headless}
11+
CLICKHOUSE_SECRET_NAME=${CLICKHOUSE_SECRET_NAME:-clickstack-secret}
12+
CLICKHOUSE_HTTP_USER=${CLICKHOUSE_HTTP_USER:-app}
13+
CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE:-default}
14+
CLICKHOUSE_TRACE_TABLE=${CLICKHOUSE_TRACE_TABLE:-otel_traces}
15+
CLICKHOUSE_LOG_TABLE=${CLICKHOUSE_LOG_TABLE:-otel_logs}
16+
INGESTION_POLL_INTERVAL=${INGESTION_POLL_INTERVAL:-5}
17+
OTEL_TELEMETRYGEN_IMAGE=${OTEL_TELEMETRYGEN_IMAGE:-ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:latest}
18+
OTEL_SIGNAL_COUNT=${OTEL_SIGNAL_COUNT:-20}
19+
20+
PORT_FORWARD_PIDS=()
21+
PORT_FORWARD_LOGS=()
22+
CLICKHOUSE_HTTP_PASSWORD=""
923

1024
echo "Starting HyperDX tests..."
1125
echo "Release: $RELEASE_NAME"
1226
echo "Chart: $CHART_NAME"
1327
echo "Namespace: $NAMESPACE"
1428

29+
cleanup_port_forwards() {
30+
local pid=""
31+
local log_file=""
32+
33+
for pid in "${PORT_FORWARD_PIDS[@]}"; do
34+
if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
35+
kill "$pid" 2>/dev/null || true
36+
wait "$pid" 2>/dev/null || true
37+
fi
38+
done
39+
40+
for log_file in "${PORT_FORWARD_LOGS[@]}"; do
41+
rm -f "$log_file" 2>/dev/null || true
42+
done
43+
}
44+
45+
trap cleanup_port_forwards EXIT
46+
1547
wait_for_service() {
1648
local url=$1
1749
local name=$2
@@ -39,6 +71,7 @@ check_endpoint() {
3971
local url=$1
4072
local expected_code=$2
4173
local desc=$3
74+
local code=""
4275

4376
echo "Checking $desc..."
4477

@@ -53,6 +86,148 @@ check_endpoint() {
5386
fi
5487
}
5588

89+
start_port_forward() {
90+
local resource=$1
91+
local local_port=$2
92+
local remote_port=$3
93+
local name=$4
94+
local log_file=""
95+
local pid=""
96+
97+
log_file=$(mktemp "/tmp/${name}.XXXXXX.log")
98+
echo "Starting port-forward for $name (${resource} ${local_port}:${remote_port})..." >&2
99+
kubectl port-forward "$resource" "${local_port}:${remote_port}" -n "$NAMESPACE" >"$log_file" 2>&1 &
100+
pid=$!
101+
102+
PORT_FORWARD_PIDS+=("$pid")
103+
PORT_FORWARD_LOGS+=("$log_file")
104+
105+
sleep 3
106+
if ! kill -0 "$pid" 2>/dev/null; then
107+
echo "ERROR: Failed to start port-forward for $name" >&2
108+
sed -n '1,120p' "$log_file" >&2 || true
109+
return 1
110+
fi
111+
112+
echo "$pid"
113+
}
114+
115+
stop_port_forward() {
116+
local pid=$1
117+
118+
if [ -n "${pid:-}" ] && kill -0 "$pid" 2>/dev/null; then
119+
kill "$pid" 2>/dev/null || true
120+
wait "$pid" 2>/dev/null || true
121+
fi
122+
}
123+
124+
get_secret_value() {
125+
local secret_name=$1
126+
local key_name=$2
127+
128+
kubectl get secret "$secret_name" -n "$NAMESPACE" -o "jsonpath={.data.${key_name}}" | base64 --decode
129+
}
130+
131+
run_clickhouse_query() {
132+
local sql=$1
133+
134+
curl -sS --fail \
135+
-u "${CLICKHOUSE_HTTP_USER}:${CLICKHOUSE_HTTP_PASSWORD}" \
136+
--data-binary "$sql" \
137+
"http://localhost:8123/?database=${CLICKHOUSE_DATABASE}"
138+
}
139+
140+
get_table_count() {
141+
local table=$1
142+
local count=""
143+
144+
count=$(run_clickhouse_query "SELECT count() FROM \`${CLICKHOUSE_DATABASE}\`.\`${table}\`;")
145+
count=$(echo "$count" | tr -d '[:space:]')
146+
147+
if [[ ! "$count" =~ ^[0-9]+$ ]]; then
148+
echo "ERROR: Non-numeric count for table ${table}: ${count}"
149+
return 1
150+
fi
151+
152+
echo "$count"
153+
}
154+
155+
wait_for_table_queryable() {
156+
local table=$1
157+
local timeout_seconds=$2
158+
local start_time=0
159+
local now=0
160+
local count=""
161+
162+
start_time=$(date +%s)
163+
while true; do
164+
count=$(get_table_count "$table" 2>/dev/null || true)
165+
if [[ "$count" =~ ^[0-9]+$ ]]; then
166+
echo "$count"
167+
return 0
168+
fi
169+
170+
now=$(date +%s)
171+
if [ $((now - start_time)) -ge "$timeout_seconds" ]; then
172+
echo "ERROR: Timed out waiting for table ${CLICKHOUSE_DATABASE}.${table} to become queryable"
173+
return 1
174+
fi
175+
176+
sleep "$INGESTION_POLL_INTERVAL"
177+
done
178+
}
179+
180+
wait_for_table_count_increase() {
181+
local table=$1
182+
local baseline_count=$2
183+
local timeout_seconds=$3
184+
local start_time=0
185+
local now=0
186+
local current_count=""
187+
188+
start_time=$(date +%s)
189+
while true; do
190+
current_count=$(get_table_count "$table" 2>/dev/null || true)
191+
if [[ "$current_count" =~ ^[0-9]+$ ]]; then
192+
echo "Current count for ${CLICKHOUSE_DATABASE}.${table}: ${current_count} (baseline ${baseline_count})"
193+
if [ "$current_count" -gt "$baseline_count" ]; then
194+
echo "Detected new rows in ${CLICKHOUSE_DATABASE}.${table}"
195+
return 0
196+
fi
197+
fi
198+
199+
now=$(date +%s)
200+
if [ $((now - start_time)) -ge "$timeout_seconds" ]; then
201+
echo "ERROR: Timed out waiting for row increase in ${CLICKHOUSE_DATABASE}.${table}"
202+
return 1
203+
fi
204+
205+
sleep "$INGESTION_POLL_INTERVAL"
206+
done
207+
}
208+
209+
send_telemetrygen_signal() {
210+
local signal=$1
211+
local count_flag=$2
212+
local count=$3
213+
local run_id=$4
214+
local body_arg=()
215+
216+
if [ "$signal" = "logs" ]; then
217+
body_arg=(--body "clickstack smoke test log ${run_id}")
218+
fi
219+
220+
echo "Sending ${signal} to OTEL collector over OTLP HTTP..."
221+
docker run --rm --network host "$OTEL_TELEMETRYGEN_IMAGE" "$signal" \
222+
--otlp-http \
223+
--otlp-endpoint "localhost:4318" \
224+
--otlp-insecure \
225+
"$count_flag" "$count" \
226+
--rate 5 \
227+
--service "clickstack-smoke-test" \
228+
"${body_arg[@]}"
229+
}
230+
56231
# Check pods
57232
echo "Checking pod status..."
58233
kubectl wait --for=condition=Ready pods -l app.kubernetes.io/instance=$RELEASE_NAME --timeout=${TIMEOUT}s -n $NAMESPACE
@@ -62,26 +237,24 @@ kubectl get pods -l app.kubernetes.io/instance=$RELEASE_NAME -n $NAMESPACE
62237

63238
# Test UI
64239
echo "Testing HyperDX UI..."
65-
kubectl port-forward service/$RELEASE_NAME-$CHART_NAME-app 3000:3000 -n $NAMESPACE &
66-
pf_pid=$!
67-
sleep 10
240+
pf_pid=$(start_port_forward "service/$RELEASE_NAME-$CHART_NAME-app" "3000" "3000" "hyperdx-ui")
241+
sleep 2
68242

69243
wait_for_service "http://localhost:3000" "HyperDX UI"
70244
check_endpoint "http://localhost:3000" "200" "UI"
71245

72-
kill $pf_pid 2>/dev/null || true
246+
stop_port_forward "$pf_pid"
73247
sleep 2
74248

75249
# Test OTEL collector metrics endpoint
76250
echo "Testing OTEL collector metrics endpoint..."
77-
kubectl port-forward service/$RELEASE_NAME-otel-collector 8888:8888 -n $NAMESPACE &
78-
metrics_pf_pid=$!
79-
sleep 10
251+
metrics_pf_pid=$(start_port_forward "service/$RELEASE_NAME-otel-collector" "8888" "8888" "otel-metrics")
252+
sleep 2
80253

81254
wait_for_service "http://localhost:8888/metrics" "OTEL Metrics"
82255
check_endpoint "http://localhost:8888/metrics" "200" "OTEL Metrics endpoint"
83256

84-
kill $metrics_pf_pid 2>/dev/null || true
257+
stop_port_forward "$metrics_pf_pid"
85258
sleep 2
86259

87260
# Verify OTEL Collector Deployment is Available
@@ -105,11 +278,62 @@ else
105278
exit 1
106279
fi
107280

281+
# Verify OTEL data ingestion to ClickHouse
282+
echo "Verifying OTEL ingestion into ClickHouse..."
283+
otlp_http_pf_pid=$(start_port_forward "service/$RELEASE_NAME-otel-collector" "4318" "4318" "otel-http")
284+
clickhouse_pf_pid=$(start_port_forward "service/$CLICKHOUSE_SERVICE" "8123" "8123" "clickhouse-http")
285+
286+
CLICKHOUSE_HTTP_PASSWORD=$(get_secret_value "$CLICKHOUSE_SECRET_NAME" "CLICKHOUSE_APP_PASSWORD")
287+
if [ -z "${CLICKHOUSE_HTTP_PASSWORD:-}" ]; then
288+
echo "ERROR: Could not read CLICKHOUSE_APP_PASSWORD from secret ${CLICKHOUSE_SECRET_NAME}"
289+
exit 1
290+
fi
291+
292+
trace_baseline=$(wait_for_table_queryable "$CLICKHOUSE_TRACE_TABLE" "$TIMEOUT")
293+
log_baseline=$(wait_for_table_queryable "$CLICKHOUSE_LOG_TABLE" "$TIMEOUT")
294+
echo "Baseline count ${CLICKHOUSE_DATABASE}.${CLICKHOUSE_TRACE_TABLE}: ${trace_baseline}"
295+
echo "Baseline count ${CLICKHOUSE_DATABASE}.${CLICKHOUSE_LOG_TABLE}: ${log_baseline}"
296+
297+
if ! command -v docker > /dev/null 2>&1; then
298+
echo "ERROR: docker is required to run telemetrygen for OTEL ingestion checks"
299+
exit 1
300+
fi
301+
302+
run_id=$(date +%s)
303+
send_telemetrygen_signal "traces" "--traces" "$OTEL_SIGNAL_COUNT" "$run_id"
304+
send_telemetrygen_signal "logs" "--logs" "$OTEL_SIGNAL_COUNT" "$run_id"
305+
306+
echo "Waiting for traces/logs to land in ClickHouse..."
307+
308+
wait_for_table_count_increase "$CLICKHOUSE_TRACE_TABLE" "$trace_baseline" "$TIMEOUT"
309+
wait_for_table_count_increase "$CLICKHOUSE_LOG_TABLE" "$log_baseline" "$TIMEOUT"
310+
311+
stop_port_forward "$otlp_http_pf_pid"
312+
stop_port_forward "$clickhouse_pf_pid"
313+
314+
# Verify app works end-to-end with default connection (register + search)
315+
echo "Running Playwright e2e test..."
316+
ui_pf_pid=$(start_port_forward "service/$RELEASE_NAME-$CHART_NAME-app" "3000" "3000" "hyperdx-ui-e2e")
317+
sleep 2
318+
wait_for_service "http://localhost:3000" "HyperDX UI (e2e)"
319+
320+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
321+
(
322+
cd "$SCRIPT_DIR/e2e"
323+
npm install
324+
npx playwright install --with-deps chromium
325+
npx playwright test
326+
)
327+
328+
stop_port_forward "$ui_pf_pid"
329+
108330
echo ""
109331
echo "All smoke tests passed"
110332
echo "- All pods running"
111333
echo "- HyperDX UI responding"
112334
echo "- OTEL Collector metrics accessible"
113335
echo "- OTEL Collector Deployment available"
114336
echo "- ClickHouseCluster reconciled (Ready)"
115-
echo "- MongoDBCommunity reconciled (Running)"
337+
echo "- MongoDBCommunity reconciled (Running)"
338+
echo "- OTEL traces and logs persisted to ClickHouse"
339+
echo "- App registers user and displays logs via default connection"

0 commit comments

Comments
 (0)