Skip to content

Commit 67bdcf8

Browse files
authored
Merge branch 'main' into pipeline/Argocd
2 parents 88a59b6 + 74cdf40 commit 67bdcf8

21 files changed

+1922
-112
lines changed

.github/scripts/import-existing-resources.sh

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
#!/bin/bash
2-
set -euo pipefail
2+
set -uo pipefail
33

44
# Import existing Kubernetes resources into Terraform state
55
# This prevents conflicts when deploying to an existing cluster
66

77
NAMESPACE="${NAMESPACE:-observability}"
88
REPORT_FILE="import-report.json"
9+
FAILURE_COUNT=0
10+
FAILED_OPERATIONS=()
911

1012
echo "🔍 Scanning for existing resources to import..."
1113

@@ -31,7 +33,8 @@ cleanup_conflicting_resources() {
3133

3234
# Get all resources of this type that match any of the keywords
3335
local pattern=$(echo "$keywords" | sed 's/,/|/g')
34-
local resources=$(kubectl get "$resource_type" -o name | grep -E "$pattern" || true)
36+
local resources
37+
resources=$(kubectl get "$resource_type" -o name 2>/dev/null | grep -E "$pattern" || true)
3538

3639
for res in $resources; do
3740
# Get owner using multiple methods
@@ -44,20 +47,28 @@ cleanup_conflicting_resources() {
4447

4548
if [ -n "$ns_owner" ] && [ "$ns_owner" != "$NAMESPACE" ]; then
4649
echo " ⚠️ CONFLICT: $res owned by '$ns_owner'. Deleting..."
47-
kubectl delete "$res" --ignore-not-found --timeout=30s
50+
if ! kubectl delete "$res" --ignore-not-found --timeout=30s 2>/dev/null; then
51+
echo " ❌ Failed to delete $res (continuing anyway)"
52+
FAILURE_COUNT=$((FAILURE_COUNT+1))
53+
FAILED_OPERATIONS+=("cleanup: kubectl delete $res")
54+
fi
4855
deleted_any=true
4956
elif [ -z "$ns_owner" ]; then
5057
# Special case: Resource exists but no Helm owner.
5158
# If it matches our exact release names, it's a "zombie" resource from a failed/partial manual install
5259
if [[ "$res" =~ monitoring-loki|monitoring-mimir|monitoring-tempo|monitoring-grafana|monitoring-prometheus ]]; then
5360
echo " ⚠️ ZOMBIE RESOURCE: $res has no owner but matches stack pattern. Deleting to ensure clean install..."
54-
kubectl delete "$res" --ignore-not-found --timeout=30s
61+
if ! kubectl delete "$res" --ignore-not-found --timeout=30s 2>/dev/null; then
62+
echo " ❌ Failed to delete zombie resource $res (continuing anyway)"
63+
FAILURE_COUNT=$((FAILURE_COUNT+1))
64+
FAILED_OPERATIONS+=("cleanup: kubectl delete zombie $res")
65+
fi
5566
deleted_any=true
5667
fi
5768
fi
5869
done
5970

60-
[ "$deleted_any" = true ] && sleep 5
71+
[ "$deleted_any" = true ] && sleep 5 || true
6172
}
6273

6374
# Deep Scan for all LGTM related cluster-scoped components
@@ -84,27 +95,34 @@ import_resource() {
8495
return 0
8596
else
8697
if grep -q "Resource already managed" /tmp/import.log; then
87-
echo " ℹ️ Already managed by Terraform"
98+
echo " Already managed by Terraform"
8899
jq --arg addr "$tf_address" --arg reason "already_managed" \
89-
'.skipped += [{"address": $addr, "reason": $reason}]' \
100+
'.imports += [{"address": $addr, "reason": $reason}]' \
90101
"$REPORT_FILE" > /tmp/report.tmp && mv /tmp/report.tmp "$REPORT_FILE"
102+
return 0 # Return success instead of failure
91103
else
92104
echo " ⚠️ Import failed (resource may not exist)"
93105
jq --arg addr "$tf_address" --arg error "$(cat /tmp/import.log | tail -5)" \
94106
'.errors += [{"address": $addr, "error": $error}]' \
95107
"$REPORT_FILE" > /tmp/report.tmp && mv /tmp/report.tmp "$REPORT_FILE"
108+
return 1
96109
fi
97-
return 1
98110
fi
99111
}
100112

101113
# Check if namespace exists
102114
if kubectl get namespace "$NAMESPACE" &>/dev/null; then
103115
echo "📂 Found existing namespace: $NAMESPACE"
104-
import_resource \
105-
"kubernetes_namespace.observability" \
106-
"$NAMESPACE" \
107-
"Namespace: $NAMESPACE"
116+
117+
# Check if already in Terraform state
118+
if terraform state list | grep -q "kubernetes_namespace.observability"; then
119+
echo " ℹ️ Namespace already managed by Terraform - skipping import"
120+
else
121+
import_resource \
122+
"kubernetes_namespace.observability" \
123+
"$NAMESPACE" \
124+
"Namespace: $NAMESPACE"
125+
fi
108126
else
109127
echo " ℹ️ Namespace $NAMESPACE does not exist (will be created)"
110128
fi
@@ -175,6 +193,30 @@ if [ "${CLOUD_PROVIDER:-}" == "gke" ]; then
175193
fi
176194
fi
177195

196+
# ── Grafana Imports ─────────────────────────────────────────────────
197+
# Global datasources and other core Grafana resources would be imported here.
198+
# NOTE: Tenant teams, datasources, and folders are managed dynamically
199+
# by the grafana-team-sync CronJob, so they are NOT imported into Terraform.
200+
# ─────────────────────────────────────────────────────────────────────
201+
202+
echo "📈 Scanning for existing Grafana resources to import into state..."
203+
204+
if [ -n "${GRAFANA_URL:-}" ] && [ -n "${GRAFANA_ADMIN_PASSWORD:-}" ]; then
205+
GRAFANA_AUTH="admin:${GRAFANA_ADMIN_PASSWORD}"
206+
207+
# Test Grafana connectivity first
208+
if ! curl -sf --user "$GRAFANA_AUTH" "${GRAFANA_URL}/api/health" >/dev/null 2>&1; then
209+
echo " ⚠️ Cannot reach Grafana API at ${GRAFANA_URL} - skipping Grafana imports"
210+
FAILURE_COUNT=$((FAILURE_COUNT+1))
211+
FAILED_OPERATIONS+=("grafana: API unreachable at ${GRAFANA_URL}")
212+
else
213+
echo " ℹ️ Grafana API is reachable at ${GRAFANA_URL}"
214+
# Future global Grafana resource imports can be added here
215+
fi
216+
else
217+
echo "⏭️ Skipping Grafana imports: GRAFANA_URL or GRAFANA_ADMIN_PASSWORD not set"
218+
fi
219+
178220
# Summary
179221
echo ""
180222
echo "📊 Import Summary:"
@@ -186,6 +228,19 @@ echo " ✅ Imported: $IMPORTED"
186228
echo " ⏭️ Skipped: $SKIPPED"
187229
echo " ❌ Errors: $ERRORS"
188230

231+
# Report any failures that occurred during execution
232+
if [ "$FAILURE_COUNT" -gt 0 ]; then
233+
echo ""
234+
echo "⚠️ Failure Summary: $FAILURE_COUNT operation(s) failed but script continued"
235+
echo "Failed operations:"
236+
for op in "${FAILED_OPERATIONS[@]}"; do
237+
echo " - $op"
238+
done
239+
echo ""
240+
echo "ℹ️ These failures were logged but did not prevent other imports from executing."
241+
echo "ℹ️ Terraform apply will proceed and create any resources that failed to import."
242+
fi
243+
189244
echo ""
190245
echo "📄 Full report saved to: $REPORT_FILE"
191246
cat "$REPORT_FILE" | jq '.'

.github/scripts/smoke-tests.sh

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ TIMESTAMP=$(date +%s)000000000
102102
TRACE_ID=$(uuidgen | tr -d '-')
103103

104104
LOKI_PUSH_RESPONSE=$(curl -s -X POST "$LOKI_ENDPOINT/loki/api/v1/push" \
105+
-H "X-Scope-OrgID: default" \
105106
-H "Content-Type: application/json" \
106107
-d '{
107108
"streams": [
@@ -140,6 +141,7 @@ START_TIME_LOKI=$(($(date +%s) - 300))
140141
for i in {1..6}; do
141142
sleep 5
142143
LOKI_QUERY_RESPONSE=$(curl -s -G "$LOKI_ENDPOINT/loki/api/v1/query_range" \
144+
-H "X-Scope-OrgID: default" \
143145
--data-urlencode 'query={job="smoke-test"}' \
144146
--data-urlencode "start=$START_TIME_LOKI" \
145147
--data-urlencode 'limit=10' || echo "FAILED")
@@ -185,6 +187,7 @@ EOF
185187
)
186188

187189
MIMIR_PUSH_RESPONSE=$(echo "$MIMIR_PUSH" | curl -s -X POST "$MIMIR_ENDPOINT/api/v1/push" \
190+
-H "X-Scope-OrgID: default" \
188191
-H "Content-Type: application/x-protobuf" \
189192
-H "X-Prometheus-Remote-Write-Version: 0.1.0" \
190193
--data-binary @- || echo "FAILED")
@@ -207,6 +210,7 @@ for i in {1..12}; do
207210
# Instead of a manual push which is hard with curl, we verify that Prometheus is successfully
208211
# remote-writing its own metrics to Mimir. We look for any metric starting with 'prometheus_'
209212
MIMIR_QUERY_RESPONSE=$(curl -s -G "$MIMIR_ENDPOINT/prometheus/api/v1/query" \
213+
-H "X-Scope-OrgID: default" \
210214
--data-urlencode 'query={__name__=~"prometheus_.*"}' || echo "FAILED")
211215

212216
if [[ "$MIMIR_QUERY_RESPONSE" != "FAILED" ]] && echo "$MIMIR_QUERY_RESPONSE" | jq -e '.data.result | length > 0' >/dev/null 2>&1; then
@@ -306,6 +310,7 @@ EOF
306310
)
307311

308312
TEMPO_PUSH_RESPONSE=$(echo "$TEMPO_TRACE" | curl -s -X POST "$TEMPO_INGEST_ENDPOINT/v1/traces" \
313+
-H "X-Scope-OrgID: default" \
309314
-H "Content-Type: application/json" \
310315
-d @- || echo "FAILED")
311316

@@ -324,7 +329,7 @@ TEMPO_QUERY_SUCCESS=false
324329
# Try for up to 60 seconds for Tempo as tracing can be slower to index
325330
for i in {1..12}; do
326331
sleep 5
327-
TEMPO_QUERY=$(curl -s "$TEMPO_QUERY_ENDPOINT/api/traces/${TRACE_ID}" || echo "FAILED")
332+
TEMPO_QUERY=$(curl -s -H "X-Scope-OrgID: default" "$TEMPO_QUERY_ENDPOINT/api/traces/${TRACE_ID}" || echo "FAILED")
328333

329334
if [[ "$TEMPO_QUERY" != "FAILED" ]] && echo "$TEMPO_QUERY" | jq -e '.batches | length > 0' >/dev/null 2>&1; then
330335
record_test "tempo" "query_trace" "PASS" "Successfully retrieved trace"
@@ -395,6 +400,87 @@ done
395400

396401

397402

403+
404+
#=============================================================================
405+
# MULTI-TENANCY ISOLATION TESTS
406+
# Validates that tenant data is fully isolated — webank cannot see default
407+
# data, and default cannot see webank data.
408+
# Acceptance Criteria: "Team A cannot view Team B's logs in Loki/Mimir/Tempo"
409+
#=============================================================================
410+
echo ""
411+
echo "🔒 Testing Multi-Tenancy Isolation..."
412+
413+
ISOLATION_OK=true
414+
415+
# --- Loki Isolation ---
416+
echo " 📝 [Loki] Pushing a secret log to 'webank' tenant..."
417+
ISOLATION_TIMESTAMP=$(date +%s)000000000
418+
WEBANK_SECRET="WEBANK-ONLY-SECRET-$(uuidgen)"
419+
420+
curl -s -X POST "$LOKI_ENDPOINT/loki/api/v1/push" \
421+
-H "X-Scope-OrgID: webank" \
422+
-H "Content-Type: application/json" \
423+
-d "{\"streams\":[{\"stream\":{\"job\":\"isolation-test\"},\"values\":[[\"$ISOLATION_TIMESTAMP\",\"$WEBANK_SECRET\"]]}]}" \
424+
> /dev/null
425+
426+
sleep 6
427+
428+
# Query as 'default' — must NOT see the webank secret
429+
ISOLATION_AS_DEFAULT=$(curl -s -G "$LOKI_ENDPOINT/loki/api/v1/query_range" \
430+
-H "X-Scope-OrgID: default" \
431+
--data-urlencode 'query={job="isolation-test"}' \
432+
--data-urlencode "start=$(($(date +%s) - 60))" \
433+
--data-urlencode 'limit=10' || echo "FAILED")
434+
435+
if echo "$ISOLATION_AS_DEFAULT" | grep -q "$WEBANK_SECRET"; then
436+
record_test "isolation" "loki_cross_tenant_leak" "FAIL" "CRITICAL: default tenant CAN see webank data — isolation is BROKEN"
437+
echo " ❌ CRITICAL: Loki isolation FAILED — default tenant sees webank data!"
438+
ISOLATION_OK=false
439+
else
440+
record_test "isolation" "loki_cross_tenant_leak" "PASS" "default tenant cannot see webank data"
441+
echo " ✅ Loki: default tenant cannot see webank data"
442+
fi
443+
444+
# Query as 'webank' — MUST see its own secret
445+
ISOLATION_AS_WEBANK=$(curl -s -G "$LOKI_ENDPOINT/loki/api/v1/query_range" \
446+
-H "X-Scope-OrgID: webank" \
447+
--data-urlencode 'query={job="isolation-test"}' \
448+
--data-urlencode "start=$(($(date +%s) - 60))" \
449+
--data-urlencode 'limit=10' || echo "FAILED")
450+
451+
if echo "$ISOLATION_AS_WEBANK" | grep -q "$WEBANK_SECRET"; then
452+
record_test "isolation" "loki_tenant_reads_own" "PASS" "webank tenant can read its own logs"
453+
echo " ✅ Loki: webank tenant can read its own logs"
454+
else
455+
record_test "isolation" "loki_tenant_reads_own" "FAIL" "webank tenant cannot read its own logs"
456+
echo " ❌ Loki: webank tenant cannot read its own logs"
457+
ISOLATION_OK=false
458+
fi
459+
460+
# --- Mimir Isolation ---
461+
echo " 📊 [Mimir] Checking metric namespace isolation..."
462+
463+
# Query a metric as 'webank' — it must not see 'prometheus_*' metrics
464+
# (those are shipped by Prometheus under the 'default' tenant)
465+
WEBANK_SEES_DEFAULT=$(curl -s -G "$MIMIR_ENDPOINT/prometheus/api/v1/query" \
466+
-H "X-Scope-OrgID: webank" \
467+
--data-urlencode 'query={__name__=~"prometheus_.*"}' || echo "FAILED")
468+
469+
if [[ "$WEBANK_SEES_DEFAULT" != "FAILED" ]] && echo "$WEBANK_SEES_DEFAULT" | jq -e '.data.result | length > 0' > /dev/null 2>&1; then
470+
record_test "isolation" "mimir_cross_tenant_leak" "FAIL" "CRITICAL: webank tenant sees prometheus_* metrics from 'default' tenant"
471+
echo " ❌ CRITICAL: Mimir isolation FAILED — webank sees default tenant metrics!"
472+
ISOLATION_OK=false
473+
else
474+
record_test "isolation" "mimir_cross_tenant_leak" "PASS" "webank tenant cannot see default tenant metrics"
475+
echo " ✅ Mimir: webank tenant cannot see default tenant metrics"
476+
fi
477+
478+
if [ "$ISOLATION_OK" = true ]; then
479+
echo " 🎉 All isolation tests PASSED — multi-tenancy is working correctly"
480+
else
481+
echo " ❌ Isolation tests FAILED — multi-tenancy is NOT properly enforced"
482+
fi
483+
398484
#=============================================================================
399485
# INTEGRATION TEST
400486
#=============================================================================

0 commit comments

Comments
 (0)