Skip to content

Commit cbeac26

Browse files
committed
feat: [#238] add Prometheus smoke test validation after run command
- Added PrometheusValidator for SSH-based smoke testing via curl to localhost:9090
- Added ServiceValidation struct for conditional validation (matches release validation pattern)
- Added PrometheusValidationFailed error with comprehensive troubleshooting help
- Updated run_run_validation to conditionally validate Prometheus when enabled
- Renamed validate_running_services to validate_external_services for clarity
  * External services: tracker API, HTTP tracker (exposed, no SSH)
  * Internal services: Prometheus (port 9090, firewall-blocked, SSH required)
- Updated E2E tests to validate Prometheus smoke test functionality
- All E2E tests passing (deployment workflow validated Prometheus successfully)
1 parent b79b436 commit cbeac26

File tree

4 files changed

+254
-9
lines changed

4 files changed

+254
-9
lines changed

src/bin/e2e_deployment_workflow_tests.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ use torrust_tracker_deployer_lib::testing::e2e::tasks::run_configuration_validat
8181
use torrust_tracker_deployer_lib::testing::e2e::tasks::run_release_validation::{
8282
run_release_validation, ServiceValidation,
8383
};
84-
use torrust_tracker_deployer_lib::testing::e2e::tasks::run_run_validation::run_run_validation;
84+
use torrust_tracker_deployer_lib::testing::e2e::tasks::run_run_validation::{
85+
run_run_validation, ServiceValidation as RunServiceValidation,
86+
};
8587

8688
/// Environment name for this E2E test
8789
const ENVIRONMENT_NAME: &str = "e2e-deployment";
@@ -297,11 +299,14 @@ async fn run_deployer_workflow(
297299
test_runner.run_services()?;
298300

299301
// Validate services are running using actual mapped ports from runtime environment
302+
// Note: E2E deployment environment has Prometheus enabled, so we validate it
303+
let run_services = RunServiceValidation { prometheus: true };
300304
run_run_validation(
301305
socket_addr,
302306
ssh_credentials,
303307
runtime_env.container_ports.http_api_port,
304308
vec![runtime_env.container_ports.http_tracker_port],
309+
Some(run_services),
305310
)
306311
.await
307312
.map_err(|e| anyhow::anyhow!("{e}"))?;
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
pub mod cloud_init;
22
pub mod docker;
33
pub mod docker_compose;
4+
pub mod prometheus;
45

56
pub use cloud_init::CloudInitValidator;
67
pub use docker::DockerValidator;
78
pub use docker_compose::DockerComposeValidator;
9+
pub use prometheus::PrometheusValidator;
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
//! Prometheus smoke test validator for remote instances
2+
//!
3+
//! This module provides the `PrometheusValidator` which performs a smoke test
4+
//! on a running Prometheus instance to verify it's operational and accessible.
5+
//!
6+
//! ## Key Features
7+
//!
8+
//! - Validates Prometheus web UI is accessible via HTTP
9+
//! - Checks Prometheus returns a successful HTTP response
10+
//! - Performs validation from inside the VM (not exposed externally)
11+
//!
12+
//! ## Validation Approach
13+
//!
14+
//! Since Prometheus is not exposed outside the VM (protected by firewall),
15+
//! validation must be performed from inside the VM via SSH:
16+
//!
17+
//! 1. Connect to VM via SSH
18+
//! 2. Execute `curl` command to fetch Prometheus homepage
19+
//! 3. Verify the request succeeds (`curl -f` fails on HTTP 4xx/5xx responses)
20+
//!
21+
//! This smoke test confirms Prometheus is:
22+
//! - Running and bound to the expected port
23+
//! - Responding to HTTP requests
24+
//! - Web UI is functional
25+
//!
26+
//! ## Future Enhancements
27+
//!
28+
//! For more comprehensive validation, consider:
29+
//!
30+
//! 1. **Configuration Validation**:
31+
//! - Parse Prometheus config file to verify scrape targets
32+
//! - Check that tracker endpoints are configured correctly
33+
//! - Validate scrape interval matches environment config
34+
//!
35+
//! 2. **Data Collection Validation**:
36+
//! - Query Prometheus API for active targets
37+
//! - Verify tracker metrics are being collected
38+
//! - Check that scrape jobs are succeeding (not in "down" state)
39+
//! - Example: `curl http://localhost:9090/api/v1/targets | jq`
40+
//!
41+
//! 3. **Metric Availability**:
42+
//! - Query specific tracker metrics (e.g., `torrust_tracker_info`)
43+
//! - Verify metrics have recent timestamps
44+
//! - Example: `curl http://localhost:9090/api/v1/query?query=up`
45+
//!
46+
//! These enhancements require:
47+
//! - JSON parsing of Prometheus API responses
48+
//! - Async coordination (waiting for first scrape to complete)
49+
//! - More complex error handling
50+
//!
51+
//! The current smoke test provides a good baseline validation that can be
52+
//! extended as needed.
53+
54+
use std::net::IpAddr;
55+
use tracing::{info, instrument};
56+
57+
use crate::adapters::ssh::SshClient;
58+
use crate::adapters::ssh::SshConfig;
59+
use crate::infrastructure::remote_actions::{RemoteAction, RemoteActionError};
60+
61+
/// Default Prometheus port (not exposed outside VM)
62+
const DEFAULT_PROMETHEUS_PORT: u16 = 9090;
63+
64+
/// Action that validates Prometheus is running and accessible
65+
pub struct PrometheusValidator {
66+
ssh_client: SshClient,
67+
prometheus_port: u16,
68+
}
69+
70+
impl PrometheusValidator {
71+
/// Create a new `PrometheusValidator` with the specified SSH configuration
72+
///
73+
/// # Arguments
74+
/// * `ssh_config` - SSH connection configuration containing credentials and host IP
75+
/// * `prometheus_port` - Port where Prometheus is running (defaults to 9090 if None)
76+
#[must_use]
77+
pub fn new(ssh_config: SshConfig, prometheus_port: Option<u16>) -> Self {
78+
let ssh_client = SshClient::new(ssh_config);
79+
Self {
80+
ssh_client,
81+
prometheus_port: prometheus_port.unwrap_or(DEFAULT_PROMETHEUS_PORT),
82+
}
83+
}
84+
}
85+
86+
impl RemoteAction for PrometheusValidator {
87+
fn name(&self) -> &'static str {
88+
"prometheus-smoke-test"
89+
}
90+
91+
#[instrument(
92+
name = "prometheus_smoke_test",
93+
skip(self),
94+
fields(
95+
action_type = "validation",
96+
component = "prometheus",
97+
server_ip = %server_ip,
98+
prometheus_port = self.prometheus_port
99+
)
100+
)]
101+
async fn execute(&self, server_ip: &IpAddr) -> Result<(), RemoteActionError> {
102+
info!(
103+
action = "prometheus_smoke_test",
104+
prometheus_port = self.prometheus_port,
105+
"Running Prometheus smoke test"
106+
);
107+
108+
// Perform smoke test: curl Prometheus homepage and check for success
109+
// Using -f flag to make curl fail on HTTP errors (4xx, 5xx)
110+
// Using -s flag for silent mode (no progress bar)
111+
// Using -o /dev/null to discard response body (we only care about status code)
112+
let command = format!(
113+
"curl -f -s -o /dev/null http://localhost:{} && echo 'success'",
114+
self.prometheus_port
115+
);
116+
117+
let output = self.ssh_client.execute(&command).map_err(|source| {
118+
RemoteActionError::SshCommandFailed {
119+
action_name: self.name().to_string(),
120+
source,
121+
}
122+
})?;
123+
124+
if !output.trim().contains("success") {
125+
return Err(RemoteActionError::ValidationFailed {
126+
action_name: self.name().to_string(),
127+
message: format!(
128+
"Prometheus smoke test failed. Prometheus may not be running or accessible on port {}. \
129+
Check that 'docker compose ps' shows Prometheus container as running.",
130+
self.prometheus_port
131+
),
132+
});
133+
}
134+
135+
info!(
136+
action = "prometheus_smoke_test",
137+
status = "success",
138+
"Prometheus is running and responding to HTTP requests"
139+
);
140+
141+
Ok(())
142+
}
143+
}

src/testing/e2e/tasks/run_run_validation.rs

Lines changed: 103 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,20 @@ use tracing::info;
5959
use crate::adapters::ssh::SshConfig;
6060
use crate::adapters::ssh::SshCredentials;
6161
use crate::infrastructure::external_validators::RunningServicesValidator;
62+
use crate::infrastructure::remote_actions::validators::PrometheusValidator;
6263
use crate::infrastructure::remote_actions::{RemoteAction, RemoteActionError};
6364

65+
/// Selects which optional service validations run during run validation.
///
/// Each flag enables the validation of one optional service, so callers can
/// match the checks to the services enabled in the environment configuration.
/// The `Default` value has every flag set to `false` (no optional validation).
///
/// The type is `Copy` and comparable (`PartialEq`/`Eq`), so configurations can
/// be passed by value and checked in assertions.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ServiceValidation {
    /// Whether to validate Prometheus is running and accessible
    pub prometheus: bool,
}
75+
6476
/// Errors that can occur during run validation
6577
#[derive(Debug, Error)]
6678
pub enum RunValidationError {
@@ -73,6 +85,16 @@ Tip: Ensure Docker Compose services are started and healthy"
7385
#[source]
7486
source: RemoteActionError,
7587
},
88+
89+
/// Prometheus smoke test failed
90+
#[error(
91+
"Prometheus smoke test failed: {source}
92+
Tip: Ensure Prometheus container is running and accessible on port 9090"
93+
)]
94+
PrometheusValidationFailed {
95+
#[source]
96+
source: RemoteActionError,
97+
},
7698
}
7799

78100
impl RunValidationError {
@@ -118,6 +140,35 @@ impl RunValidationError {
118140
- Re-run the 'run' command: cargo run -- run <environment>
119141
- Or manually: cd /opt/torrust && docker compose up -d
120142
143+
For more information, see docs/e2e-testing/."
144+
}
145+
Self::PrometheusValidationFailed { .. } => {
146+
"Prometheus Smoke Test Failed - Detailed Troubleshooting:
147+
148+
1. Check Prometheus container status:
149+
- SSH to instance: ssh user@instance-ip
150+
- Check container: cd /opt/torrust && docker compose ps
151+
- View Prometheus logs: docker compose logs prometheus
152+
153+
2. Verify Prometheus is accessible:
154+
- Test from inside VM: curl http://localhost:9090
155+
- Check if port 9090 is listening: ss -tlnp | grep 9090
156+
157+
3. Common issues:
158+
- Prometheus container failed to start (check logs)
159+
- Port 9090 already in use by another process
160+
- Prometheus configuration file has errors
161+
- Insufficient memory for Prometheus
162+
163+
4. Debug steps:
164+
- Check Prometheus config: docker compose exec prometheus cat /etc/prometheus/prometheus.yml
165+
- Restart Prometheus: docker compose restart prometheus
166+
- Check scrape targets: curl http://localhost:9090/api/v1/targets | jq
167+
168+
5. Re-deploy if needed:
169+
- Re-run 'run' command: cargo run -- run <environment>
170+
- Or manually: cd /opt/torrust && docker compose up -d prometheus
171+
121172
For more information, see docs/e2e-testing/."
122173
}
123174
}
@@ -135,6 +186,7 @@ For more information, see docs/e2e-testing/."
135186
/// * `ssh_credentials` - SSH credentials for connecting to the instance
136187
/// * `tracker_api_port` - Port for the tracker API health endpoint
137188
/// * `http_tracker_ports` - Ports for HTTP tracker health endpoints (can be empty)
189+
/// * `services` - Optional service validation configuration (defaults to no optional services)
138190
///
139191
/// # Returns
140192
///
@@ -146,24 +198,29 @@ For more information, see docs/e2e-testing/."
146198
/// - SSH connection cannot be established
147199
/// - Services are not running
148200
/// - Services are unhealthy
201+
/// - Optional service validation fails (when enabled)
149202
pub async fn run_run_validation(
150203
socket_addr: SocketAddr,
151204
ssh_credentials: &SshCredentials,
152205
tracker_api_port: u16,
153206
http_tracker_ports: Vec<u16>,
207+
services: Option<ServiceValidation>,
154208
) -> Result<(), RunValidationError> {
209+
let services = services.unwrap_or_default();
210+
155211
info!(
156212
socket_addr = %socket_addr,
157213
ssh_user = %ssh_credentials.ssh_username,
158214
tracker_api_port = tracker_api_port,
159215
http_tracker_ports = ?http_tracker_ports,
216+
validate_prometheus = services.prometheus,
160217
"Running 'run' command validation tests"
161218
);
162219

163220
let ip_addr = socket_addr.ip();
164221

165-
// Validate running services
166-
validate_running_services(
222+
// Validate externally accessible services (tracker API, HTTP tracker)
223+
validate_external_services(
167224
ip_addr,
168225
ssh_credentials,
169226
socket_addr.port(),
@@ -172,6 +229,11 @@ pub async fn run_run_validation(
172229
)
173230
.await?;
174231

232+
// Optionally validate Prometheus is running and accessible
233+
if services.prometheus {
234+
validate_prometheus(ip_addr, ssh_credentials, socket_addr.port()).await?;
235+
}
236+
175237
info!(
176238
socket_addr = %socket_addr,
177239
status = "success",
@@ -181,19 +243,25 @@ pub async fn run_run_validation(
181243
Ok(())
182244
}
183245

184-
/// Validate running services on a configured instance
246+
/// Validate externally accessible services on a configured instance
247+
///
248+
/// This function validates services that are exposed outside the VM and accessible
249+
/// without SSH (e.g., tracker API, HTTP tracker). These services have firewall rules
250+
/// allowing external access. It checks the status of services started by the `run`
251+
/// command and verifies they are operational by connecting from outside the VM.
252+
///
253+
/// # Note
185254
///
186-
/// This function validates that Docker Compose services are running and healthy
187-
/// on the target instance. It checks the status of services started by the `run`
188-
/// command and verifies they are operational.
189-
async fn validate_running_services(
255+
/// Internal services like Prometheus (not exposed externally) are validated separately
256+
/// via SSH in `validate_prometheus()`.
257+
async fn validate_external_services(
190258
ip_addr: IpAddr,
191259
ssh_credentials: &SshCredentials,
192260
port: u16,
193261
tracker_api_port: u16,
194262
http_tracker_ports: Vec<u16>,
195263
) -> Result<(), RunValidationError> {
196-
info!("Validating running services");
264+
info!("Validating externally accessible services (tracker API, HTTP tracker)");
197265

198266
let ssh_config = SshConfig::new(ssh_credentials.clone(), SocketAddr::new(ip_addr, port));
199267

@@ -206,3 +274,30 @@ async fn validate_running_services(
206274

207275
Ok(())
208276
}
277+
278+
/// Validate Prometheus is running and accessible via smoke test
279+
///
280+
/// This function performs a smoke test on Prometheus by connecting via SSH
281+
/// and executing a curl command to verify the web UI is accessible.
282+
///
283+
/// # Note
284+
///
285+
/// Prometheus runs on port 9090 inside the VM but is NOT exposed externally
286+
/// (blocked by firewall). Validation must be performed from inside the VM.
287+
async fn validate_prometheus(
288+
ip_addr: IpAddr,
289+
ssh_credentials: &SshCredentials,
290+
port: u16,
291+
) -> Result<(), RunValidationError> {
292+
info!("Validating Prometheus is running and accessible");
293+
294+
let ssh_config = SshConfig::new(ssh_credentials.clone(), SocketAddr::new(ip_addr, port));
295+
296+
let prometheus_validator = PrometheusValidator::new(ssh_config, None);
297+
prometheus_validator
298+
.execute(&ip_addr)
299+
.await
300+
.map_err(|source| RunValidationError::PrometheusValidationFailed { source })?;
301+
302+
Ok(())
303+
}

0 commit comments

Comments
 (0)