diff --git a/src/charm.py b/src/charm.py
index d615f60d76..d7a20256ff 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -84,6 +84,7 @@
 )
 from ops.pebble import (
     ChangeError,
+    CheckStatus,
     ExecError,
     Layer,
     PathError,
@@ -1532,19 +1533,41 @@ def _on_update_status(self, _) -> None:
             logger.debug("on_update_status early exit: Service has not been added nor started yet")
             return
 
+        # Check health status in addition to service status
+        service_unhealthy = False
+        try:
+            checks = container.pebble.get_checks(names=[self.postgresql_service])
+            if checks and checks[0].status != CheckStatus.UP:
+                service_unhealthy = True
+                logger.warning(
+                    f"PostgreSQL health check failing: {checks[0].status} "
+                    f"(failures: {checks[0].failures}/{checks[0].threshold})"
+                )
+        except Exception as e:
+            logger.debug(f"Could not get health check status: {e}")
+
         if (
             not self.is_cluster_restoring_backup
             and not self.is_cluster_restoring_to_time
             and not self.is_unit_stopped
-            and services[0].current != ServiceStatus.ACTIVE
+            and (services[0].current != ServiceStatus.ACTIVE or service_unhealthy)
         ):
             logger.warning(
-                f"{self.postgresql_service} pebble service inactive, restarting service"
+                f"{self.postgresql_service} pebble service inactive or unhealthy, restarting service"
+            )
+
+            # Collect diagnostic logs before restart attempt
+            logger.error(
+                "PostgreSQL service is inactive or unhealthy. Collecting diagnostic information..."
             )
+            self._get_postgresql_startup_logs(container)
+
             try:
                 container.restart(self.postgresql_service)
             except ChangeError:
                 logger.exception("Failed to restart patroni")
+                # Collect logs after failed restart for additional diagnosis
+                self._get_postgresql_startup_logs(container)
             # If service doesn't recover fast, exit and wait for next hook run to re-check
             if not self._patroni.member_started:
                 self.unit.status = MaintenanceStatus("Database service inactive, restarting")
@@ -2728,6 +2751,42 @@ def log_pitr_last_transaction_time(self) -> None:
         else:
             logger.error("Can't tell last completed transaction time")
 
+    def _get_postgresql_startup_logs(self, container: Container) -> None:
+        """Collect and log PostgreSQL/Patroni diagnostic information.
+
+        Args:
+            container: The workload container to collect logs from.
+ """ + try: + # Get recent Pebble logs for the postgresql service + log_exec = container.pebble.exec( + ["pebble", "logs", "postgresql", "-n", "100"], combine_stderr=True + ) + logs = log_exec.wait_output()[0] + logger.error(f"Recent PostgreSQL/Patroni logs:\n{logs}") + except ExecError as e: + logger.error(f"Failed to get pebble logs: {e}") + # Fallback to direct log file for older Juju versions + try: + log_exec = container.pebble.exec([ + "tail", + "-n", + "100", + "/var/log/postgresql/patroni.log", + ]) + logs = log_exec.wait_output()[0] + logger.error(f"Recent Patroni logs:\n{logs}") + except ExecError as e2: + logger.error(f"Failed to get patroni.log: {e2}") + + # Also get PostgreSQL logs if available + postgresql_logs = self._patroni.last_postgresql_logs() + if postgresql_logs: + # Only log last portion to avoid overwhelming the logs + log_lines = postgresql_logs.split("\n") + recent_lines = "\n".join(log_lines[-50:]) if len(log_lines) > 50 else postgresql_logs + logger.error(f"Recent PostgreSQL logs:\n{recent_lines}") + def get_plugins(self) -> list[str]: """Return a list of installed plugins.""" plugins = [ diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 517a481538..2934775989 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -19,7 +19,7 @@ RelationDataTypeError, WaitingStatus, ) -from ops.pebble import ChangeError, ServiceStatus +from ops.pebble import ChangeError, CheckStatus, ServiceStatus from ops.testing import Harness from requests import ConnectionError as RequestsConnectionError from tenacity import RetryError, wait_fixed @@ -492,7 +492,13 @@ def test_on_update_status(harness): return_value=False, ), patch("charm.Patroni.get_running_cluster_members", return_value=["test"]), + patch("charm.PostgresqlOperatorCharm._get_postgresql_startup_logs") as _get_startup_logs, ): + # Setup default healthy health check + _pebble.get_checks.return_value = [ + MagicMock(status=CheckStatus.UP, failures=0, threshold=3) + ] + # Early exit on can connect. harness.set_can_connect(POSTGRESQL_CONTAINER, False) harness.charm.on.update_status.emit() @@ -520,11 +526,13 @@ def test_on_update_status(harness): _enable_disable_extensions.assert_called_once() _pebble.get_services.assert_called_once() - # Test when a failure need to be handled. + # Test when a failure need to be handled (service inactive). _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)] harness.charm.on.update_status.emit() _restart.assert_called_once_with("postgresql") + _get_startup_logs.assert_called() _restart.reset_mock() + _get_startup_logs.reset_mock() _get_primary.assert_called_once_with(unit_name_pattern=True) _get_primary.reset_mock() @@ -534,8 +542,11 @@ def test_on_update_status(harness): harness.charm.on.update_status.emit() _restart.assert_called_once_with("postgresql") _logger.exception.assert_called_once_with("Failed to restart patroni") + # Verify startup logs are collected after failed restart + assert _get_startup_logs.call_count == 2 # Before and after restart attempt _restart.reset_mock() _logger.exception.reset_mock() + _get_startup_logs.reset_mock() _restart.side_effect = None # Check primary message not being set (current unit is not the primary). 
@@ -552,6 +563,78 @@ def test_on_update_status(harness):
         assert harness.model.unit.status == ActiveStatus("Primary")
 
 
+def test_on_update_status_health_check_failing(harness):
+    """Test that service is restarted when health check is failing."""
+    with (
+        patch("charm.logger") as _logger,
+        patch("charm.PostgresqlOperatorCharm.enable_disable_extensions"),
+        patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
+        patch("charm.Patroni.get_primary") as _get_primary,
+        patch("ops.model.Container.pebble") as _pebble,
+        patch("ops.model.Container.restart") as _restart,
+        patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
+        patch(
+            "charm.PostgresqlOperatorCharm.is_standby_leader",
+            new_callable=PropertyMock,
+            return_value=False,
+        ),
+        patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
+        patch("charm.PostgresqlOperatorCharm._get_postgresql_startup_logs") as _get_startup_logs,
+    ):
+        harness.set_can_connect(POSTGRESQL_CONTAINER, True)
+
+        # Service is ACTIVE but health check is DOWN - should trigger restart
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
+        _pebble.get_checks.return_value = [
+            MagicMock(status=CheckStatus.DOWN, failures=3, threshold=3)
+        ]
+        _member_started.return_value = True
+
+        harness.charm.on.update_status.emit()
+
+        # Verify health check warning is logged
+        _logger.warning.assert_any_call(
+            "PostgreSQL health check failing: CheckStatus.DOWN (failures: 3/3)"
+        )
+        # Verify restart was triggered
+        _restart.assert_called_once_with("postgresql")
+        # Verify diagnostic logs were collected
+        _get_startup_logs.assert_called()
+
+
+def test_on_update_status_health_check_error_handling(harness):
+    """Test that health check errors are handled gracefully."""
+    with (
+        patch("charm.logger") as _logger,
+        patch("charm.PostgresqlOperatorCharm.enable_disable_extensions"),
+        patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
+        patch("charm.Patroni.get_primary") as _get_primary,
+        patch("ops.model.Container.pebble") as _pebble,
+        patch("ops.model.Container.restart") as _restart,
+        patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
+        patch(
+            "charm.PostgresqlOperatorCharm.is_standby_leader",
+            new_callable=PropertyMock,
+            return_value=False,
+        ),
+        patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
+    ):
+        harness.set_can_connect(POSTGRESQL_CONTAINER, True)
+
+        # Service is ACTIVE and health check throws an exception - should not restart
+        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
+        _pebble.get_checks.side_effect = Exception("Pebble error")
+        _member_started.return_value = True
+        _get_primary.return_value = harness.charm.unit.name
+
+        harness.charm.on.update_status.emit()
+
+        # Verify error was logged at debug level
+        _logger.debug.assert_any_call("Could not get health check status: Pebble error")
+        # Verify restart was NOT triggered (service is active, only health check failed to query)
+        _restart.assert_not_called()
+
+
 def test_on_update_status_no_connection(harness):
     with (
         patch("charm.Patroni.get_primary") as _get_primary,
@@ -573,6 +656,10 @@ def test_on_update_status_with_error_on_get_primary(harness):
     ):
         # Mock the access to the list of Pebble services.
         _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
+        # Mock healthy health check to avoid triggering restart path
+        _pebble.get_checks.return_value = [
+            MagicMock(status=CheckStatus.UP, failures=0, threshold=3)
+        ]
 
         _get_primary.side_effect = [RetryError("fake error")]
 
@@ -1941,6 +2028,36 @@ def test_on_secret_remove(harness, only_with_juju_secrets):
         assert not event.remove_revision.called
 
 
+def test_get_postgresql_startup_logs(harness):
+    """Test that _get_postgresql_startup_logs collects diagnostic information."""
+    with (
+        patch("charm.logger") as _logger,
+        patch("ops.model.Container.pebble") as _pebble,
+        patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs,
+    ):
+        container = harness.charm.unit.get_container("postgresql")
+
+        # Setup mock for pebble logs exec
+        mock_exec = MagicMock()
+        mock_exec.wait_output.return_value = ("Sample pebble logs output", "")
+        _pebble.exec.return_value = mock_exec
+
+        # Setup mock for PostgreSQL logs
+        _last_postgresql_logs.return_value = "Sample PostgreSQL logs"
+
+        harness.charm._get_postgresql_startup_logs(container)
+
+        # Verify pebble logs were collected
+        _pebble.exec.assert_called_once_with(
+            ["pebble", "logs", "postgresql", "-n", "100"], combine_stderr=True
+        )
+        _logger.error.assert_any_call("Recent PostgreSQL/Patroni logs:\nSample pebble logs output")
+
+        # Verify PostgreSQL logs were collected
+        _last_postgresql_logs.assert_called_once()
+        _logger.error.assert_any_call("Recent PostgreSQL logs:\nSample PostgreSQL logs")
+
+
 def test_calculate_worker_process_config_auto_values(harness):
     """Test worker process config calculation with 'auto' values."""
     with patch.object(