63 changes: 61 additions & 2 deletions src/charm.py
@@ -84,6 +84,7 @@
)
from ops.pebble import (
ChangeError,
CheckStatus,
ExecError,
Layer,
PathError,
@@ -1532,19 +1533,41 @@ def _on_update_status(self, _) -> None:
logger.debug("on_update_status early exit: Service has not been added nor started yet")
return

# Check health status in addition to service status
service_unhealthy = False
try:
checks = container.pebble.get_checks(names=[self.postgresql_service])
if checks and checks[0].status != CheckStatus.UP:
service_unhealthy = True
logger.warning(
f"PostgreSQL health check failing: {checks[0].status} "
f"(failures: {checks[0].failures}/{checks[0].threshold})"
)
except Exception as e:
logger.debug(f"Could not get health check status: {e}")

if (
not self.is_cluster_restoring_backup
and not self.is_cluster_restoring_to_time
and not self.is_unit_stopped
and services[0].current != ServiceStatus.ACTIVE
and (services[0].current != ServiceStatus.ACTIVE or service_unhealthy)
):
logger.warning(
f"{self.postgresql_service} pebble service inactive, restarting service"
f"{self.postgresql_service} pebble service inactive or unhealthy, restarting service"
)

# Collect diagnostic logs before restart attempt
logger.error(
"PostgreSQL service is inactive or unhealthy. Collecting diagnostic information..."
)
self._get_postgresql_startup_logs(container)

try:
container.restart(self.postgresql_service)
except ChangeError:
logger.exception("Failed to restart patroni")
# Collect logs after failed restart for additional diagnosis
self._get_postgresql_startup_logs(container)
# If service doesn't recover fast, exit and wait for next hook run to re-check
if not self._patroni.member_started:
self.unit.status = MaintenanceStatus("Database service inactive, restarting")
@@ -2728,6 +2751,42 @@ def log_pitr_last_transaction_time(self) -> None:
else:
logger.error("Can't tell last completed transaction time")

def _get_postgresql_startup_logs(self, container: Container) -> None:
"""Collect and log PostgreSQL/Patroni diagnostic information.

Args:
container: The workload container to collect logs from.
"""
try:
# Get recent Pebble logs for the postgresql service
log_exec = container.pebble.exec(
["pebble", "logs", "postgresql", "-n", "100"], combine_stderr=True
)
logs = log_exec.wait_output()[0]
logger.error(f"Recent PostgreSQL/Patroni logs:\n{logs}")
except ExecError as e:
logger.error(f"Failed to get pebble logs: {e}")
# Fall back to reading the log file directly for older Juju versions
try:
log_exec = container.pebble.exec([
"tail",
"-n",
"100",
"/var/log/postgresql/patroni.log",
])
logs = log_exec.wait_output()[0]
logger.error(f"Recent Patroni logs:\n{logs}")
except ExecError as e2:
logger.error(f"Failed to get patroni.log: {e2}")

# Also get PostgreSQL logs if available
postgresql_logs = self._patroni.last_postgresql_logs()
if postgresql_logs:
# Only log the last portion to avoid overwhelming the logs
log_lines = postgresql_logs.split("\n")
recent_lines = "\n".join(log_lines[-50:]) if len(log_lines) > 50 else postgresql_logs
logger.error(f"Recent PostgreSQL logs:\n{recent_lines}")

def get_plugins(self) -> list[str]:
"""Return a list of installed plugins."""
plugins = [
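
Aside: a minimal sketch of the restart decision this hunk adds, assuming the ops Pebble client API (Container.get_services, Client.get_checks, CheckStatus) and a hypothetical service-name constant; it illustrates the technique, not the charm's exact logic:

    from ops.model import Container
    from ops.pebble import CheckStatus, ServiceStatus

    SERVICE_NAME = "postgresql"  # hypothetical; the charm uses self.postgresql_service

    def needs_restart(container: Container) -> bool:
        """Return True when the Pebble service is inactive or its health check is failing."""
        service = container.get_services(SERVICE_NAME).get(SERVICE_NAME)
        if service is None:
            return False  # service not added yet, mirroring the early-exit path
        if service.current != ServiceStatus.ACTIVE:
            return True  # Pebble reports the service as not running
        try:
            checks = container.pebble.get_checks(names=[SERVICE_NAME])
        except Exception:
            return False  # cannot query checks; treat as healthy, as the diff does
        return bool(checks) and checks[0].status != CheckStatus.UP

The charm additionally gates the restart on the backup-restore and stopped-unit flags, which the sketch leaves out.
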
121 changes: 119 additions & 2 deletions tests/unit/test_charm.py
@@ -19,7 +19,7 @@
RelationDataTypeError,
WaitingStatus,
)
from ops.pebble import ChangeError, ServiceStatus
from ops.pebble import ChangeError, CheckStatus, ServiceStatus
from ops.testing import Harness
from requests import ConnectionError as RequestsConnectionError
from tenacity import RetryError, wait_fixed
@@ -492,7 +492,13 @@ def test_on_update_status(harness):
return_value=False,
),
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
patch("charm.PostgresqlOperatorCharm._get_postgresql_startup_logs") as _get_startup_logs,
):
# Set up a default healthy health check
_pebble.get_checks.return_value = [
MagicMock(status=CheckStatus.UP, failures=0, threshold=3)
]

# Early exit on can connect.
harness.set_can_connect(POSTGRESQL_CONTAINER, False)
harness.charm.on.update_status.emit()
@@ -520,11 +526,13 @@
_enable_disable_extensions.assert_called_once()
_pebble.get_services.assert_called_once()

# Test when a failure need to be handled.
# Test when a failure needs to be handled (service inactive).
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)]
harness.charm.on.update_status.emit()
_restart.assert_called_once_with("postgresql")
_get_startup_logs.assert_called()
_restart.reset_mock()
_get_startup_logs.reset_mock()
_get_primary.assert_called_once_with(unit_name_pattern=True)
_get_primary.reset_mock()

@@ -534,8 +542,11 @@
harness.charm.on.update_status.emit()
_restart.assert_called_once_with("postgresql")
_logger.exception.assert_called_once_with("Failed to restart patroni")
# Verify startup logs are collected after failed restart
assert _get_startup_logs.call_count == 2 # Before and after restart attempt
_restart.reset_mock()
_logger.exception.reset_mock()
_get_startup_logs.reset_mock()
_restart.side_effect = None

# Check primary message not being set (current unit is not the primary).
Expand All @@ -552,6 +563,78 @@ def test_on_update_status(harness):
assert harness.model.unit.status == ActiveStatus("Primary")


def test_on_update_status_health_check_failing(harness):
"""Test that service is restarted when health check is failing."""
with (
patch("charm.logger") as _logger,
patch("charm.PostgresqlOperatorCharm.enable_disable_extensions"),
patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
patch("charm.Patroni.get_primary") as _get_primary,
patch("ops.model.Container.pebble") as _pebble,
patch("ops.model.Container.restart") as _restart,
patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
patch(
"charm.PostgresqlOperatorCharm.is_standby_leader",
new_callable=PropertyMock,
return_value=False,
),
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
patch("charm.PostgresqlOperatorCharm._get_postgresql_startup_logs") as _get_startup_logs,
):
harness.set_can_connect(POSTGRESQL_CONTAINER, True)

# Service is ACTIVE but health check is DOWN - should trigger restart
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_pebble.get_checks.return_value = [
MagicMock(status=CheckStatus.DOWN, failures=3, threshold=3)
]
_member_started.return_value = True

harness.charm.on.update_status.emit()

# Verify health check warning is logged
_logger.warning.assert_any_call(
"PostgreSQL health check failing: CheckStatus.DOWN (failures: 3/3)"
)
# Verify restart was triggered
_restart.assert_called_once_with("postgresql")
# Verify diagnostic logs were collected
_get_startup_logs.assert_called()


def test_on_update_status_health_check_error_handling(harness):
"""Test that health check errors are handled gracefully."""
with (
patch("charm.logger") as _logger,
patch("charm.PostgresqlOperatorCharm.enable_disable_extensions"),
patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
patch("charm.Patroni.get_primary") as _get_primary,
patch("ops.model.Container.pebble") as _pebble,
patch("ops.model.Container.restart") as _restart,
patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
patch(
"charm.PostgresqlOperatorCharm.is_standby_leader",
new_callable=PropertyMock,
return_value=False,
),
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
):
harness.set_can_connect(POSTGRESQL_CONTAINER, True)

# Service is ACTIVE and the health check query raises an exception - should not restart
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_pebble.get_checks.side_effect = Exception("Pebble error")
_member_started.return_value = True
_get_primary.return_value = harness.charm.unit.name

harness.charm.on.update_status.emit()

# Verify error was logged at debug level
_logger.debug.assert_any_call("Could not get health check status: Pebble error")
# Verify restart was NOT triggered (service is active; only the health check query failed)
_restart.assert_not_called()


def test_on_update_status_no_connection(harness):
with (
patch("charm.Patroni.get_primary") as _get_primary,
@@ -573,6 +656,10 @@ def test_on_update_status_with_error_on_get_primary(harness):
):
# Mock the access to the list of Pebble services.
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
# Mock healthy health check to avoid triggering restart path
_pebble.get_checks.return_value = [
MagicMock(status=CheckStatus.UP, failures=0, threshold=3)
]

_get_primary.side_effect = [RetryError("fake error")]

@@ -1941,6 +2028,36 @@ def test_on_secret_remove(harness, only_with_juju_secrets):
assert not event.remove_revision.called


def test_get_postgresql_startup_logs(harness):
"""Test that _get_postgresql_startup_logs collects diagnostic information."""
with (
patch("charm.logger") as _logger,
patch("ops.model.Container.pebble") as _pebble,
patch("charm.Patroni.last_postgresql_logs") as _last_postgresql_logs,
):
container = harness.charm.unit.get_container("postgresql")

# Setup mock for pebble logs exec
mock_exec = MagicMock()
mock_exec.wait_output.return_value = ("Sample pebble logs output", "")
_pebble.exec.return_value = mock_exec

# Setup mock for PostgreSQL logs
_last_postgresql_logs.return_value = "Sample PostgreSQL logs"

harness.charm._get_postgresql_startup_logs(container)

# Verify pebble logs were collected
_pebble.exec.assert_called_once_with(
["pebble", "logs", "postgresql", "-n", "100"], combine_stderr=True
)
_logger.error.assert_any_call("Recent PostgreSQL/Patroni logs:\nSample pebble logs output")

# Verify PostgreSQL logs were collected
_last_postgresql_logs.assert_called_once()
_logger.error.assert_any_call("Recent PostgreSQL logs:\nSample PostgreSQL logs")


def test_calculate_worker_process_config_auto_values(harness):
"""Test worker process config calculation with 'auto' values."""
with patch.object(