Skip to content

Commit e92a015

Browse files
authored
[MISC] Add degraded status to primary message (#874)
* Add degraded status to primary message
* Use const running tests
1 parent 3b38f9d commit e92a015

File tree

4 files changed

+62
-53
lines changed

4 files changed

+62
-53
lines changed

src/charm.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1032,10 +1032,16 @@ def _set_active_status(self):
10321032
self.app_peer_data["s3-initialization-block-message"]
10331033
)
10341034
return
1035-
if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
1036-
self.unit.status = ActiveStatus("Primary")
1037-
elif self.is_standby_leader:
1038-
self.unit.status = ActiveStatus("Standby")
1035+
if (
1036+
self._patroni.get_primary(unit_name_pattern=True) == self.unit.name
1037+
or self.is_standby_leader
1038+
):
1039+
danger_state = ""
1040+
if len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
1041+
danger_state = " (degraded)"
1042+
self.unit.status = ActiveStatus(
1043+
f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
1044+
)
10391045
elif self._patroni.member_started:
10401046
self.unit.status = ActiveStatus()
10411047
except (RetryError, RequestsConnectionError) as e:

src/patroni.py

Lines changed: 40 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,19 @@ def update_synchronous_node_count(self) -> None:
164164
if r.status_code != 200:
165165
raise UpdateSyncNodeCountError(f"received {r.status_code}")
166166

167+
def get_cluster(
168+
self, attempt: AttemptManager, alternative_endpoints: list[str] | None = None
169+
) -> dict[str, str | int]:
170+
"""Call the cluster endpoint."""
171+
url = self._get_alternative_patroni_url(attempt, alternative_endpoints)
172+
r = requests.get(
173+
f"{url}/cluster",
174+
verify=self._verify,
175+
auth=self._patroni_auth,
176+
timeout=PATRONI_TIMEOUT,
177+
)
178+
return r.json()
179+
167180
def get_primary(
168181
self, unit_name_pattern=False, alternative_endpoints: list[str] | None = None
169182
) -> str:
@@ -180,11 +193,7 @@ def get_primary(
180193
# Request info from cluster endpoint (which returns all members of the cluster).
181194
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
182195
with attempt:
183-
url = self._get_alternative_patroni_url(attempt, alternative_endpoints)
184-
r = requests.get(
185-
f"{url}/cluster", verify=self._verify, timeout=5, auth=self._patroni_auth
186-
)
187-
for member in r.json()["members"]:
196+
for member in self.get_cluster(attempt, alternative_endpoints)["members"]:
188197
if member["role"] == "leader":
189198
primary = member["name"]
190199
if unit_name_pattern:
@@ -209,14 +218,7 @@ def get_standby_leader(
209218
# Request info from cluster endpoint (which returns all members of the cluster).
210219
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
211220
with attempt:
212-
url = self._get_alternative_patroni_url(attempt)
213-
r = requests.get(
214-
f"{url}/cluster",
215-
verify=self._verify,
216-
auth=self._patroni_auth,
217-
timeout=PATRONI_TIMEOUT,
218-
)
219-
for member in r.json()["members"]:
221+
for member in self.get_cluster(attempt)["members"]:
220222
if member["role"] == "standby_leader":
221223
if check_whether_is_running and member["state"] not in RUNNING_STATES:
222224
logger.warning(f"standby leader {member['name']} is not running")
@@ -234,30 +236,33 @@ def get_sync_standby_names(self) -> list[str]:
234236
# Request info from cluster endpoint (which returns all members of the cluster).
235237
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
236238
with attempt:
237-
url = self._get_alternative_patroni_url(attempt)
238-
r = requests.get(
239-
f"{url}/cluster",
240-
verify=self._verify,
241-
auth=self._patroni_auth,
242-
timeout=PATRONI_TIMEOUT,
243-
)
244-
for member in r.json()["members"]:
239+
for member in self.get_cluster(attempt)["members"]:
245240
if member["role"] == "sync_standby":
246241
sync_standbys.append("/".join(member["name"].rsplit("-", 1)))
247242
return sync_standbys
248243

249244
@property
250-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
251245
def cluster_members(self) -> set:
252246
"""Get the current cluster members."""
253247
# Request info from cluster endpoint (which returns all members of the cluster).
254-
r = requests.get(
255-
f"{self._patroni_url}/cluster",
256-
verify=self._verify,
257-
auth=self._patroni_auth,
258-
timeout=PATRONI_TIMEOUT,
259-
)
260-
return {member["name"] for member in r.json()["members"]}
248+
for attempt in Retrying(
249+
stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)
250+
):
251+
with attempt:
252+
return {member["name"] for member in self.get_cluster(attempt)["members"]}
253+
254+
def get_running_cluster_members(self) -> list[str]:
255+
"""List running patroni members."""
256+
try:
257+
for attempt in Retrying(stop=stop_after_attempt(1)):
258+
with attempt:
259+
return [
260+
member["name"]
261+
for member in self.get_cluster(attempt)["members"]
262+
if member["state"] in RUNNING_STATES
263+
]
264+
except Exception:
265+
return []
261266

262267
def are_all_members_ready(self) -> bool:
263268
"""Check if all members are correctly running Patroni and PostgreSQL.
@@ -271,17 +276,13 @@ def are_all_members_ready(self) -> bool:
271276
try:
272277
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
273278
with attempt:
274-
r = requests.get(
275-
f"{self._patroni_url}/cluster",
276-
verify=self._verify,
277-
auth=self._patroni_auth,
278-
timeout=PATRONI_TIMEOUT,
279+
return all(
280+
member["state"] in RUNNING_STATES
281+
for member in self.get_cluster(attempt)["members"]
279282
)
280283
except RetryError:
281284
return False
282285

283-
return all(member["state"] in RUNNING_STATES for member in r.json()["members"])
284-
285286
@property
286287
def is_creating_backup(self) -> bool:
287288
"""Returns whether a backup is being created."""
@@ -291,20 +292,13 @@ def is_creating_backup(self) -> bool:
291292
try:
292293
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
293294
with attempt:
294-
r = requests.get(
295-
f"{self._patroni_url}/cluster",
296-
verify=self._verify,
297-
auth=self._patroni_auth,
298-
timeout=PATRONI_TIMEOUT,
295+
return any(
296+
"tags" in member and member["tags"].get("is_creating_backup")
297+
for member in self.get_cluster(attempt)["members"]
299298
)
300299
except RetryError:
301300
return False
302301

303-
return any(
304-
"tags" in member and member["tags"].get("is_creating_backup")
305-
for member in r.json()["members"]
306-
)
307-
308302
@property
309303
def is_replication_healthy(self) -> bool:
310304
"""Return whether the replication is healthy."""

tests/unit/test_charm.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -479,6 +479,12 @@ def test_on_update_status(harness):
479479
patch("ops.model.Container.pebble") as _pebble,
480480
patch("ops.model.Container.restart") as _restart,
481481
patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
482+
patch(
483+
"charm.PostgresqlOperatorCharm.is_standby_leader",
484+
new_callable=PropertyMock,
485+
return_value=False,
486+
),
487+
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
482488
):
483489
# Early exit on can connect.
484490
harness.set_can_connect(POSTGRESQL_CONTAINER, False)
@@ -1740,6 +1746,7 @@ def test_handle_postgresql_restart_need(harness):
17401746
def test_set_active_status(harness):
17411747
with (
17421748
patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
1749+
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
17431750
patch(
17441751
"charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock
17451752
) as _is_standby_leader,
@@ -1772,7 +1779,9 @@ def test_set_active_status(harness):
17721779
assert isinstance(harness.charm.unit.status, MaintenanceStatus)
17731780
else:
17741781
_is_standby_leader.side_effect = None
1775-
_is_standby_leader.return_value = values[1]
1782+
_is_standby_leader.return_value = (
1783+
values[0] != harness.charm.unit.name and values[1]
1784+
)
17761785
harness.charm._set_active_status()
17771786
assert isinstance(
17781787
harness.charm.unit.status,

tests/unit/test_patroni.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,7 @@ def test_get_primary(harness, patroni):
8888
_get.assert_called_once_with(
8989
"http://postgresql-k8s-0:8008/cluster",
9090
verify=True,
91-
timeout=5,
91+
timeout=10,
9292
auth=patroni._patroni_auth,
9393
)
9494

@@ -99,7 +99,7 @@ def test_get_primary(harness, patroni):
9999
_get.assert_called_once_with(
100100
"http://postgresql-k8s-0:8008/cluster",
101101
verify=True,
102-
timeout=5,
102+
timeout=10,
103103
auth=patroni._patroni_auth,
104104
)
105105

0 commit comments

Comments
 (0)