
Commit a179e9d

[MISC] Converge cluster status calls (#1014)
* Switch most calls to the parallel cluster call
* Remove dead code
* Use cached get primary
* Revert cached primary call
1 parent 9f969b8 commit a179e9d

File tree

4 files changed (+42, -140 lines)

src/cluster.py

Lines changed: 22 additions & 73 deletions
@@ -224,17 +224,10 @@ def _change_owner(self, path: str) -> None:
         os.chown(path, uid=user_database.pw_uid, gid=user_database.pw_gid)

     @property
-    @retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=10))
     def cluster_members(self) -> set:
         """Get the current cluster members."""
         # Request info from cluster endpoint (which returns all members of the cluster).
-        cluster_status = requests.get(
-            f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
-            verify=self.verify,
-            timeout=API_REQUEST_TIMEOUT,
-            auth=self._patroni_auth,
-        )
-        return {member["name"] for member in cluster_status.json()["members"]}
+        return {member["name"] for member in self.cached_cluster_status}

     def _create_directory(self, path: str, mode: int) -> None:
         """Creates a directory.
@@ -256,6 +249,11 @@ def get_postgresql_version(self) -> str:
             if snp["name"] == charm_refresh.snap_name():
                 return snp["version"]

+    @property
+    def cached_cluster_status(self):
+        """Cached cluster status."""
+        return self.cluster_status()
+
     def cluster_status(self, alternative_endpoints: list | None = None) -> list[ClusterMember]:
         """Query the cluster status."""
         # Request info from cluster endpoint (which returns all members of the cluster).
@@ -411,27 +409,22 @@ def are_all_members_ready(self) -> bool:
         # Request info from cluster endpoint
         # (which returns all members of the cluster and their states).
         try:
-            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
-                with attempt:
-                    cluster_status = requests.get(
-                        f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
-                        verify=self.verify,
-                        timeout=API_REQUEST_TIMEOUT,
-                        auth=self._patroni_auth,
-                    )
+            members = self.cluster_status()
         except RetryError:
             return False

         # Check if all members are running and one of them is a leader (primary) or
         # a standby leader, because sometimes there may exist (for some period of time)
         # only replicas after a failed switchover.
-        return all(
-            member["state"] in STARTED_STATES for member in cluster_status.json()["members"]
-        ) and any(
-            member["role"] in ["leader", "standby_leader"]
-            for member in cluster_status.json()["members"]
+        return all(member["state"] in STARTED_STATES for member in members) and any(
+            member["role"] in ["leader", "standby_leader"] for member in members
         )

+    @property
+    def cached_patroni_health(self) -> dict[str, str]:
+        """Cached local unit health."""
+        return self.get_patroni_health()
+
     def get_patroni_health(self) -> dict[str, str]:
         """Gets, retries and parses the Patroni health endpoint."""
         for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(7)):
@@ -452,20 +445,12 @@ def is_creating_backup(self) -> bool:
         # cluster member; the "is_creating_backup" tag means that the member is creating
         # a backup).
         try:
-            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
-                with attempt:
-                    r = requests.get(
-                        f"{self._patroni_url}/cluster",
-                        verify=self.verify,
-                        auth=self._patroni_auth,
-                        timeout=PATRONI_TIMEOUT,
-                    )
+            members = self.cached_cluster_status
         except RetryError:
             return False

         return any(
-            "tags" in member and member["tags"].get("is_creating_backup")
-            for member in r.json()["members"]
+            "tags" in member and member["tags"].get("is_creating_backup") for member in members
         )

     def is_replication_healthy(self) -> bool:
@@ -509,7 +494,7 @@ def member_started(self) -> bool:
         if not self.is_patroni_running():
             return False
         try:
-            response = self.get_patroni_health()
+            response = self.cached_patroni_health
         except RetryError:
             return False

@@ -524,7 +509,7 @@ def member_inactive(self) -> bool:
         seconds times to allow server time to start up.
         """
         try:
-            response = self.get_patroni_health()
+            response = self.cached_patroni_health
         except RetryError:
             return True

@@ -535,27 +520,6 @@ def member_inactive(self) -> bool:
             "restarting",
         ]

-    @property
-    def member_replication_lag(self) -> str:
-        """Member replication lag."""
-        try:
-            for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
-                with attempt:
-                    cluster_status = requests.get(
-                        f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
-                        verify=self.verify,
-                        timeout=API_REQUEST_TIMEOUT,
-                        auth=self._patroni_auth,
-                    )
-        except RetryError:
-            return "unknown"
-
-        for member in cluster_status.json()["members"]:
-            if member["name"] == self.member_name:
-                return member.get("lag", "unknown")
-
-        return "unknown"
-
     @property
     def is_member_isolated(self) -> bool:
         """Returns whether the unit is isolated from the cluster."""
@@ -589,16 +553,8 @@ def online_cluster_members(self) -> list[ClusterMember]:
     def are_replicas_up(self) -> dict[str, bool] | None:
         """Check if cluster members are running or streaming."""
         try:
-            response = requests.get(
-                f"{self._patroni_url}/cluster",
-                verify=self.verify,
-                auth=self._patroni_auth,
-                timeout=PATRONI_TIMEOUT,
-            )
-            return {
-                member["host"]: member["state"] in ["running", "streaming"]
-                for member in response.json()["members"]
-            }
+            members = self.cluster_status()
+            return {member["host"]: member["state"] in STARTED_STATES for member in members}
         except Exception:
             logger.exception("Unable to get the state of the cluster")
             return
@@ -909,15 +865,8 @@ def reinitialise_raft_data(self) -> None:
     def get_running_cluster_members(self) -> list[str]:
         """List running patroni members."""
         try:
-            members = requests.get(
-                f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
-                verify=self.verify,
-                timeout=API_REQUEST_TIMEOUT,
-                auth=self._patroni_auth,
-            ).json()["members"]
-            return [
-                member["name"] for member in members if member["state"] in ("streaming", "running")
-            ]
+            members = self.cluster_status()
+            return [member["name"] for member in members if member["state"] in STARTED_STATES]
         except Exception:
             return []
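Taken together, the src/cluster.py changes route every cluster-status lookup through one entry point: call sites use cluster_status() (or the cached_cluster_status property, which in this commit simply delegates) instead of issuing their own requests.get with a private retry loop. Below is a minimal sketch of the resulting shape, not the charm's exact code: the endpoint path, timeout, retry policy, and STARTED_STATES value are assumptions the diff does not pin down.

import requests
from tenacity import Retrying, stop_after_delay, wait_fixed

STARTED_STATES = ("running", "streaming")  # assumed definition


class PatroniSketch:
    """Illustrative stand-in for the Patroni helper in src/cluster.py."""

    def __init__(self, patroni_url: str) -> None:
        self._patroni_url = patroni_url

    def cluster_status(self) -> list[dict]:
        # The single place that queries Patroni's /cluster endpoint;
        # tenacity raises RetryError once every attempt has failed.
        for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
            with attempt:
                response = requests.get(f"{self._patroni_url}/cluster", timeout=5)
                return response.json()["members"]

    @property
    def cached_cluster_status(self) -> list[dict]:
        # Delegates for now; gives call sites a stable seam where
        # caching can be layered in later.
        return self.cluster_status()

    @property
    def cluster_members(self) -> set:
        # A converged call site: no private requests.get, no private retry loop.
        return {member["name"] for member in self.cached_cluster_status}

The health-check path follows the same pattern: get_patroni_health keeps its own retry loop, and member_started / member_inactive now read it through the cached_patroni_health property.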

src/cluster_topology_observer.py

Lines changed: 0 additions & 6 deletions
@@ -121,9 +121,3 @@ def stop_observer(self):
             self._charm._peers.data[self._charm.unit].update({"observer-pid": ""})
         except OSError:
             pass
-
-    @property
-    def unit_tag(self):
-        """Juju-style tag identifying the unit being run by this charm."""
-        unit_num = self._charm.unit.name.split("/")[-1]
-        return f"unit-{self._charm.app.name}-{unit_num}"

tests/unit/test_charm.py

Lines changed: 4 additions & 11 deletions
@@ -803,9 +803,6 @@ def test_on_update_status(harness):
         ) as _set_primary_status_message,
         patch("charm.Patroni.restart_patroni") as _restart_patroni,
         patch("charm.Patroni.is_member_isolated") as _is_member_isolated,
-        patch(
-            "charm.Patroni.member_replication_lag", new_callable=PropertyMock
-        ) as _member_replication_lag,
         patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
         patch(
             "charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock
@@ -1383,9 +1380,6 @@ def test_on_peer_relation_changed(harness):
             "backups.PostgreSQLBackups.start_stop_pgbackrest_service"
         ) as _start_stop_pgbackrest_service,
         patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql,
-        patch(
-            "charm.Patroni.member_replication_lag", new_callable=PropertyMock
-        ) as _member_replication_lag,
         patch("charm.PostgresqlOperatorCharm.is_standby_leader") as _is_standby_leader,
         patch("charm.PostgresqlOperatorCharm.is_primary") as _is_primary,
         patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
@@ -1466,15 +1460,14 @@ def test_on_peer_relation_changed(harness):
     # huge or unknown lag.
     relation = harness.model.get_relation(PEER, rel_id)
     _member_started.return_value = True
-    for values in itertools.product([True, False], ["0", "1000", "1001", "unknown"]):
+    for values in [True, False]:
         _defer.reset_mock()
         _start_stop_pgbackrest_service.reset_mock()
-        _is_primary.return_value = values[0]
-        _is_standby_leader.return_value = values[0]
-        _member_replication_lag.return_value = values[1]
+        _is_primary.return_value = values
+        _is_standby_leader.return_value = values
         harness.charm.unit.status = ActiveStatus()
         harness.charm.on.database_peers_relation_changed.emit(relation)
-        if _is_primary.return_value == values[0] or int(values[1]) <= 1000:
+        if _is_primary.return_value == values:
             _defer.assert_not_called()
             _start_stop_pgbackrest_service.assert_called_once()
             assert isinstance(harness.charm.unit.status, ActiveStatus)

tests/unit/test_cluster.py

Lines changed: 16 additions & 50 deletions
@@ -12,7 +12,6 @@
 from pysyncobj.utility import UtilityException
 from tenacity import (
     RetryError,
-    Retrying,
     stop_after_delay,
     wait_fixed,
 )
@@ -92,11 +91,9 @@ def patroni(harness, peers_ips):


 def test_get_member_ip(peers_ips, patroni):
-    with (
-        patch(
-            "charm.Patroni.parallel_patroni_get_request", return_value=None
-        ) as _parallel_patroni_get_request,
-    ):
+    with patch(
+        "charm.Patroni.parallel_patroni_get_request", return_value=None
+    ) as _parallel_patroni_get_request:
         # No IP if no members
         assert patroni.get_member_ip(patroni.member_name) is None

@@ -201,21 +198,16 @@ def test_get_primary(peers_ips, patroni):


 def test_is_creating_backup(peers_ips, patroni):
-    with patch("requests.get") as _get:
+    with patch("charm.Patroni.cluster_status") as _cluster_status:
         # Test when one member is creating a backup.
-        response = _get.return_value
-        response.json.return_value = {
-            "members": [
-                {"name": "postgresql-0"},
-                {"name": "postgresql-1", "tags": {"is_creating_backup": True}},
-            ]
-        }
+        _cluster_status.return_value = [
+            {"name": "postgresql-0"},
+            {"name": "postgresql-1", "tags": {"is_creating_backup": True}},
+        ]
         assert patroni.is_creating_backup

         # Test when no member is creating a backup.
-        response.json.return_value = {
-            "members": [{"name": "postgresql-0"}, {"name": "postgresql-1"}]
-        }
+        _cluster_status.return_value = [{"name": "postgresql-0"}, {"name": "postgresql-1"}]
         assert not patroni.is_creating_backup


@@ -411,28 +403,6 @@ def test_stop_patroni(peers_ips, patroni):
     assert not patroni.stop_patroni()


-def test_member_replication_lag(peers_ips, patroni):
-    with (
-        patch("requests.get", side_effect=mocked_requests_get) as _get,
-        patch("charm.Patroni._patroni_url", new_callable=PropertyMock) as _patroni_url,
-    ):
-        # Test when the cluster member has a value for the lag field.
-        _patroni_url.return_value = "http://server1"
-        lag = patroni.member_replication_lag
-        assert lag == "1"
-
-        # Test when the cluster member doesn't have a value for the lag field.
-        patroni.member_name = "postgresql-1"
-        lag = patroni.member_replication_lag
-        assert lag == "unknown"
-
-        # Test when the API call fails.
-        _patroni_url.return_value = "http://server2"
-        with patch.object(Retrying, "iter", Mock(side_effect=RetryError(None))):
-            lag = patroni.member_replication_lag
-            assert lag == "unknown"
-
-
 def test_reinitialize_postgresql(peers_ips, patroni):
     with patch("requests.post") as _post:
         patroni.reinitialize_postgresql()
@@ -888,18 +858,14 @@ def test_reinitialise_raft_data(patroni):


 def test_are_replicas_up(patroni):
-    with (
-        patch("requests.get") as _get,
-    ):
-        _get.return_value.json.return_value = {
-            "members": [
-                {"host": "1.1.1.1", "state": "running"},
-                {"host": "2.2.2.2", "state": "streaming"},
-                {"host": "3.3.3.3", "state": "other state"},
-            ]
-        }
+    with patch("charm.Patroni.cluster_status") as _cluster_status:
+        _cluster_status.return_value = [
+            {"host": "1.1.1.1", "state": "running"},
+            {"host": "2.2.2.2", "state": "streaming"},
+            {"host": "3.3.3.3", "state": "other state"},
+        ]
         assert patroni.are_replicas_up() == {"1.1.1.1": True, "2.2.2.2": True, "3.3.3.3": False}

         # Return None on error
-        _get.side_effect = Exception
+        _cluster_status.side_effect = Exception
         assert patroni.are_replicas_up() is None
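The test updates mirror the convergence: rather than fabricating a requests response wrapping a {"members": [...]} payload, the tests patch the single cluster_status seam and return member dicts directly. A hypothetical standalone example in the same style, reusing the PatroniSketch class from the earlier sketch (the test name and address are illustrative, not from this repository):

from unittest.mock import patch


def test_cluster_members_reads_shared_status():
    with patch.object(PatroniSketch, "cluster_status") as _cluster_status:
        # No HTTP layer to fake: the mock hands back member dicts directly.
        _cluster_status.return_value = [{"name": "postgresql-0"}, {"name": "postgresql-1"}]
        patroni = PatroniSketch("http://10.0.0.1:8008")  # arbitrary; never contacted
        assert patroni.cluster_members == {"postgresql-0", "postgresql-1"}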
