1
1
# Copyright 2022 Canonical Ltd.
2
2
# See LICENSE file for licensing details.
3
+ import logging
3
4
import os
4
5
import random
5
6
import subprocess
22
23
23
24
from ..helpers import APPLICATION_NAME , db_connect , get_unit_address , run_command_on_unit
24
25
26
+ logger = logging .getLogger (__name__ )
27
+
25
28
METADATA = yaml .safe_load (Path ("./metadata.yaml" ).read_text ())
26
29
PORT = 5432
27
30
APP_NAME = METADATA ["name" ]
@@ -74,13 +77,19 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
74
77
return True
75
78
76
79
77
async def are_writes_increasing(
    ops_test, down_unit: str = None, use_ip_from_inside: bool = False
) -> None:
    """Assert that every member's write count keeps growing.

    A first count is taken per member; each member is then polled again (every
    3 seconds, for up to 3 minutes) until its count exceeds that first value.

    Args:
        ops_test: OpsTest instance.
        down_unit: name of a unit to pass to ``count_writes`` as being down.
        use_ip_from_inside: forwarded unchanged to ``count_writes``.
    """
    # Both the baseline count and every poll use the same arguments.
    count_kwargs = {"down_unit": down_unit, "use_ip_from_inside": use_ip_from_inside}

    baseline, _ = await count_writes(ops_test, **count_kwargs)
    for member, count in baseline.items():
        polling = Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3))
        for attempt in polling:
            with attempt:
                latest, _ = await count_writes(ops_test, **count_kwargs)
                assert latest[member] > count, f"{member}: writes not continuing to DB"
85
94
86
95
@@ -161,33 +170,46 @@ async def change_wal_settings(
161
170
)
162
171
163
172
164
async def is_cluster_updated(
    ops_test: OpsTest, primary_name: str, use_ip_from_inside: bool = False
) -> None:
    """Assert that the cluster is consistent around the former primary.

    Four checks run in order: the former primary is a replica, Patroni lists
    exactly the application's unit IPs as members, no writes were missed once
    the continuous writes are stopped, and the former primary holds all the
    expected writes.

    Args:
        ops_test: OpsTest instance.
        primary_name: name of the unit that used to be the primary.
        use_ip_from_inside: whether to read unit IPs from inside the units
            (``get_ip_from_inside_the_unit``) instead of using ``get_unit_ip``.

    Raises:
        AssertionError: if one of the checks fails.
    """
    # Choose the IP lookup once; it is called for every unit of the application.
    resolve_ip = get_ip_from_inside_the_unit if use_ip_from_inside else get_unit_ip

    # Verify that the old primary is now a replica.
    logger.info("checking that the former primary is now a replica")
    demoted = await is_replica(ops_test, primary_name, use_ip_from_inside)
    assert demoted, "there are more than one primary in the cluster."

    # Verify that all units are part of the same cluster.
    logger.info("checking that all units are part of the same cluster")
    member_ips = await fetch_cluster_members(ops_test, use_ip_from_inside)
    app = primary_name.split("/")[0]
    ip_addresses = []
    for unit in ops_test.model.applications[app].units:
        ip_addresses.append(await resolve_ip(ops_test, unit.name))
    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

    # Verify that no writes to the database were missed after stopping the writes.
    logger.info("checking that no writes to the database were missed after stopping the writes")
    total_expected_writes = await check_writes(ops_test, use_ip_from_inside)

    # Verify that old primary is up-to-date.
    logger.info("checking that the former primary is up to date with the cluster after restarting")
    caught_up = await is_secondary_up_to_date(
        ops_test, primary_name, total_expected_writes, use_ip_from_inside
    )
    assert caught_up, "secondary not up to date with the cluster after restarting."
185
205
186
206
187
- async def check_writes (ops_test ) -> int :
207
+ async def check_writes (ops_test , use_ip_from_inside : bool = False ) -> int :
188
208
"""Gets the total writes from the test charm and compares to the writes from db."""
189
209
total_expected_writes = await stop_continuous_writes (ops_test )
190
- actual_writes , max_number_written = await count_writes (ops_test )
210
+ actual_writes , max_number_written = await count_writes (
211
+ ops_test , use_ip_from_inside = use_ip_from_inside
212
+ )
191
213
for member , count in actual_writes .items ():
192
214
assert (
193
215
count == max_number_written [member ]
@@ -197,14 +219,20 @@ async def check_writes(ops_test) -> int:
197
219
198
220
199
221
async def count_writes (
200
- ops_test : OpsTest , down_unit : str = None
222
+ ops_test : OpsTest , down_unit : str = None , use_ip_from_inside : bool = False
201
223
) -> Tuple [Dict [str , int ], Dict [str , int ]]:
202
224
"""Count the number of writes in the database."""
203
225
app = await app_name (ops_test )
204
226
password = await get_password (ops_test , app , down_unit )
205
227
for unit in ops_test .model .applications [app ].units :
206
228
if unit .name != down_unit :
207
- cluster = get_patroni_cluster (await get_unit_ip (ops_test , unit .name ))
229
+ cluster = get_patroni_cluster (
230
+ await (
231
+ get_ip_from_inside_the_unit (ops_test , unit .name )
232
+ if use_ip_from_inside
233
+ else get_unit_ip (ops_test , unit .name )
234
+ )
235
+ )
208
236
break
209
237
down_ips = []
210
238
if down_unit :
@@ -263,16 +291,21 @@ def cut_network_from_unit_without_ip_change(machine_name: str) -> None:
263
291
subprocess .check_call (limit_set_command .split ())
264
292
265
293
266
- async def fetch_cluster_members (ops_test : OpsTest ):
294
+ async def fetch_cluster_members (ops_test : OpsTest , use_ip_from_inside : bool = False ):
267
295
"""Fetches the IPs listed by Patroni as cluster members.
268
296
269
297
Args:
270
298
ops_test: OpsTest instance.
299
+ use_ip_from_inside: whether to use the IP from inside the unit.
271
300
"""
272
301
app = await app_name (ops_test )
273
302
member_ips = {}
274
303
for unit in ops_test .model .applications [app ].units :
275
- unit_ip = await get_unit_ip (ops_test , unit .name )
304
+ unit_ip = await (
305
+ get_ip_from_inside_the_unit (ops_test , unit .name )
306
+ if use_ip_from_inside
307
+ else get_unit_ip (ops_test , unit .name )
308
+ )
276
309
cluster_info = requests .get (f"http://{ unit_ip } :8008/cluster" )
277
310
if len (member_ips ) > 0 :
278
311
# If the list of members IPs was already fetched, also compare the
@@ -304,6 +337,16 @@ async def get_controller_machine(ops_test: OpsTest) -> str:
304
337
][0 ]
305
338
306
339
340
async def get_ip_from_inside_the_unit(ops_test: OpsTest, unit_name: str) -> str:
    """Return the first IP address a unit reports for itself.

    The address is read by running ``hostname -I`` in the unit through
    ``juju exec`` rather than being taken from the Juju status.

    Args:
        ops_test: OpsTest instance.
        unit_name: name of the unit to query.

    Returns:
        The first address printed by ``hostname -I``.

    Raises:
        ProcessError: if the command exits with a non-zero code or prints no
            address at all.
    """
    command = f"exec --unit {unit_name} -- hostname -I"
    return_code, stdout, stderr = await ops_test.juju(*command.split())
    if return_code != 0:
        # Build the message eagerly: exceptions do not interpolate %-style arguments.
        raise ProcessError(
            f"Expected command {command} to succeed instead it failed: {return_code} {stderr}"
        )

    # `hostname -I` prints every configured address on one line separated by
    # spaces, so split on whitespace and keep only the first one.
    addresses = stdout.split()
    if not addresses:
        raise ProcessError(f"Command {command} returned no IP address for unit {unit_name}")
    return addresses[0]
348
+
349
+
307
350
async def get_patroni_setting (ops_test : OpsTest , setting : str ) -> Optional [int ]:
308
351
"""Get the value of one of the integer Patroni settings.
309
352
@@ -388,20 +431,28 @@ async def get_unit_ip(ops_test: OpsTest, unit_name: str) -> str:
388
431
389
432
390
433
@retry(stop=stop_after_attempt(8), wait=wait_fixed(15), reraise=True)
async def is_connection_possible(
    ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False
) -> bool:
    """Check whether a PostgreSQL connection to the unit can be established.

    The probe connects and runs ``SELECT 1;``, retrying every 3 seconds for up
    to 60 seconds. Errors raised before the probe (password or address lookup)
    are not handled here and propagate to the ``retry`` decorator.

    Args:
        ops_test: OpsTest instance.
        unit_name: name of the unit to connect to.
        use_ip_from_inside: whether to read the unit IP from inside the unit
            instead of using ``get_unit_ip``.

    Returns:
        Whether the query returned 1; False when the probe kept failing.
    """
    application = unit_name.partition("/")[0]
    password = await get_password(ops_test, application, unit_name)
    if use_ip_from_inside:
        address = await get_ip_from_inside_the_unit(ops_test, unit_name)
    else:
        address = await get_unit_ip(ops_test, unit_name)

    probe = Retrying(stop=stop_after_delay(60), wait=wait_fixed(3))
    try:
        for attempt in probe:
            with attempt:
                with db_connect(host=address, password=password) as connection:
                    with connection.cursor() as cursor:
                        cursor.execute("SELECT 1;")
                        success = cursor.fetchone()[0] == 1
                connection.close()
                return success
    except (psycopg2.Error, RetryError):
        # Error raised when the connection is not possible.
        return False
407
458
@@ -420,9 +471,13 @@ def is_machine_reachable_from(origin_machine: str, target_machine: str) -> bool:
420
471
return False
421
472
422
473
423
- async def is_replica (ops_test : OpsTest , unit_name : str ) -> bool :
474
+ async def is_replica (ops_test : OpsTest , unit_name : str , use_ip_from_inside : bool = False ) -> bool :
424
475
"""Returns whether the unit a replica in the cluster."""
425
- unit_ip = await get_unit_ip (ops_test , unit_name )
476
+ unit_ip = await (
477
+ get_ip_from_inside_the_unit (ops_test , unit_name )
478
+ if use_ip_from_inside
479
+ else get_unit_ip (ops_test , unit_name )
480
+ )
426
481
member_name = unit_name .replace ("/" , "-" )
427
482
428
483
try :
@@ -532,9 +587,13 @@ async def send_signal_to_process(
532
587
)
533
588
534
589
535
- async def is_postgresql_ready (ops_test , unit_name : str ) -> bool :
590
+ async def is_postgresql_ready (ops_test , unit_name : str , use_ip_from_inside : bool = False ) -> bool :
536
591
"""Verifies a PostgreSQL instance is running and available."""
537
- unit_ip = get_unit_address (ops_test , unit_name )
592
+ unit_ip = (
593
+ (await get_ip_from_inside_the_unit (ops_test , unit_name ))
594
+ if use_ip_from_inside
595
+ else get_unit_address (ops_test , unit_name )
596
+ )
538
597
try :
539
598
for attempt in Retrying (stop = stop_after_delay (60 * 5 ), wait = wait_fixed (3 )):
540
599
with attempt :
@@ -571,15 +630,21 @@ def restore_network_for_unit_without_ip_change(machine_name: str) -> None:
571
630
subprocess .check_call (limit_set_command .split ())
572
631
573
632
574
- async def is_secondary_up_to_date (ops_test : OpsTest , unit_name : str , expected_writes : int ) -> bool :
633
+ async def is_secondary_up_to_date (
634
+ ops_test : OpsTest , unit_name : str , expected_writes : int , use_ip_from_inside : bool = False
635
+ ) -> bool :
575
636
"""Checks if secondary is up-to-date with the cluster.
576
637
577
638
Retries over the period of one minute to give secondary adequate time to copy over data.
578
639
"""
579
640
app = await app_name (ops_test )
580
641
password = await get_password (ops_test , app )
581
642
host = [
582
- await get_unit_ip (ops_test , unit .name )
643
+ await (
644
+ get_ip_from_inside_the_unit (ops_test , unit .name )
645
+ if use_ip_from_inside
646
+ else get_unit_ip (ops_test , unit .name )
647
+ )
583
648
for unit in ops_test .model .applications [app ].units
584
649
if unit .name == unit_name
585
650
][0 ]
@@ -679,15 +744,17 @@ async def update_restart_condition(ops_test: OpsTest, unit, condition: str):
679
744
680
745
681
746
@retry(stop=stop_after_attempt(20), wait=wait_fixed(30))
async def wait_network_restore(ops_test: OpsTest, unit_name: str, old_ip: str) -> None:
    """Wait until the unit reports an IP address different from the old one.

    Raising while the address is unchanged makes the ``retry`` decorator call
    the function again (up to 20 attempts, 30 seconds apart).

    Args:
        ops_test: pytest plugin helper
        unit_name: name of the unit
        old_ip: old registered IP address
    """
    # Retrieve the unit IP from inside the unit because it may not be updated in the
    # Juju status too quickly.
    current_ip = await get_ip_from_inside_the_unit(ops_test, unit_name)
    if current_ip == old_ip:
        raise Exception
692
759
693
760
0 commit comments