@@ -292,16 +292,48 @@ def get_ssh_connection(instance_ip, ssh_identity_file, max_retries=10):
292
292
temp_key .get_priv_key_file (),
293
293
)
294
294
295
def run_detailed_checks(host):
    """Run a fixed set of diagnostic commands on ``host`` and log their output.

    Every command is executed through ``host.run`` and its stdout and stderr
    are written to the module logger at INFO level. The result status of the
    commands is not inspected here; output is logged whether or not a command
    succeeded.

    This function is not read-only: one step tries to start PostgreSQL with
    ``pg_ctl`` so that its startup output can be captured and logged.

    Args:
        host: Object exposing ``run(command)``; the returned result must
            provide ``stdout`` and ``stderr`` attributes.
    """
    logger.info("Running detailed system checks...")

    # Scratch file that receives the server output of the pg_ctl start attempt.
    startup_log = "/tmp/postgres-start.log"

    # Each entry is (heading logged before the command or None, shell command,
    # label used as the prefix of the logged output). Order is significant:
    # the startup log is only meaningful after the pg_ctl attempt has run.
    checks = [
        (
            "Checking Nix profile setup:",
            "ls -la /var/lib/postgresql/.nix-profile",
            "Nix profile directory",
        ),
        (
            None,
            "ls -la /var/lib/postgresql/.nix-profile/bin",
            "Nix profile bin directory",
        ),
        (
            "Checking PostgreSQL logs directory:",
            "sudo ls -la /var/log/postgresql/",
            "log directory contents",
        ),
        (
            # NOTE(review): the glob is expanded by the invoking shell before
            # sudo runs; if the connecting user cannot list the directory the
            # pattern is passed through literally - confirm this is acceptable.
            "Checking existing PostgreSQL logs:",
            "sudo cat /var/log/postgresql/*.log",
            "postgresql logs",
        ),
        (
            "Attempting to start PostgreSQL directly with pg_ctl:",
            "sudo -u postgres /usr/lib/postgresql/bin/pg_ctl"
            f" -D /var/lib/postgresql/data start -l {startup_log}",
            "pg_ctl start attempt",
        ),
        (
            "PostgreSQL startup log:",
            f"sudo cat {startup_log}",
            "startup log contents",
        ),
        (
            "PostgreSQL environment:",
            "sudo -u postgres env | grep POSTGRES",
            "postgres environment",
        ),
    ]

    for heading, command, label in checks:
        if heading is not None:
            logger.info(heading)
        result = host.run(command)
        logger.info(f"{label}:\n{result.stdout}\n{result.stderr}")

    # pg_ctl -l appends to the file, so remove it once it has been dumped;
    # otherwise a later run on the same host would re-log stale attempts and
    # the scratch file would be left behind in /tmp.
    host.run(f"sudo rm -f {startup_log}")
331
+
295
332
def is_healthy (host , instance_ip , ssh_identity_file ) -> bool :
296
333
health_checks = [
297
334
(
298
335
"postgres" ,
299
336
lambda h : (
300
- # First check if PostgreSQL is running
301
- h .run ("sudo systemctl is-active postgresql" ),
302
- # Then check if the socket directory exists and has correct permissions
303
- h .run ("sudo ls -la /run/postgresql" ),
304
- # Then try pg_isready
305
337
h .run ("sudo -u postgres /usr/bin/pg_isready -U postgres" )
306
338
),
307
339
),
@@ -333,114 +365,24 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
333
365
# For PostgreSQL, we need to check multiple things
334
366
systemd_status , socket_check , pg_isready = check (host )
335
367
336
- # Log Nix profile setup checks
337
- logger .info ("Checking Nix profile setup:" )
338
- nix_profile_result = host .run ("ls -la /home/postgres/.nix-profile" )
339
- logger .info (f"Nix profile directory:\n { nix_profile_result .stdout } \n { nix_profile_result .stderr } " )
340
-
341
- nix_bin_result = host .run ("ls -la /home/postgres/.nix-profile/bin" )
342
- logger .info (f"Nix profile bin directory:\n { nix_bin_result .stdout } \n { nix_bin_result .stderr } " )
343
-
344
- nix_script_result = host .run ("test -x /home/postgres/.nix-profile/bin/switch_pg_cron_version" )
345
- logger .info (f"Switch script executable check: { 'success' if not nix_script_result .failed else 'failed' } " )
346
-
347
- nix_script_output = host .run ("/home/postgres/.nix-profile/bin/switch_pg_cron_version" )
348
- logger .info (f"Switch script output:\n { nix_script_output .stdout } \n { nix_script_output .stderr } " )
349
-
350
368
if systemd_status .failed :
351
369
logger .error ("PostgreSQL systemd service is not active" )
352
370
logger .error (f"systemd status: { systemd_status .stdout } " )
353
371
logger .error (f"systemd error: { systemd_status .stderr } " )
354
-
355
- # Check systemd service unit file
356
- logger .error ("PostgreSQL systemd service unit file:" )
357
- result = host .run ("sudo systemctl cat postgresql" )
358
- logger .error (f"service unit file:\n { result .stdout } \n { result .stderr } " )
359
-
360
- # Check systemd service environment
361
- logger .error ("PostgreSQL systemd service environment:" )
362
- result = host .run ("sudo systemctl show postgresql" )
363
- logger .error (f"service environment:\n { result .stdout } \n { result .stderr } " )
364
-
365
- # Check systemd service dependencies
366
- logger .error ("PostgreSQL systemd service dependencies:" )
367
- result = host .run ("sudo systemctl list-dependencies postgresql" )
368
- logger .error (f"service dependencies:\n { result .stdout } \n { result .stderr } " )
369
-
370
- # Check if service is enabled
371
- logger .error ("PostgreSQL service enabled status:" )
372
- result = host .run ("sudo systemctl is-enabled postgresql" )
373
- logger .error (f"service enabled status:\n { result .stdout } \n { result .stderr } " )
374
-
375
- # Check systemd journal for service execution logs
376
- logger .error ("Systemd journal entries for PostgreSQL service execution:" )
377
- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager" )
378
- logger .error (f"systemd journal:\n { result .stdout } \n { result .stderr } " )
379
-
380
- # Check systemd journal specifically for ExecStartPre and ExecStart
381
- logger .error ("Systemd journal entries for ExecStartPre and ExecStart:" )
382
- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager | grep -E 'ExecStartPre|ExecStart'" )
383
- logger .error (f"execution logs:\n { result .stdout } \n { result .stderr } " )
384
-
385
- # Check systemd journal for any errors
386
- logger .error ("Systemd journal entries with error level:" )
387
- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager -p err" )
388
- logger .error (f"error logs:\n { result .stdout } \n { result .stderr } " )
389
372
390
- # Check pre-start script output
391
- logger .error ("Checking pre-start script output:" )
392
- result = host .run ("sudo -u postgres /usr/local/bin/postgres_prestart.sh" )
393
- logger .error (f"pre-start script output:\n { result .stdout } \n { result .stderr } " )
394
-
395
- # Check PostgreSQL logs directory
396
- logger .error ("Checking PostgreSQL logs directory:" )
397
- result = host .run ("sudo ls -la /var/log/postgresql/" )
398
- logger .error (f"log directory contents:\n { result .stdout } \n { result .stderr } " )
399
-
400
- # Check any existing PostgreSQL logs
401
- logger .error ("Checking existing PostgreSQL logs:" )
402
- result = host .run ("sudo cat /var/log/postgresql/*.log" )
403
- logger .error (f"postgresql logs:\n { result .stdout } \n { result .stderr } " )
404
-
405
- # Try starting PostgreSQL directly with pg_ctl and capture output
406
- logger .error ("Attempting to start PostgreSQL directly with pg_ctl:" )
407
- startup_log = "/tmp/postgres-start.log"
408
- result = host .run (f"sudo -u postgres /usr/lib/postgresql/bin/pg_ctl -D /var/lib/postgresql/data start -l { startup_log } " )
409
- logger .error (f"pg_ctl start attempt:\n { result .stdout } \n { result .stderr } " )
410
-
411
- # Check the startup log
412
- logger .error ("PostgreSQL startup log:" )
413
- result = host .run (f"sudo cat { startup_log } " )
414
- logger .error (f"startup log contents:\n { result .stdout } \n { result .stderr } " )
415
-
416
- # Clean up the startup log
417
- result = host .run (f"sudo rm -f { startup_log } " )
418
-
419
- # Check PostgreSQL configuration
420
- logger .error ("PostgreSQL configuration:" )
421
- result = host .run ("sudo cat /etc/postgresql/postgresql.conf" )
422
- logger .error (f"postgresql.conf:\n { result .stdout } \n { result .stderr } " )
423
-
424
- # Check PostgreSQL authentication configuration
425
- logger .error ("PostgreSQL authentication configuration:" )
426
- result = host .run ("sudo cat /etc/postgresql/pg_hba.conf" )
427
- logger .error (f"pg_hba.conf:\n { result .stdout } \n { result .stderr } " )
428
-
429
- # Check PostgreSQL environment
430
- logger .error ("PostgreSQL environment:" )
431
- result = host .run ("sudo -u postgres env | grep POSTGRES" )
432
- logger .error (f"postgres environment:\n { result .stdout } \n { result .stderr } " )
373
+ # Run detailed checks since we know we have a working connection
374
+ run_detailed_checks (host )
433
375
434
376
if any (cmd .failed for cmd in [systemd_status , socket_check , pg_isready ]):
435
377
return False
436
- else :
437
- cmd = check (host )
438
- if cmd .failed is True :
439
- logger .warning (f"{ service } not ready" )
440
- logger .error (f"{ service } command failed with rc={ cmd .rc } " )
441
- logger .error (f"{ service } stdout: { cmd .stdout } " )
442
- logger .error (f"{ service } stderr: { cmd .stderr } " )
443
- return False
378
+ else :
379
+ cmd = check (host )
380
+ if cmd .failed is True :
381
+ logger .warning (f"{ service } not ready" )
382
+ logger .error (f"{ service } command failed with rc={ cmd .rc } " )
383
+ logger .error (f"{ service } stdout: { cmd .stdout } " )
384
+ logger .error (f"{ service } stderr: { cmd .stderr } " )
385
+ return False
444
386
except Exception as e :
445
387
logger .warning (
446
388
f"Connection failed during { service } check, attempting reconnect..."
0 commit comments