@@ -292,16 +292,48 @@ def get_ssh_connection(instance_ip, ssh_identity_file, max_retries=10):
292292 temp_key .get_priv_key_file (),
293293 )
294294
def run_detailed_checks(host):
    """Log diagnostic information about the PostgreSQL setup on ``host``.

    Runs a fixed series of shell commands through ``host.run`` and logs the
    stdout and stderr of each at INFO level. Command failures are not
    treated as errors here: whatever a command printed is logged either way,
    and nothing is returned.

    NOTE: this is not a read-only probe. One step tries to start PostgreSQL
    directly with ``pg_ctl`` so that its startup output can be captured.

    Args:
        host: Object exposing ``run(command)`` whose result has ``stdout``
            and ``stderr`` attributes (presumably a testinfra host; verify
            against the caller).
    """

    def log_command(label, command):
        """Run ``command`` on the host and log its output under ``label``."""
        result = host.run(command)
        logger.info(f"{label}:\n{result.stdout}\n{result.stderr}")

    logger.info("Running detailed system checks...")

    # Log Nix profile setup checks
    logger.info("Checking Nix profile setup:")
    log_command(
        "Nix profile directory",
        "ls -la /var/lib/postgresql/.nix-profile",
    )
    log_command(
        "Nix profile bin directory",
        "ls -la /var/lib/postgresql/.nix-profile/bin",
    )

    # Check PostgreSQL logs directory
    logger.info("Checking PostgreSQL logs directory:")
    log_command("log directory contents", "sudo ls -la /var/log/postgresql/")

    # Check any existing PostgreSQL logs. The glob is wrapped in `sh -c` so
    # that it is expanded by the root shell: with a bare `sudo cat .../*.log`
    # the calling (unprivileged) shell expands the pattern before sudo runs,
    # and passes it through literally if it cannot list the directory.
    logger.info("Checking existing PostgreSQL logs:")
    log_command(
        "postgresql logs",
        "sudo sh -c 'cat /var/log/postgresql/*.log'",
    )

    # Try starting PostgreSQL directly with pg_ctl and capture output
    logger.info("Attempting to start PostgreSQL directly with pg_ctl:")
    startup_log = "/tmp/postgres-start.log"
    log_command(
        "pg_ctl start attempt",
        "sudo -u postgres /usr/lib/postgresql/bin/pg_ctl "
        f"-D /var/lib/postgresql/data start -l {startup_log}",
    )

    # Check the startup log
    logger.info("PostgreSQL startup log:")
    log_command("startup log contents", f"sudo cat {startup_log}")

    # Clean up the startup log. `pg_ctl -l` appends to the file, so leaving
    # it behind would make every later call re-log output from earlier
    # start attempts.
    host.run(f"sudo rm -f {startup_log}")

    # Check PostgreSQL environment
    logger.info("PostgreSQL environment:")
    log_command("postgres environment", "sudo -u postgres env | grep POSTGRES")
331+
295332 def is_healthy (host , instance_ip , ssh_identity_file ) -> bool :
296333 health_checks = [
297334 (
298335 "postgres" ,
299336 lambda h : (
300- # First check if PostgreSQL is running
301- h .run ("sudo systemctl is-active postgresql" ),
302- # Then check if the socket directory exists and has correct permissions
303- h .run ("sudo ls -la /run/postgresql" ),
304- # Then try pg_isready
305337 h .run ("sudo -u postgres /usr/bin/pg_isready -U postgres" )
306338 ),
307339 ),
@@ -333,114 +365,24 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
333365 # For PostgreSQL, we need to check multiple things
334366 systemd_status , socket_check , pg_isready = check (host )
335367
336- # Log Nix profile setup checks
337- logger .info ("Checking Nix profile setup:" )
338- nix_profile_result = host .run ("ls -la /home/postgres/.nix-profile" )
339- logger .info (f"Nix profile directory:\n { nix_profile_result .stdout } \n { nix_profile_result .stderr } " )
340-
341- nix_bin_result = host .run ("ls -la /home/postgres/.nix-profile/bin" )
342- logger .info (f"Nix profile bin directory:\n { nix_bin_result .stdout } \n { nix_bin_result .stderr } " )
343-
344- nix_script_result = host .run ("test -x /home/postgres/.nix-profile/bin/switch_pg_cron_version" )
345- logger .info (f"Switch script executable check: { 'success' if not nix_script_result .failed else 'failed' } " )
346-
347- nix_script_output = host .run ("/home/postgres/.nix-profile/bin/switch_pg_cron_version" )
348- logger .info (f"Switch script output:\n { nix_script_output .stdout } \n { nix_script_output .stderr } " )
349-
350368 if systemd_status .failed :
351369 logger .error ("PostgreSQL systemd service is not active" )
352370 logger .error (f"systemd status: { systemd_status .stdout } " )
353371 logger .error (f"systemd error: { systemd_status .stderr } " )
354-
355- # Check systemd service unit file
356- logger .error ("PostgreSQL systemd service unit file:" )
357- result = host .run ("sudo systemctl cat postgresql" )
358- logger .error (f"service unit file:\n { result .stdout } \n { result .stderr } " )
359-
360- # Check systemd service environment
361- logger .error ("PostgreSQL systemd service environment:" )
362- result = host .run ("sudo systemctl show postgresql" )
363- logger .error (f"service environment:\n { result .stdout } \n { result .stderr } " )
364-
365- # Check systemd service dependencies
366- logger .error ("PostgreSQL systemd service dependencies:" )
367- result = host .run ("sudo systemctl list-dependencies postgresql" )
368- logger .error (f"service dependencies:\n { result .stdout } \n { result .stderr } " )
369-
370- # Check if service is enabled
371- logger .error ("PostgreSQL service enabled status:" )
372- result = host .run ("sudo systemctl is-enabled postgresql" )
373- logger .error (f"service enabled status:\n { result .stdout } \n { result .stderr } " )
374-
375- # Check systemd journal for service execution logs
376- logger .error ("Systemd journal entries for PostgreSQL service execution:" )
377- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager" )
378- logger .error (f"systemd journal:\n { result .stdout } \n { result .stderr } " )
379-
380- # Check systemd journal specifically for ExecStartPre and ExecStart
381- logger .error ("Systemd journal entries for ExecStartPre and ExecStart:" )
382- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager | grep -E 'ExecStartPre|ExecStart'" )
383- logger .error (f"execution logs:\n { result .stdout } \n { result .stderr } " )
384-
385- # Check systemd journal for any errors
386- logger .error ("Systemd journal entries with error level:" )
387- result = host .run ("sudo journalctl -u postgresql -n 100 --no-pager -p err" )
388- logger .error (f"error logs:\n { result .stdout } \n { result .stderr } " )
389372
390- # Check pre-start script output
391- logger .error ("Checking pre-start script output:" )
392- result = host .run ("sudo -u postgres /usr/local/bin/postgres_prestart.sh" )
393- logger .error (f"pre-start script output:\n { result .stdout } \n { result .stderr } " )
394-
395- # Check PostgreSQL logs directory
396- logger .error ("Checking PostgreSQL logs directory:" )
397- result = host .run ("sudo ls -la /var/log/postgresql/" )
398- logger .error (f"log directory contents:\n { result .stdout } \n { result .stderr } " )
399-
400- # Check any existing PostgreSQL logs
401- logger .error ("Checking existing PostgreSQL logs:" )
402- result = host .run ("sudo cat /var/log/postgresql/*.log" )
403- logger .error (f"postgresql logs:\n { result .stdout } \n { result .stderr } " )
404-
405- # Try starting PostgreSQL directly with pg_ctl and capture output
406- logger .error ("Attempting to start PostgreSQL directly with pg_ctl:" )
407- startup_log = "/tmp/postgres-start.log"
408- result = host .run (f"sudo -u postgres /usr/lib/postgresql/bin/pg_ctl -D /var/lib/postgresql/data start -l { startup_log } " )
409- logger .error (f"pg_ctl start attempt:\n { result .stdout } \n { result .stderr } " )
410-
411- # Check the startup log
412- logger .error ("PostgreSQL startup log:" )
413- result = host .run (f"sudo cat { startup_log } " )
414- logger .error (f"startup log contents:\n { result .stdout } \n { result .stderr } " )
415-
416- # Clean up the startup log
417- result = host .run (f"sudo rm -f { startup_log } " )
418-
419- # Check PostgreSQL configuration
420- logger .error ("PostgreSQL configuration:" )
421- result = host .run ("sudo cat /etc/postgresql/postgresql.conf" )
422- logger .error (f"postgresql.conf:\n { result .stdout } \n { result .stderr } " )
423-
424- # Check PostgreSQL authentication configuration
425- logger .error ("PostgreSQL authentication configuration:" )
426- result = host .run ("sudo cat /etc/postgresql/pg_hba.conf" )
427- logger .error (f"pg_hba.conf:\n { result .stdout } \n { result .stderr } " )
428-
429- # Check PostgreSQL environment
430- logger .error ("PostgreSQL environment:" )
431- result = host .run ("sudo -u postgres env | grep POSTGRES" )
432- logger .error (f"postgres environment:\n { result .stdout } \n { result .stderr } " )
373+ # Run detailed checks since we know we have a working connection
374+ run_detailed_checks (host )
433375
434376 if any (cmd .failed for cmd in [systemd_status , socket_check , pg_isready ]):
435377 return False
436- else :
437- cmd = check (host )
438- if cmd .failed is True :
439- logger .warning (f"{ service } not ready" )
440- logger .error (f"{ service } command failed with rc={ cmd .rc } " )
441- logger .error (f"{ service } stdout: { cmd .stdout } " )
442- logger .error (f"{ service } stderr: { cmd .stderr } " )
443- return False
378+ else :
379+ cmd = check (host )
380+ if cmd .failed is True :
381+ logger .warning (f"{ service } not ready" )
382+ logger .error (f"{ service } command failed with rc={ cmd .rc } " )
383+ logger .error (f"{ service } stdout: { cmd .stdout } " )
384+ logger .error (f"{ service } stderr: { cmd .stderr } " )
385+ return False
444386 except Exception as e :
445387 logger .warning (
446388 f"Connection failed during { service } check, attempting reconnect..."
0 commit comments