@@ -36,6 +36,8 @@ import tempfile
3636import traceback
3737from functools import cache
3838import urllib .parse
39+ from contextlib import contextmanager
40+ import sys
3941
4042# for crc32
4143import zlib
@@ -65,6 +67,7 @@ MESSAGES_TO_RETRY = [
6567 "ConnectionPoolWithFailover: Connection failed at try" ,
6668 "DB::Exception: New table appeared in database being dropped or detached. Try again" ,
6769 "is already started to be removing by another replica right now" ,
70+ "(UNKNOWN_DATABASE)" ,
6871 # This is from LSan, and it indicates its own internal problem:
6972 "Unable to get registers from thread" ,
7073]
@@ -435,14 +438,25 @@ def clickhouse_execute_http(
435438 max_http_retries = 5 ,
436439 retry_error_codes = False ,
437440):
438- if base_args .secure :
439- client = http .client .HTTPSConnection (
440- host = base_args .tcp_host , port = base_args .http_port , timeout = timeout
441- )
442- else :
443- client = http .client .HTTPConnection (
444- host = base_args .tcp_host , port = base_args .http_port , timeout = timeout
445- )
441+
442+ # make sure client connections are always properly terminated
443+ @contextmanager
444+ def make_http_client ():
445+ if base_args .secure :
446+ client = http .client .HTTPSConnection (
447+ host = base_args .tcp_host , port = base_args .http_port , timeout = timeout
448+ )
449+ else :
450+ client = http .client .HTTPConnection (
451+ host = base_args .tcp_host , port = base_args .http_port , timeout = timeout
452+ )
453+ try :
454+ client .connect ()
455+ yield client
456+ finally :
457+ client .close ()
458+ del client
459+
446460
447461 timeout = int (timeout )
448462 params = {
@@ -464,22 +478,23 @@ def clickhouse_execute_http(
464478 if default_format is not None :
465479 params ["default_format" ] = default_format
466480
467- for i in range (max_http_retries ):
468- try :
469- client .request (
470- "POST" ,
471- f"/?{ base_args .client_options_query_str } { urllib .parse .urlencode (params )} " ,
472- body = body ,
473- )
474- res = client .getresponse ()
475- data = res .read ()
476- if res .status == 200 or (not retry_error_codes ):
477- break
478- except Exception as ex :
479- if i == max_http_retries - 1 :
480- raise ex
481- client .close ()
482- sleep (i + 1 )
481+ with make_http_client () as client :
482+ for i in range (max_http_retries ):
483+ try :
484+ client .request (
485+ "POST" ,
486+ f"/?{ base_args .client_options_query_str } { urllib .parse .urlencode (params )} " ,
487+ body = body ,
488+ )
489+ res = client .getresponse ()
490+ data = res .read ()
491+ if res .status == 200 or (not retry_error_codes ):
492+ break
493+ except Exception as ex :
494+ if i == max_http_retries - 1 :
495+ raise ex
496+ client .close ()
497+ sleep (i + 1 )
483498
484499 if res .status != 200 :
485500 raise HTTPError (data .decode (), res .status )
@@ -665,6 +680,18 @@ def need_retry(args, stdout, stderr, total_time):
665680 msg in stderr for msg in MESSAGES_TO_RETRY
666681 )
667682
683+ @contextmanager
684+ def autokill_subprocess (proc : Optional [Popen ]):
685+ try :
686+ yield proc
687+ finally :
688+ if proc is not None :
689+ if proc .poll () is None :
690+ # seems to be still running
691+ print (f"child process { proc } is still running and will be force killed" )
692+ proc .kill ()
693+ # to properly cleanup stdin, stdout, etc...
694+ proc .__exit__ (* sys .exc_info ())
668695
669696def get_processlist_size (args ):
670697 if args .replicated_database :
@@ -1594,6 +1621,7 @@ class TestCase:
15941621 if proc .returncode is None :
15951622 f = io .StringIO ()
15961623 with redirect_stdout (f ):
1624+ cleanup_child_processes (proc .pid )
15971625 kill_process_group (os .getpgid (proc .pid ))
15981626 kill_output = f .getvalue ()
15991627
@@ -1858,7 +1886,23 @@ class TestCase:
18581886 description_full += "Database: " + self .testcase_args .testcase_database
18591887
18601888 if result .status == TestStatus .FAIL :
1861- description_full += "\n System disk usage:\n " + subprocess .check_output ("df -h" , shell = True ).decode ('utf-8' )
1889+ for description , command in [
1890+ ("OS info" , "lsb_release -idrc" ),
1891+ ("disk usage" , "df -h" ),
1892+ ("RAM usage" , "free -h" ),
1893+ # ("child processes", ("ps -f -g ", {'env' : {"LIBPROC_HIDE_KERNEL" : "1"}})),
1894+ ("ALL processes" , ("ps xj" , {'env' : {"LIBPROC_HIDE_KERNEL" : "1" }})),
1895+ ]:
1896+ try :
1897+ if isinstance (command , str ):
1898+ output = subprocess .check_output (command , shell = True )
1899+ else :
1900+ command , kwargs = command
1901+ output = subprocess .check_output (command , ** kwargs , shell = True )
1902+ except Exception as e :
1903+ print (f"Failed to execute post-failure command '{ command } ', got exception: { e } " )
1904+ else :
1905+ description_full += f"\n { description } :\n { output .decode ('utf-8' )} "
18621906
18631907 result .description = description_full
18641908 return result
@@ -1966,13 +2010,13 @@ class TestCase:
19662010 )
19672011
19682012 # pylint:disable-next=consider-using-with; TODO: fix
1969- proc = Popen (command , shell = True , env = tests_env , start_new_session = True )
1970-
1971- try :
1972- proc . wait ( args . timeout )
1973- except subprocess . TimeoutExpired :
1974- # Whether the test timed out will be decided later
1975- pass
2013+ with autokill_subprocess ( Popen (command , shell = True , env = tests_env , start_new_session = True )) as proc :
2014+ try :
2015+ proc . wait ( args . timeout )
2016+ except subprocess . TimeoutExpired :
2017+ # Whether the test timed out will be decided later
2018+ print ( f"!!! Timeout: { args . timeout } " )
2019+ pass
19762020
19772021 total_time = (datetime .now () - start_time ).total_seconds ()
19782022
@@ -2062,6 +2106,7 @@ class TestCase:
20622106 )
20632107
20642108 result = self .process_result_impl (proc , total_time )
2109+
20652110 result .check_if_need_retry (
20662111 args , result .description , result .description , self .runs_count
20672112 )
@@ -2671,6 +2716,7 @@ def run_tests_array(
26712716 args , test_suite , client_options , server_logs_level
26722717 )
26732718 test_result = test_case .process_result (test_result , MESSAGES )
2719+
26742720 except TimeoutError :
26752721 break
26762722 finally :
0 commit comments