Skip to content

Commit 6c56b3c

Browse files
committed
Desperate attempt to debug UNKNOWN_DATABASE problem
1 parent 65ddc50 commit 6c56b3c

File tree

1 file changed

+78
-32
lines changed

1 file changed

+78
-32
lines changed

tests/clickhouse-test

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ import tempfile
3636
import traceback
3737
from functools import cache
3838
import urllib.parse
39+
from contextlib import contextmanager
40+
import sys
3941

4042
# for crc32
4143
import zlib
@@ -65,6 +67,7 @@ MESSAGES_TO_RETRY = [
6567
"ConnectionPoolWithFailover: Connection failed at try",
6668
"DB::Exception: New table appeared in database being dropped or detached. Try again",
6769
"is already started to be removing by another replica right now",
70+
"(UNKNOWN_DATABASE)",
6871
# This is from LSan, and it indicates its own internal problem:
6972
"Unable to get registers from thread",
7073
]
@@ -435,14 +438,25 @@ def clickhouse_execute_http(
435438
max_http_retries=5,
436439
retry_error_codes=False,
437440
):
438-
if base_args.secure:
439-
client = http.client.HTTPSConnection(
440-
host=base_args.tcp_host, port=base_args.http_port, timeout=timeout
441-
)
442-
else:
443-
client = http.client.HTTPConnection(
444-
host=base_args.tcp_host, port=base_args.http_port, timeout=timeout
445-
)
441+
442+
# make sure client connections are always properly terminated
443+
@contextmanager
444+
def make_http_client():
445+
if base_args.secure:
446+
client = http.client.HTTPSConnection(
447+
host=base_args.tcp_host, port=base_args.http_port, timeout=timeout
448+
)
449+
else:
450+
client = http.client.HTTPConnection(
451+
host=base_args.tcp_host, port=base_args.http_port, timeout=timeout
452+
)
453+
try:
454+
client.connect()
455+
yield client
456+
finally:
457+
client.close()
458+
del client
459+
446460

447461
timeout = int(timeout)
448462
params = {
@@ -464,22 +478,23 @@ def clickhouse_execute_http(
464478
if default_format is not None:
465479
params["default_format"] = default_format
466480

467-
for i in range(max_http_retries):
468-
try:
469-
client.request(
470-
"POST",
471-
f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}",
472-
body=body,
473-
)
474-
res = client.getresponse()
475-
data = res.read()
476-
if res.status == 200 or (not retry_error_codes):
477-
break
478-
except Exception as ex:
479-
if i == max_http_retries - 1:
480-
raise ex
481-
client.close()
482-
sleep(i + 1)
481+
with make_http_client() as client:
482+
for i in range(max_http_retries):
483+
try:
484+
client.request(
485+
"POST",
486+
f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}",
487+
body=body,
488+
)
489+
res = client.getresponse()
490+
data = res.read()
491+
if res.status == 200 or (not retry_error_codes):
492+
break
493+
except Exception as ex:
494+
if i == max_http_retries - 1:
495+
raise ex
496+
client.close()
497+
sleep(i + 1)
483498

484499
if res.status != 200:
485500
raise HTTPError(data.decode(), res.status)
@@ -665,6 +680,18 @@ def need_retry(args, stdout, stderr, total_time):
665680
msg in stderr for msg in MESSAGES_TO_RETRY
666681
)
667682

683+
@contextmanager
684+
def autokill_subprocess(proc : Optional[Popen]):
685+
try:
686+
yield proc
687+
finally:
688+
if proc is not None:
689+
if proc.poll() is None:
690+
# seems to be still running
691+
print(f"child process {proc} is still running and will be force killed")
692+
proc.kill()
693+
# to properly cleanup stdin, stdout, etc...
694+
proc.__exit__(*sys.exc_info())
668695

669696
def get_processlist_size(args):
670697
if args.replicated_database:
@@ -1594,6 +1621,7 @@ class TestCase:
15941621
if proc.returncode is None:
15951622
f = io.StringIO()
15961623
with redirect_stdout(f):
1624+
cleanup_child_processes(proc.pid)
15971625
kill_process_group(os.getpgid(proc.pid))
15981626
kill_output = f.getvalue()
15991627

@@ -1858,7 +1886,23 @@ class TestCase:
18581886
description_full += "Database: " + self.testcase_args.testcase_database
18591887

18601888
if result.status == TestStatus.FAIL:
1861-
description_full += "\nSystem disk usage:\n" + subprocess.check_output("df -h", shell=True).decode('utf-8')
1889+
for description, command in [
1890+
("OS info", "lsb_release -idrc"),
1891+
("disk usage", "df -h"),
1892+
("RAM usage", "free -h"),
1893+
# ("child processes", ("ps -f -g ", {'env' : {"LIBPROC_HIDE_KERNEL" : "1"}})),
1894+
("ALL processes", ("ps xj", {'env' : {"LIBPROC_HIDE_KERNEL" : "1"}})),
1895+
]:
1896+
try:
1897+
if isinstance(command, str):
1898+
output = subprocess.check_output(command, shell=True)
1899+
else:
1900+
command, kwargs = command
1901+
output = subprocess.check_output(command, **kwargs, shell=True)
1902+
except Exception as e:
1903+
print(f"Failed to execute post-failure command '{command}', got exception: {e}")
1904+
else:
1905+
description_full += f"\n{description} :\n{output.decode('utf-8')}"
18621906

18631907
result.description = description_full
18641908
return result
@@ -1966,13 +2010,13 @@ class TestCase:
19662010
)
19672011

19682012
# pylint:disable-next=consider-using-with; TODO: fix
1969-
proc = Popen(command, shell=True, env=tests_env, start_new_session=True)
1970-
1971-
try:
1972-
proc.wait(args.timeout)
1973-
except subprocess.TimeoutExpired:
1974-
# Whether the test timed out will be decided later
1975-
pass
2013+
with autokill_subprocess(Popen(command, shell=True, env=tests_env, start_new_session=True)) as proc:
2014+
try:
2015+
proc.wait(args.timeout)
2016+
except subprocess.TimeoutExpired:
2017+
# Whether the test timed out will be decided later
2018+
print(f"!!! Timeout: {args.timeout}")
2019+
pass
19762020

19772021
total_time = (datetime.now() - start_time).total_seconds()
19782022

@@ -2062,6 +2106,7 @@ class TestCase:
20622106
)
20632107

20642108
result = self.process_result_impl(proc, total_time)
2109+
20652110
result.check_if_need_retry(
20662111
args, result.description, result.description, self.runs_count
20672112
)
@@ -2671,6 +2716,7 @@ def run_tests_array(
26712716
args, test_suite, client_options, server_logs_level
26722717
)
26732718
test_result = test_case.process_result(test_result, MESSAGES)
2719+
26742720
except TimeoutError:
26752721
break
26762722
finally:

0 commit comments

Comments
 (0)