Skip to content

Commit 9291277

Browse files
authored
AAP-43763 Fix incomplete error handling in advisory_lock wrapper (#713)
## Description In AWX we relied on this behavior from `__clean_on_fork__` And we ran this before starting every task. This actually masked a lot of error handling bugs internal to the `advisory_lock` we were using. I recently moved that method from AWX to DAB, and this puts it in a situation not really safe for others to use. This adds tests which are honest to the bug - they fail without the corresponding fix, and fixes those bugs. The condition we have breaks _all_ queries after the error. It is very toxic. Ping @bzwei for review and @TheRealHaoLiu ## Type of Change - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] Documentation update - [ ] Test update - [ ] Refactoring (no functional changes) - [ ] Development environment change - [ ] Configuration change ## Self-Review Checklist - [x] I have performed a self-review of my code - [x] I have added relevant comments to complex code sections - [ ] I have updated documentation where needed - [x] I have considered the security impact of these changes - [x] I have considered performance implications - [x] I have thought about error handling and edge cases - [ ] I have tested the changes in my local environment ## Testing Instructions The tests are in the tests. ### Steps to Test What the test does 1. obtain advisory lock 2. time out 3. make a query after that ### Expected Results subsequent queries work ## Additional Context in Jira ansible/eda-server#1262
1 parent 95e8ccf commit 9291277

File tree

2 files changed

+73
-12
lines changed

2 files changed

+73
-12
lines changed

ansible_base/lib/utils/db.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import logging
12
from contextlib import contextmanager
23
from zlib import crc32
34

4-
from django.db import DEFAULT_DB_ALIAS, connection, connections, transaction
5+
from django.db import DEFAULT_DB_ALIAS, OperationalError, connection, connections, transaction
56
from django.db.migrations.executor import MigrationExecutor
67

8+
logger = logging.getLogger(__name__)
9+
710

811
@contextmanager
912
def ensure_transaction():
@@ -129,6 +132,7 @@ def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs):
129132
@param: lock_session_timeout_milliseconds Postgres-level timeout
130133
@param: using django database identifier
131134
"""
135+
internal_error = False
132136
if connection.vendor == "postgresql":
133137
cur = None
134138
idle_in_transaction_session_timeout = None
@@ -139,12 +143,23 @@ def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs):
139143
idle_session_timeout = cur.execute("SHOW idle_session_timeout").fetchone()[0]
140144
cur.execute("SET idle_in_transaction_session_timeout = %s", (lock_session_timeout_milliseconds,))
141145
cur.execute("SET idle_session_timeout = %s", (lock_session_timeout_milliseconds,))
142-
with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock:
143-
yield internal_lock
144-
if lock_session_timeout_milliseconds > 0:
146+
147+
try:
148+
with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock:
149+
yield internal_lock
150+
except OperationalError:
151+
# Suspected case is that timeout happened due to the given timeout
152+
# this is _expected_ to leave the connection in an unusable state, so dropping it is better
153+
logger.info('Dropping connection due to suspected timeout inside advisory_lock')
154+
connection.close_if_unusable_or_obsolete()
155+
internal_error = True
156+
raise
157+
finally:
158+
if (not internal_error) and lock_session_timeout_milliseconds > 0:
145159
with connection.cursor() as cur:
146160
cur.execute("SET idle_in_transaction_session_timeout = %s", (idle_in_transaction_session_timeout,))
147161
cur.execute("SET idle_session_timeout = %s", (idle_session_timeout,))
162+
148163
elif connection.vendor == "sqlite":
149164
yield True
150165
else:

test_app/tests/lib/utils/test_db.py

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,16 @@ def test_migrations_are_complete():
1414
assert migrations_are_complete()
1515

1616

17-
class TestAdvisoryLock:
17+
class SkipIfSqlite:
1818
@pytest.fixture(autouse=True)
1919
def skip_if_sqlite(self):
2020
if connection.vendor == 'sqlite':
2121
pytest.skip('Advisory lock is not written for sqlite')
2222

23+
24+
class TestAdvisoryLock(SkipIfSqlite):
25+
THREAD_WAIT_TIME = 0.1
26+
2327
@pytest.mark.django_db
2428
def test_get_unclaimed_lock(self):
2529
with advisory_lock('test_get_unclaimed_lock'):
@@ -30,7 +34,7 @@ def background_task(django_db_blocker):
3034
# HACK: as a thread the pytest.mark.django_db will not work
3135
django_db_blocker.unblock()
3236
with advisory_lock('background_task_lock'):
33-
time.sleep(0.1)
37+
time.sleep(TestAdvisoryLock.THREAD_WAIT_TIME)
3438

3539
@pytest.mark.django_db
3640
def test_determine_lock_is_held(self, django_db_blocker):
@@ -40,7 +44,7 @@ def test_determine_lock_is_held(self, django_db_blocker):
4044
with advisory_lock('background_task_lock', wait=False) as held:
4145
if held is False:
4246
break
43-
time.sleep(0.01)
47+
time.sleep(TestAdvisoryLock.THREAD_WAIT_TIME / 5.0)
4448
else:
4549
raise RuntimeError('Other thread never obtained lock')
4650
thread.join()
@@ -56,10 +60,52 @@ def test_invalid_tuple_name(self):
5660
with advisory_lock(['test_invalid_tuple_name', 'foo']):
5761
pass
5862

59-
@pytest.mark.django_db
60-
def test_lock_session_timeout_milliseconds(self):
63+
64+
# Special transaction=True parameter is used, because we do not want in normal test transactions
65+
# because dropping the connection would break the transaction context, erroring at end of test
66+
@pytest.mark.django_db(transaction=True)
67+
class TestAdvisoryLockPostgresErrors(SkipIfSqlite):
68+
"""Tests related to connection management and the advisory_lock"""
69+
70+
def kick_connection(self):
71+
"""
72+
These (somewhat evil) tests are not gentle with the connection
73+
74+
At the start of one tests there is a good chance the connection is broken by the last test.
75+
"""
76+
connection.ensure_connection()
77+
# sanity checks
78+
assert connection.connection
79+
assert connection.connection.closed is False
80+
81+
def test_timeout_under_lock_by_sleep(self):
82+
self.kick_connection()
83+
6184
with pytest.raises(OperationalError) as exc:
6285
# uses miliseconds units
63-
with advisory_lock('test_lock_session_timeout_milliseconds', lock_session_timeout_milliseconds=2):
64-
time.sleep(3)
65-
assert 'the connection is lost' in str(exc)
86+
with advisory_lock('test_lock_session_timeout_milliseconds', lock_session_timeout_milliseconds=5):
87+
time.sleep(0.1)
88+
89+
# This exception comes from the __exit__, either releasing the lock or closing the cursor
90+
assert 'terminating connection due to idle-session timeout' in str(exc)
91+
92+
with connection.cursor() as cursor:
93+
cursor.execute('SELECT 1')
94+
95+
def test_idle_after_exception(self):
96+
self.kick_connection()
97+
98+
with pytest.raises(RuntimeError) as exc:
99+
with advisory_lock('test_timeout_under_lock_with_query', lock_session_timeout_milliseconds=5):
100+
with connection.cursor() as cursor:
101+
raise RuntimeError('exception from test')
102+
103+
assert 'exception from test' in str(exc)
104+
105+
time.sleep(0.1)
106+
107+
# The fact that this works shows that the timeout was reset in the context manager __exit__
108+
# Prior bug was giving exception with
109+
# consuming input failed: terminating connection due to idle-session timeout server closed the connection unexpectedly
110+
with connection.cursor() as cursor:
111+
cursor.execute('SELECT 1')

0 commit comments

Comments
 (0)