AAP-43763 Fix incomplete error handling in advisory_lock wrapper (#713)

AlanCoding · web-flow · commit 9291277ea8bb · 2025-04-24T22:11:55.000-04:00
## Description In AWX we relied on this behavior from `__clean_on_fork__`, and we ran this before starting every task. This actually masked a lot of error-handling bugs internal to the `advisory_lock` we were using. I recently moved that method from AWX to DAB, which puts it in a situation that is not really safe for others to use. This adds tests which are faithful to the bug - they fail without the corresponding fix - and fixes those bugs. The condition we have breaks _all_ queries after the error. It is very toxic. Ping @bzwei for review and @TheRealHaoLiu ## Type of Change - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] Documentation update - [ ] Test update - [ ] Refactoring (no functional changes) - [ ] Development environment change - [ ] Configuration change ## Self-Review Checklist - [x] I have performed a self-review of my code - [x] I have added relevant comments to complex code sections - [ ] I have updated documentation where needed - [x] I have considered the security impact of these changes - [x] I have considered performance implications - [x] I have thought about error handling and edge cases - [ ] I have tested the changes in my local environment ## Testing Instructions The testing is done by the automated tests added in this change. ### Steps to Test What the test does 1. obtain advisory lock 2. time out 3. make a query after that ### Expected Results subsequent queries work ## Additional Context in Jira ansible/eda-server#1262
diff --git a/ansible_base/lib/utils/db.py b/ansible_base/lib/utils/db.py
@@ -1,9 +1,12 @@
+import logging
 from contextlib import contextmanager
 from zlib import crc32
 
-from django.db import DEFAULT_DB_ALIAS, connection, connections, transaction
+from django.db import DEFAULT_DB_ALIAS, OperationalError, connection, connections, transaction
 from django.db.migrations.executor import MigrationExecutor
 
+logger = logging.getLogger(__name__)
+
 
 @contextmanager
 def ensure_transaction():
@@ -129,6 +132,7 @@ def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs):
     @param: lock_session_timeout_milliseconds Postgres-level timeout
     @param: using django database identifier
     """
+    internal_error = False
     if connection.vendor == "postgresql":
         cur = None
         idle_in_transaction_session_timeout = None
@@ -139,12 +143,23 @@ def advisory_lock(*args, lock_session_timeout_milliseconds=0, **kwargs):
                 idle_session_timeout = cur.execute("SHOW idle_session_timeout").fetchone()[0]
                 cur.execute("SET idle_in_transaction_session_timeout = %s", (lock_session_timeout_milliseconds,))
                 cur.execute("SET idle_session_timeout = %s", (lock_session_timeout_milliseconds,))
-        with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock:
-            yield internal_lock
-            if lock_session_timeout_milliseconds > 0:
+
+        try:
+            with django_pglocks_advisory_lock(*args, **kwargs) as internal_lock:
+                yield internal_lock
+        except OperationalError:
+            # The suspected cause is that the session timed out because of the timeout set above.
+            # That is _expected_ to leave the connection in an unusable state, so dropping it is better.
+            logger.info('Dropping connection due to suspected timeout inside advisory_lock')
+            connection.close_if_unusable_or_obsolete()
+            internal_error = True
+            raise
+        finally:
+            if (not internal_error) and lock_session_timeout_milliseconds > 0:
                 with connection.cursor() as cur:
                     cur.execute("SET idle_in_transaction_session_timeout = %s", (idle_in_transaction_session_timeout,))
                     cur.execute("SET idle_session_timeout = %s", (idle_session_timeout,))
+
     elif connection.vendor == "sqlite":
         yield True
     else:
diff --git a/test_app/tests/lib/utils/test_db.py b/test_app/tests/lib/utils/test_db.py
@@ -14,12 +14,16 @@ def test_migrations_are_complete():
     assert migrations_are_complete()
 
 
-class TestAdvisoryLock:
+class SkipIfSqlite:
     @pytest.fixture(autouse=True)
     def skip_if_sqlite(self):
         if connection.vendor == 'sqlite':
             pytest.skip('Advisory lock is not written for sqlite')
 
+
+class TestAdvisoryLock(SkipIfSqlite):
+    THREAD_WAIT_TIME = 0.1
+
     @pytest.mark.django_db
     def test_get_unclaimed_lock(self):
         with advisory_lock('test_get_unclaimed_lock'):
@@ -30,7 +34,7 @@ def background_task(django_db_blocker):
         # HACK: as a thread the pytest.mark.django_db will not work
         django_db_blocker.unblock()
         with advisory_lock('background_task_lock'):
-            time.sleep(0.1)
+            time.sleep(TestAdvisoryLock.THREAD_WAIT_TIME)
 
     @pytest.mark.django_db
     def test_determine_lock_is_held(self, django_db_blocker):
@@ -40,7 +44,7 @@ def test_determine_lock_is_held(self, django_db_blocker):
             with advisory_lock('background_task_lock', wait=False) as held:
                 if held is False:
                     break
-            time.sleep(0.01)
+            time.sleep(TestAdvisoryLock.THREAD_WAIT_TIME / 5.0)
         else:
             raise RuntimeError('Other thread never obtained lock')
         thread.join()
@@ -56,10 +60,52 @@ def test_invalid_tuple_name(self):
             with advisory_lock(['test_invalid_tuple_name', 'foo']):
                 pass
 
-    @pytest.mark.django_db
-    def test_lock_session_timeout_milliseconds(self):
+
+# The special transaction=True parameter is used because these tests must not run inside the normal test transaction:
+# dropping the connection would break the transaction context, erroring at the end of the test.
+@pytest.mark.django_db(transaction=True)
+class TestAdvisoryLockPostgresErrors(SkipIfSqlite):
+    """Tests related to connection management and the advisory_lock"""
+
+    def kick_connection(self):
+        """
+        These (somewhat evil) tests are not gentle with the connection
+
+        At the start of a test there is a good chance the connection was broken by the previous test.
+        """
+        connection.ensure_connection()
+        # sanity checks
+        assert connection.connection
+        assert connection.connection.closed is False
+
+    def test_timeout_under_lock_by_sleep(self):
+        self.kick_connection()
+
         with pytest.raises(OperationalError) as exc:
             # uses miliseconds units
-            with advisory_lock('test_lock_session_timeout_milliseconds', lock_session_timeout_milliseconds=2):
-                time.sleep(3)
-        assert 'the connection is lost' in str(exc)
+            with advisory_lock('test_lock_session_timeout_milliseconds', lock_session_timeout_milliseconds=5):
+                time.sleep(0.1)
+
+        # This exception comes from the __exit__, either releasing the lock or closing the cursor
+        assert 'terminating connection due to idle-session timeout' in str(exc)
+
+        with connection.cursor() as cursor:
+            cursor.execute('SELECT 1')
+
+    def test_idle_after_exception(self):
+        self.kick_connection()
+
+        with pytest.raises(RuntimeError) as exc:
+            with advisory_lock('test_timeout_under_lock_with_query', lock_session_timeout_milliseconds=5):
+                with connection.cursor() as cursor:
+                    raise RuntimeError('exception from test')
+
+        assert 'exception from test' in str(exc)
+
+        time.sleep(0.1)
+
+        # The fact that this works shows that the timeout was reset in the context manager __exit__
+        # Prior bug was giving exception with
+        # consuming input failed: terminating connection due to idle-session timeout server closed the connection unexpectedly
+        with connection.cursor() as cursor:
+            cursor.execute('SELECT 1')