-
Notifications
You must be signed in to change notification settings - Fork 17.2k
fix(assets): reduce task success asset registration lock contention #66854
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cb4fa27
d035a7d
9b38921
3f2fc34
b48137c
43acdea
4673ff3
ab90c26
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,7 @@ | |
| from sqlalchemy.sql import select | ||
| from structlog.contextvars import bind_contextvars | ||
|
|
||
| from airflow._shared.observability.metrics import stats | ||
| from airflow._shared.observability.traces import override_ids | ||
| from airflow._shared.state import TaskScope | ||
| from airflow._shared.timezones import timezone | ||
|
|
@@ -467,6 +468,14 @@ def ti_update_state( | |
| extra=json.dumps({"host_name": hostname}) if hostname else None, | ||
| ) | ||
| ) | ||
| # Commit the TI state update now to release the task_instance row lock before | ||
| # running asset-event queries. The direct-INSERT fix in AssetManager removes | ||
| # the O(n) lazy-load on the alias-event table, but register_asset_changes_in_db | ||
| # also queries scheduled dags and inserts AssetDagRunQueue rows - all of which | ||
| # would otherwise hold the row lock and cause idle-in-transaction pile-up that | ||
| # exhausts API server memory and triggers OOMKill under high concurrency. | ||
| # The task outcome is durable from this point on. | ||
| session.commit() | ||
| except SQLAlchemyError as e: | ||
| log.error("Error updating Task Instance state", error=str(e)) | ||
| raise HTTPException( | ||
|
|
@@ -490,14 +499,40 @@ def ti_update_state( | |
| task_id=task_id, | ||
| map_index=map_index, | ||
| ) | ||
| session.commit() | ||
| except Exception: | ||
| session.rollback() | ||
| log.warning( | ||
| "Failed to clear task state on success", | ||
| dag_id=dag_id, | ||
| run_id=run_id, | ||
| task_id=task_id, | ||
| ) | ||
|
|
||
| # Asset registration runs outside the TI row lock. Failures are logged and counted; | ||
| # raising HTTP 500 here would be misleading because the task already succeeded and | ||
| # would make the worker retry a state update that has already completed. Durable | ||
| # retry/reconciliation for dropped asset events is out of scope for this hot-path fix. | ||
| if isinstance(ti_patch_payload, TISuccessStatePayload) and ti_patch_payload.task_outlets: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This gates on the payload type, but On The |
||
| try: | ||
| ti_for_assets = session.get(TI, task_instance_id) | ||
| if ti_for_assets is not None: | ||
| TI.register_asset_changes_in_db( | ||
| ti_for_assets, | ||
| ti_patch_payload.task_outlets, | ||
| ti_patch_payload.outlet_events, | ||
| session=session, | ||
| ) | ||
| session.commit() | ||
| except Exception: | ||
| session.rollback() | ||
| stats.incr("asset.registration_failures") | ||
| log.exception( | ||
| "Failed to register asset changes; task state is already committed", | ||
| task_instance_id=str(task_instance_id), | ||
| new_state=updated_state, | ||
| ) | ||
|
|
||
|
|
||
| def _emit_task_span(ti, state): | ||
| # just to be safe | ||
|
|
@@ -586,13 +621,7 @@ def _create_ti_state_update_query_and_update_state( | |
| retry_reason=(ti_patch_payload.retry_reason[:500] if ti_patch_payload.retry_reason else None), | ||
| ) | ||
| elif isinstance(ti_patch_payload, TISuccessStatePayload): | ||
| if ti is not None: | ||
| TI.register_asset_changes_in_db( | ||
| ti, | ||
| ti_patch_payload.task_outlets, | ||
| ti_patch_payload.outlet_events, | ||
| session=session, | ||
| ) | ||
| pass # Asset registration happens after the TI state is committed; see ti_update_state. | ||
| try: | ||
| _emit_task_span(ti, state=updated_state) | ||
| except Exception: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.