@@ -79,6 +79,7 @@ class _TableConfig:
7979 table_name : str
8080 recency_column_name : str
8181 extra_columns : list [str ] | None = None
82+ dag_id_column_name : str | None = None
8283 keep_last : bool = False
8384 keep_last_filters : Any | None = None
8485 keep_last_group_by : Any | None = None
@@ -89,9 +90,19 @@ class _TableConfig:
8990
9091 def __post_init__ (self ):
9192 self .recency_column = column (self .recency_column_name )
92- self .orm_model : Base = table (
93- self .table_name , * [column (x ) for x in self .extra_columns or []], self .recency_column
94- )
93+ if self .dag_id_column_name is None :
94+ self .dag_id_column = None
95+ self .orm_model : Base = table (
96+ self .table_name , * [column (x ) for x in self .extra_columns or []], self .recency_column
97+ )
98+ else :
99+ self .dag_id_column = column (self .dag_id_column_name )
100+ self .orm_model : Base = table (
101+ self .table_name ,
102+ * [column (x ) for x in self .extra_columns or []],
103+ self .dag_id_column ,
104+ self .recency_column ,
105+ )
95106
96107 def __lt__ (self , other ):
97108 return self .table_name < other .table_name
@@ -101,41 +112,47 @@ def readable_config(self):
101112 return {
102113 "table" : self .orm_model .name ,
103114 "recency_column" : str (self .recency_column ),
115+ "dag_id_column" : str (self .dag_id_column ),
104116 "keep_last" : self .keep_last ,
105117 "keep_last_filters" : [str (x ) for x in self .keep_last_filters ] if self .keep_last_filters else None ,
106118 "keep_last_group_by" : str (self .keep_last_group_by ),
107119 }
108120
109121
110122config_list : list [_TableConfig ] = [
111- _TableConfig (table_name = "job" , recency_column_name = "latest_heartbeat" ),
123+ _TableConfig (table_name = "job" , recency_column_name = "latest_heartbeat" , dag_id_column_name = "dag_id" ),
112124 _TableConfig (
113125 table_name = "dag" ,
114126 recency_column_name = "last_parsed_time" ,
115127 dependent_tables = ["dag_version" , "deadline" ],
128+ dag_id_column_name = "dag_id" ,
116129 ),
117130 _TableConfig (
118131 table_name = "dag_run" ,
119132 recency_column_name = "start_date" ,
133+ dag_id_column_name = "dag_id" ,
120134 extra_columns = ["dag_id" , "run_type" ],
121135 keep_last = True ,
122136 keep_last_filters = [column ("run_type" ) != DagRunType .MANUAL ],
123137 keep_last_group_by = ["dag_id" ],
124138 dependent_tables = ["task_instance" , "deadline" ],
125139 ),
126- _TableConfig (table_name = "asset_event" , recency_column_name = "timestamp" ),
140+ _TableConfig (table_name = "asset_event" , recency_column_name = "timestamp" , dag_id_column_name = "dag_id" ),
127141 _TableConfig (table_name = "import_error" , recency_column_name = "timestamp" ),
128- _TableConfig (table_name = "log" , recency_column_name = "dttm" ),
129- _TableConfig (table_name = "sla_miss" , recency_column_name = "timestamp" ),
142+ _TableConfig (table_name = "log" , recency_column_name = "dttm" , dag_id_column_name = "dag_id" ),
143+ _TableConfig (table_name = "sla_miss" , recency_column_name = "timestamp" , dag_id_column_name = "dag_id" ),
130144 _TableConfig (
131145 table_name = "task_instance" ,
132146 recency_column_name = "start_date" ,
133147 dependent_tables = ["task_instance_history" , "xcom" ],
148+ dag_id_column_name = "dag_id" ,
149+ ),
150+ _TableConfig (
151+ table_name = "task_instance_history" , recency_column_name = "start_date" , dag_id_column_name = "dag_id"
134152 ),
135- _TableConfig (table_name = "task_instance_history" , recency_column_name = "start_date" ),
136- _TableConfig (table_name = "task_reschedule" , recency_column_name = "start_date" ),
137- _TableConfig (table_name = "xcom" , recency_column_name = "timestamp" ),
138- _TableConfig (table_name = "_xcom_archive" , recency_column_name = "timestamp" ),
153+ _TableConfig (table_name = "task_reschedule" , recency_column_name = "start_date" , dag_id_column_name = "dag_id" ),
154+ _TableConfig (table_name = "xcom" , recency_column_name = "timestamp" , dag_id_column_name = "dag_id" ),
155+ _TableConfig (table_name = "_xcom_archive" , recency_column_name = "timestamp" , dag_id_column_name = "dag_id" ),
139156 _TableConfig (table_name = "callback_request" , recency_column_name = "created_at" ),
140157 _TableConfig (table_name = "celery_taskmeta" , recency_column_name = "date_done" ),
141158 _TableConfig (table_name = "celery_tasksetmeta" , recency_column_name = "date_done" ),
@@ -148,8 +165,11 @@ def readable_config(self):
148165 table_name = "dag_version" ,
149166 recency_column_name = "created_at" ,
150167 dependent_tables = ["task_instance" , "dag_run" ],
168+ dag_id_column_name = "dag_id" ,
169+ keep_last = True ,
170+ keep_last_group_by = ["dag_id" ],
151171 ),
152- _TableConfig (table_name = "deadline" , recency_column_name = "deadline_time" ),
172+ _TableConfig (table_name = "deadline" , recency_column_name = "deadline_time" , dag_id_column_name = "dag_id" ),
153173]
154174
155175# We need to have `fallback="database"` because this is executed at top level code and provider configuration
@@ -308,13 +328,25 @@ def _build_query(
308328 keep_last_group_by ,
309329 clean_before_timestamp : DateTime ,
310330 session : Session ,
331+ dag_id_column = None ,
332+ dag_ids : list [str ] | None = None ,
333+ exclude_dag_ids : list [str ] | None = None ,
311334 ** kwargs ,
312335) -> Query :
313336 base_table_alias = "base"
314337 base_table = aliased (orm_model , name = base_table_alias )
315338 query = session .query (base_table ).with_entities (text (f"{ base_table_alias } .*" ))
316339 base_table_recency_col = base_table .c [recency_column .name ]
317340 conditions = [base_table_recency_col < clean_before_timestamp ]
341+
342+ if (dag_ids or exclude_dag_ids ) and dag_id_column is not None :
343+ base_table_dag_id_col = base_table .c [dag_id_column .name ]
344+
345+ if dag_ids :
346+ conditions .append (base_table_dag_id_col .in_ (dag_ids ))
347+ if exclude_dag_ids :
348+ conditions .append (base_table_dag_id_col .not_in (exclude_dag_ids ))
349+
318350 if keep_last :
319351 max_date_col_name = "max_date_per_group"
320352 group_by_columns : list [Any ] = [column (x ) for x in keep_last_group_by ]
@@ -345,6 +377,9 @@ def _cleanup_table(
345377 keep_last_filters ,
346378 keep_last_group_by ,
347379 clean_before_timestamp : DateTime ,
380+ dag_id_column = None ,
381+ dag_ids = None ,
382+ exclude_dag_ids = None ,
348383 dry_run : bool = True ,
349384 verbose : bool = False ,
350385 skip_archive : bool = False ,
@@ -358,6 +393,9 @@ def _cleanup_table(
358393 query = _build_query (
359394 orm_model = orm_model ,
360395 recency_column = recency_column ,
396+ dag_id_column = dag_id_column ,
397+ dag_ids = dag_ids ,
398+ exclude_dag_ids = exclude_dag_ids ,
361399 keep_last = keep_last ,
362400 keep_last_filters = keep_last_filters ,
363401 keep_last_group_by = keep_last_group_by ,
@@ -380,10 +418,18 @@ def _cleanup_table(
380418 session .commit ()
381419
382420
383- def _confirm_delete (* , date : DateTime , tables : list [str ]) -> None :
421+ def _confirm_delete (
422+ * ,
423+ date : DateTime ,
424+ tables : list [str ],
425+ dag_ids : list [str ] | None = None ,
426+ exclude_dag_ids : list [str ] | None = None ,
427+ ) -> None :
384428 for_tables = f" for tables { tables !r} " if tables else ""
429+ for_dags = f" for the following dags: { dag_ids !r} " if dag_ids else ""
430+ excluding_dags = f" excluding the following dags: { exclude_dag_ids !r} " if exclude_dag_ids else ""
385431 question = (
386- f"You have requested that we purge all data prior to { date } { for_tables } .\n "
432+ f"You have requested that we purge all data prior to { date } { for_tables } { for_dags } { excluding_dags } .\n "
387433 f"This is irreversible. Consider backing up the tables first and / or doing a dry run "
388434 f"with option --dry-run.\n "
389435 f"Enter 'delete rows' (without quotes) to proceed."
@@ -493,6 +539,8 @@ def run_cleanup(
493539 * ,
494540 clean_before_timestamp : DateTime ,
495541 table_names : list [str ] | None = None ,
542+ dag_ids : list [str ] | None = None ,
543+ exclude_dag_ids : list [str ] | None = None ,
496544 dry_run : bool = False ,
497545 verbose : bool = False ,
498546 confirm : bool = True ,
@@ -513,6 +561,9 @@ def run_cleanup(
513561 :param clean_before_timestamp: The timestamp before which data should be purged
514562 :param table_names: Optional. List of table names to perform maintenance on. If list not provided,
515563 will perform maintenance on all tables.
564+ :param dag_ids: Optional. List of dag ids to perform maintenance on. If list not provided,
565+ will perform maintenance on all dags.
566+ :param exclude_dag_ids: Optional. List of dag ids to exclude from maintenance.
516567 :param dry_run: If true, print rows meeting deletion criteria
517568 :param verbose: If true, may provide more detailed output.
518569 :param confirm: Require user input to confirm before processing deletions.
@@ -532,14 +583,21 @@ def run_cleanup(
532583 )
533584 _print_config (configs = effective_config_dict )
534585 if not dry_run and confirm :
535- _confirm_delete (date = clean_before_timestamp , tables = sorted (effective_table_names ))
586+ _confirm_delete (
587+ date = clean_before_timestamp ,
588+ tables = sorted (effective_table_names ),
589+ dag_ids = dag_ids ,
590+ exclude_dag_ids = exclude_dag_ids ,
591+ )
536592 existing_tables = reflect_tables (tables = None , session = session ).tables
537593
538594 for table_name , table_config in effective_config_dict .items ():
539595 if table_name in existing_tables :
540596 with _suppress_with_logging (table_name , session ):
541597 _cleanup_table (
542598 clean_before_timestamp = clean_before_timestamp ,
599+ dag_ids = dag_ids ,
600+ exclude_dag_ids = exclude_dag_ids ,
543601 dry_run = dry_run ,
544602 verbose = verbose ,
545603 ** table_config .__dict__ ,
0 commit comments