Merged
Changes from 119 commits
Commits
129 commits
73f0d69
Barebone
haiqi96 Jun 17, 2025
bc4c464
backup of progress
haiqi96 Jun 18, 2025
e6238a2
Backup for initial handling for streams
haiqi96 Jun 18, 2025
653f3dc
Backup for initial handling for streams
haiqi96 Jun 18, 2025
a269a97
fix
haiqi96 Jun 18, 2025
1742ff0
small update to use fancier syntax
haiqi96 Jun 18, 2025
efbb47b
fixes
haiqi96 Jun 18, 2025
5714945
linter yeah
haiqi96 Jun 18, 2025
b1fe0d4
Adding simple handler for reusing
haiqi96 Jun 19, 2025
a5f5a3b
commit to propogate change
haiqi96 Jun 19, 2025
53d7417
Add handler
haiqi96 Jun 19, 2025
ccd23dc
Fix mistakes in the handler logic
haiqi96 Jun 19, 2025
5bb3685
Update scheduler to handle logs of time.
haiqi96 Jun 19, 2025
bc688b9
Merge branch 'main' into retension_period
haiqi96 Jun 19, 2025
0c8c6f6
Refactor dataset related code
haiqi96 Jun 20, 2025
0d186e6
Refactor dataset related code
haiqi96 Jun 20, 2025
75ac0ff
further refactor
haiqi96 Jun 20, 2025
bb1e5f4
Linter
haiqi96 Jun 20, 2025
ba7cfe1
A few more fixes
haiqi96 Jun 20, 2025
68454c6
Linter fixes
haiqi96 Jun 20, 2025
5eccaaf
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jun 20, 2025
c1de746
missing fixes
haiqi96 Jun 20, 2025
f08802b
Merge remote-tracking branch 'origin/DatasetRefactor' into retension_…
haiqi96 Jun 20, 2025
d797198
Fix mistake
haiqi96 Jun 20, 2025
c5dc9b9
Merge remote-tracking branch 'origin/DatasetRefactor' into retension_…
haiqi96 Jun 20, 2025
8c39e77
actually fixing
haiqi96 Jun 20, 2025
ea4318e
Merge remote-tracking branch 'origin/DatasetRefactor' into retension_…
haiqi96 Jun 20, 2025
2c97441
Intermediate backup for archive retention
haiqi96 Jun 20, 2025
2eff448
Update
haiqi96 Jun 20, 2025
d570ab6
Linter again
haiqi96 Jun 20, 2025
06332f4
Merge remote-tracking branch 'origin/DatasetRefactor' into retension_…
haiqi96 Jun 20, 2025
d5e8e28
some renaming
haiqi96 Jun 20, 2025
8a79b9b
adding reminder for myself
haiqi96 Jun 23, 2025
8c77119
Fixing permissions
haiqi96 Jun 23, 2025
3a1afb2
Add batch deletion support
haiqi96 Jun 24, 2025
73d76ac
Linter + code clean up
haiqi96 Jun 24, 2025
5745e65
More refactor
haiqi96 Jun 24, 2025
f3ba8b0
renaming
haiqi96 Jun 24, 2025
e566e74
Prepare for rearrangement
haiqi96 Jun 24, 2025
db9a508
Optimize logger
haiqi96 Jun 24, 2025
2f0f95a
Further refactor
haiqi96 Jun 25, 2025
e310102
Use asyncio
haiqi96 Jun 25, 2025
a332799
Refactoring
haiqi96 Jun 25, 2025
cb53857
Refactoring
haiqi96 Jun 25, 2025
85b7823
Update clp-config
haiqi96 Jun 25, 2025
398ab5e
Merge branch 'main' into DatasetRefactor
haiqi96 Jun 25, 2025
3209ddd
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jun 25, 2025
3c5b0e4
New line at eof
haiqi96 Jun 25, 2025
fb41607
Refactor retention cleaner name
haiqi96 Jun 25, 2025
386453b
Clean up
haiqi96 Jun 25, 2025
945c97b
linter
haiqi96 Jun 25, 2025
1845462
Adding more docstrings
haiqi96 Jun 25, 2025
bed13df
Temporarily remove stream retention
haiqi96 Jun 25, 2025
0d8d679
Linter
haiqi96 Jun 25, 2025
d40e773
Revert change for stream
haiqi96 Jun 25, 2025
7759a7a
Merge remote-tracking branch 'origin/main' into DatasetRefactor
haiqi96 Jun 27, 2025
271b8b3
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jun 27, 2025
e6b8cc7
Linter
haiqi96 Jun 27, 2025
c0b8563
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jun 27, 2025
7a468c3
Merge branch 'main' into DatasetRefactor
Bill-hbrhbr Jun 29, 2025
1dd1cea
Move default dataset metadata table creation to start_clp
Bill-hbrhbr Jun 29, 2025
a0c3c29
Remove unused import
Bill-hbrhbr Jun 29, 2025
a9bf615
Address review comments
Bill-hbrhbr Jun 30, 2025
fe05f5f
Replace the missing SUFFIX
Bill-hbrhbr Jun 30, 2025
39a9278
Move suffix constants from clp_config to clp_metadata_db_utils local …
Bill-hbrhbr Jun 30, 2025
7124828
Refactor archive_manager.py.
kirkrodrigues Jun 30, 2025
eb80992
Refactor s3_utils.py.
kirkrodrigues Jun 30, 2025
5ed44e7
compression_task.py: Fix typing errors and minor refactoring.
kirkrodrigues Jun 30, 2025
af6b508
compression_scheduler.py: Remove exception swallow which will hide un…
kirkrodrigues Jun 30, 2025
67fb01f
Refactor query_scheduler.py.
kirkrodrigues Jun 30, 2025
d6ad4de
clp_metadata_db_utils.py: Minor refactoring.
kirkrodrigues Jun 30, 2025
ff7d700
clp_metadata_db_utils.py: Rename _generic_get_table_name -> _get_tabl…
kirkrodrigues Jun 30, 2025
7ffc77c
clp_metadata_db_utils.py: Alphabetize new public functions.
kirkrodrigues Jun 30, 2025
0255cbd
clp_metadata_db_utils.py: Reorder public and private functions for co…
kirkrodrigues Jun 30, 2025
1076a3f
initialize-clp-metadata-db.py: Remove changes unrelated to PR.
kirkrodrigues Jun 30, 2025
71c4d82
Move default dataset creation into compression_scheduler so that it r…
kirkrodrigues Jun 30, 2025
6bd9372
Apply suggestions from code review
kirkrodrigues Jul 1, 2025
84df2e2
Merge branch 'main' into DatasetRefactor
kirkrodrigues Jul 1, 2025
983bea1
Remove bug fix that's no longer necessary.
kirkrodrigues Jul 1, 2025
bdb7817
Fix bug where dataset has a default value instead of None when using …
Bill-hbrhbr Jul 1, 2025
a82a267
Correctly feed in the input config dataset names
Bill-hbrhbr Jul 1, 2025
f699496
Remove unnecessary changes
Bill-hbrhbr Jul 1, 2025
94e8ca1
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jul 2, 2025
90ce0a4
Update the webui to pass the dataset name in the clp-json code path (…
kirkrodrigues Jul 2, 2025
d6f9e5a
Move dataset into the user function
haiqi96 Jul 2, 2025
dc6a706
Merge branch 'DatasetRefactor' of https://github.com/haiqi96/clp_fork…
haiqi96 Jul 2, 2025
76bcb4a
Remove unnecessary f string specifier
haiqi96 Jul 2, 2025
a4e6f83
Apply suggestions from code review
haiqi96 Jul 2, 2025
3c53cb0
Merge branch 'DatasetRefactor' into retension_period
haiqi96 Jul 2, 2025
66eba87
Polishing
haiqi96 Jul 2, 2025
7b42568
Add import type.
kirkrodrigues Jul 2, 2025
097e47c
Polishing more
haiqi96 Jul 3, 2025
8dc8e26
try adding query job handling
haiqi96 Jul 3, 2025
afe43ce
Merge branch 'main' into DatasetRefactor
haiqi96 Jul 3, 2025
85a3164
Merge remote-tracking branch 'origin/DatasetRefactor' into retension_…
haiqi96 Jul 3, 2025
af75118
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Jul 3, 2025
e5e90f7
Fix wrong order
haiqi96 Jul 3, 2025
bac6767
Linter
haiqi96 Jul 3, 2025
de1c334
submit not-fully-tested-code
haiqi96 Jul 3, 2025
9fdb3d5
Apply suggestions from code review
haiqi96 Jul 4, 2025
2245244
Update components/job-orchestration/job_orchestration/retention/archi…
haiqi96 Jul 4, 2025
b1e5a2c
Apply suggestions from code review
haiqi96 Jul 4, 2025
4e93a30
Fix
haiqi96 Jul 4, 2025
6719872
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Jul 4, 2025
f9fa626
nit fixes
haiqi96 Jul 4, 2025
450e16a
Update the logic to consider all running query jobs
haiqi96 Jul 17, 2025
ade2e27
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Jul 17, 2025
f1584ff
linter
haiqi96 Jul 30, 2025
2c57dd6
Apply suggestions from code review
haiqi96 Aug 1, 2025
5f479c5
address code review concern
haiqi96 Aug 1, 2025
8c5fb89
Batch renaming
haiqi96 Aug 1, 2025
11e695f
Linter
haiqi96 Aug 1, 2025
1291c3f
Further refactor
haiqi96 Aug 1, 2025
f8c7369
Linter
haiqi96 Aug 1, 2025
9b48c9b
Apply suggestions from code review
haiqi96 Aug 4, 2025
6cff24d
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Aug 4, 2025
c367c15
address review concern
haiqi96 Aug 4, 2025
e282020
Update logging
haiqi96 Aug 4, 2025
b93bb4b
Update components/job-orchestration/job_orchestration/garbage_collect…
haiqi96 Aug 4, 2025
390333f
Address review comments
haiqi96 Aug 5, 2025
a4546cf
Fix timezone
haiqi96 Aug 5, 2025
2c4821a
Apply suggestions from code review
haiqi96 Aug 7, 2025
9d5d087
Address code review comments and slight improved logging.
haiqi96 Aug 7, 2025
74af600
Linter
haiqi96 Aug 7, 2025
34a06a5
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Aug 12, 2025
7633a6c
Update components/job-orchestration/job_orchestration/garbage_collect…
haiqi96 Aug 13, 2025
bd48ca5
Merge remote-tracking branch 'origin/main' into retension_period
haiqi96 Aug 13, 2025
8ca206e
Merge branch 'main' into retension_period
LinZhihao-723 Aug 13, 2025
ee9dd59
Merge branch 'main' into retension_period
haiqi96 Aug 13, 2025
72 changes: 36 additions & 36 deletions components/clp-package-utils/clp_package_utils/general.py
@@ -96,6 +96,13 @@ def __init__(self, clp_home: pathlib.Path, docker_clp_home: pathlib.Path):
self.aws_config_dir: typing.Optional[DockerMount] = None


def _validate_data_directory(data_dir: pathlib.Path, component_name: str) -> None:
try:
validate_path_could_be_dir(data_dir)
except ValueError as ex:
raise ValueError(f"{component_name} data directory is invalid: {ex}")


def get_clp_home():
# Determine CLP_HOME from an environment variable or this script's path
clp_home = None
@@ -175,6 +182,13 @@ def is_container_exited(container_name):
return False


def validate_log_directory(logs_dir: pathlib.Path, component_name: str) -> None:
try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{component_name} logs directory is invalid: {ex}")


def validate_port(port_name: str, hostname: str, port: int):
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -431,40 +445,23 @@ def validate_and_load_redis_credentials_file(


def validate_db_config(clp_config: CLPConfig, data_dir: pathlib.Path, logs_dir: pathlib.Path):
try:
validate_path_could_be_dir(data_dir)
except ValueError as ex:
raise ValueError(f"{DB_COMPONENT_NAME} data directory is invalid: {ex}")

try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{DB_COMPONENT_NAME} logs directory is invalid: {ex}")
_validate_data_directory(data_dir, DB_COMPONENT_NAME)
validate_log_directory(logs_dir, DB_COMPONENT_NAME)

validate_port(f"{DB_COMPONENT_NAME}.port", clp_config.database.host, clp_config.database.port)


def validate_queue_config(clp_config: CLPConfig, logs_dir: pathlib.Path):
try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{QUEUE_COMPONENT_NAME} logs directory is invalid: {ex}")
validate_log_directory(logs_dir, QUEUE_COMPONENT_NAME)

validate_port(f"{QUEUE_COMPONENT_NAME}.port", clp_config.queue.host, clp_config.queue.port)


def validate_redis_config(
clp_config: CLPConfig, data_dir: pathlib.Path, logs_dir: pathlib.Path, base_config: pathlib.Path
):
try:
validate_path_could_be_dir(data_dir)
except ValueError as ex:
raise ValueError(f"{REDIS_COMPONENT_NAME} data directory is invalid {ex}")

try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{REDIS_COMPONENT_NAME} logs directory is invalid: {ex}")
_validate_data_directory(data_dir, REDIS_COMPONENT_NAME)
validate_log_directory(logs_dir, REDIS_COMPONENT_NAME)

if not base_config.exists():
raise ValueError(
@@ -475,10 +472,7 @@ def validate_redis_config(


def validate_reducer_config(clp_config: CLPConfig, logs_dir: pathlib.Path, num_workers: int):
try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{REDUCER_COMPONENT_NAME} logs directory is invalid: {ex}")
validate_log_directory(logs_dir, REDUCER_COMPONENT_NAME)

for i in range(0, num_workers):
validate_port(
@@ -491,15 +485,8 @@ def validate_reducer_config(clp_config: CLPConfig, logs_dir: pathlib.Path, num_w
def validate_results_cache_config(
clp_config: CLPConfig, data_dir: pathlib.Path, logs_dir: pathlib.Path
):
try:
validate_path_could_be_dir(data_dir)
except ValueError as ex:
raise ValueError(f"{RESULTS_CACHE_COMPONENT_NAME} data directory is invalid: {ex}")

try:
validate_path_could_be_dir(logs_dir)
except ValueError as ex:
raise ValueError(f"{RESULTS_CACHE_COMPONENT_NAME} logs directory is invalid: {ex}")
_validate_data_directory(data_dir, RESULTS_CACHE_COMPONENT_NAME)
validate_log_directory(logs_dir, RESULTS_CACHE_COMPONENT_NAME)

validate_port(
f"{RESULTS_CACHE_COMPONENT_NAME}.port",
@@ -508,8 +495,11 @@
)


def validate_worker_config(clp_config: CLPConfig):
def validate_logs_input_config(clp_config: CLPConfig):
clp_config.validate_logs_input_config()


def validate_output_storage_config(clp_config: CLPConfig):
clp_config.validate_archive_output_config()
clp_config.validate_stream_output_config()

@@ -590,3 +580,13 @@ def validate_dataset_name(clp_table_prefix: str, dataset_name: str) -> None:
f"Invalid dataset name: `{dataset_name}`. Names can only be a maximum of"
f" {dataset_name_max_len} characters long."
)


def is_retention_configured(clp_config: CLPConfig) -> bool:
if clp_config.archive_output.retention_period is not None:
return True

if clp_config.results_cache.retention_period is not None:
return True

return False
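The new `_validate_data_directory` and `validate_log_directory` helpers above collapse the repeated try/except blocks around `validate_path_could_be_dir` into single calls. A minimal, self-contained sketch of that pattern — using a stand-in `validate_path_could_be_dir`, since the real helper lives elsewhere in `clp_package_utils`:

```python
import pathlib


def validate_path_could_be_dir(path: pathlib.Path) -> None:
    # Stand-in for the real helper: reject a path that exists but is not a directory.
    if path.exists() and not path.is_dir():
        raise ValueError(f"{path} exists but is not a directory.")


def validate_log_directory(logs_dir: pathlib.Path, component_name: str) -> None:
    # Wrap the generic check so every component reports a uniform error message.
    try:
        validate_path_could_be_dir(logs_dir)
    except ValueError as ex:
        raise ValueError(f"{component_name} logs directory is invalid: {ex}")


# Per-component validators then shrink to one line each:
validate_log_directory(pathlib.Path("var/log/queue"), "queue")
```

With this in place, each `validate_*_config` function only names its component once, which is exactly the simplification the diff applies to the DB, queue, Redis, reducer, and results-cache validators.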
components/clp-package-utils/clp_package_utils/scripts/native/archive_manager.py
@@ -9,9 +9,8 @@

from clp_py_utils.clp_config import Database
from clp_py_utils.clp_metadata_db_utils import (
get_archive_tags_table_name,
delete_archives_from_metadata_db,
get_archives_table_name,
get_files_table_name,
)
from clp_py_utils.sql_adapter import SQL_Adapter

@@ -325,13 +324,13 @@ def _delete_archives(

archive_ids: typing.List[str]
logger.info("Starting to delete archives from the database.")
try:
sql_adapter: SQL_Adapter = SQL_Adapter(database_config)
clp_db_connection_params: dict[str, any] = (
database_config.get_clp_connection_params_and_type(True)
)
table_prefix = clp_db_connection_params["table_prefix"]
sql_adapter: SQL_Adapter = SQL_Adapter(database_config)
clp_db_connection_params: dict[str, any] = database_config.get_clp_connection_params_and_type(
True
)
table_prefix = clp_db_connection_params["table_prefix"]

try:
with closing(sql_adapter.create_connection(True)) as db_conn, closing(
db_conn.cursor(dictionary=True)
) as db_cursor:
@@ -343,9 +342,8 @@

db_cursor.execute(
f"""
DELETE FROM `{get_archives_table_name(table_prefix, dataset)}`
SELECT id FROM `{get_archives_table_name(table_prefix, dataset)}`
WHERE {query_criteria}
RETURNING id
""",
query_params,
)
@@ -358,21 +356,7 @@
archive_ids: typing.List[str] = [result["id"] for result in results]
delete_handler.validate_results(archive_ids)

ids_list_string: str = ", ".join(["'%s'"] * len(archive_ids))

db_cursor.execute(
f"""
DELETE FROM `{get_files_table_name(table_prefix, dataset)}`
WHERE archive_id in ({ids_list_string})
"""
)

db_cursor.execute(
f"""
DELETE FROM `{get_archive_tags_table_name(table_prefix, dataset)}`
WHERE archive_id in ({ids_list_string})
"""
)
delete_archives_from_metadata_db(db_cursor, archive_ids, table_prefix, dataset)
for archive_id in archive_ids:
logger.info(f"Deleted archive {archive_id} from the database.")

Comment on lines 360 to 362

coderabbitai bot (Contributor), Aug 13, 2025:

🧹 Nitpick (assertive)

Consider batching the deletion logging for better performance.

When deleting many archives, logging each deletion individually could impact performance. Consider logging a summary instead.

-            for archive_id in archive_ids:
-                logger.info(f"Deleted archive {archive_id} from the database.")
+            logger.info(f"Deleted {len(archive_ids)} archives from the database.")
+            logger.debug(f"Deleted archive IDs: {', '.join(archive_ids)}")

haiqi96 (Contributor, Author):

This is unrelated to this PR. will skip.

coderabbitai bot (Contributor):

@haiqi96 Understood! You're absolutely right that logging performance optimization is outside the scope of this retention and garbage collection PR. I'll skip this suggestion to keep the focus on the core functionality being added.

🧠 Learnings used

Learnt from: haiqi96
PR: y-scope/clp#594
File: components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py:104-110
Timestamp: 2024-11-15T16:21:52.122Z
Learning: In `clp_package_utils/scripts/native/del_archives.py`, when deleting archives, the `archive` variable retrieved from the database is controlled and is always a single string without path components. Therefore, it's acceptable to skip additional validation checks for directory traversal in this context.
@@ -385,6 +369,8 @@ def _delete_archives(

except Exception:
logger.exception("Failed to delete archives from the database. Aborting deletion.")
if "db_conn" in locals() and db_conn.is_connected():
db_conn.rollback()
return -1

logger.info(f"Finished deleting archives from the database.")
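This hunk replaces three inlined DELETE statements (against the archives, files, and archive-tags tables) with a single call to `delete_archives_from_metadata_db`. The PR doesn't show that helper's body here, so the following is only a hypothetical sketch of what such a consolidated deletion could look like — it uses sqlite3 `?` placeholders for a runnable illustration (the real code targets MySQL via `SQL_Adapter`, uses the table-name helpers from `clp_metadata_db_utils`, and the `{prefix}{dataset}_*` naming below is an assumption):

```python
import sqlite3
from typing import List


def delete_archives_from_metadata_db(
    db_cursor: sqlite3.Cursor,
    archive_ids: List[str],
    table_prefix: str,
    dataset: str,
) -> None:
    # Hypothetical sketch: remove the archives plus their dependent rows
    # (files and archive tags) in one place instead of three inlined statements.
    placeholders = ", ".join(["?"] * len(archive_ids))
    base = f"{table_prefix}{dataset}_"  # assumed table-naming scheme
    db_cursor.execute(
        f"DELETE FROM `{base}files` WHERE archive_id IN ({placeholders})", archive_ids
    )
    db_cursor.execute(
        f"DELETE FROM `{base}archive_tags` WHERE archive_id IN ({placeholders})", archive_ids
    )
    db_cursor.execute(
        f"DELETE FROM `{base}archives` WHERE id IN ({placeholders})", archive_ids
    )
```

Centralizing the three statements also removes the string-interpolated `ids_list_string` from the old code in favor of parameterized placeholders, which is the safer idiom for ID lists.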
components/clp-package-utils/clp_package_utils/scripts/start_clp.py
@@ -22,6 +22,7 @@
COMPRESSION_WORKER_COMPONENT_NAME,
CONTROLLER_TARGET_NAME,
DB_COMPONENT_NAME,
GARBAGE_COLLECTOR_NAME,
QUERY_JOBS_TABLE_NAME,
QUERY_SCHEDULER_COMPONENT_NAME,
QUERY_WORKER_COMPONENT_NAME,
@@ -54,17 +55,20 @@
get_clp_home,
is_container_exited,
is_container_running,
is_retention_configured,
load_config_file,
validate_and_load_db_credentials_file,
validate_and_load_queue_credentials_file,
validate_and_load_redis_credentials_file,
validate_db_config,
validate_log_directory,
validate_logs_input_config,
validate_output_storage_config,
validate_queue_config,
validate_redis_config,
validate_reducer_config,
validate_results_cache_config,
validate_webui_config,
validate_worker_config,
)

logger = logging.getLogger(__file__)
@@ -1051,6 +1055,88 @@ def start_reducer(
logger.info(f"Started {component_name}.")


def start_garbage_collector(
instance_id: str,
clp_config: CLPConfig,
container_clp_config: CLPConfig,
mounts: CLPDockerMounts,
):
component_name = GARBAGE_COLLECTOR_NAME
Comment on lines +1064 to +1070

coderabbitai bot (Contributor):

🧹 Nitpick (assertive)

Add a concise docstring to start_garbage_collector.

A brief description of purpose and parameters will help future maintainers.

Example:

"""
Start the garbage-collector container if any retention period is configured.

Args:
    instance_id: Short instance identifier used in container names.
    clp_config: Host-side CLPConfig with absolute host paths/credentials.
    container_clp_config: Container-side CLPConfig with container paths.
    mounts: Precomputed Docker mounts for CLP directories and optional AWS config.
"""


if not is_retention_configured(clp_config):
logger.info(f"Retention period is not configured, skipping {component_name} creation...")
return

logger.info(f"Starting {component_name}...")

container_name = f"clp-{component_name}-{instance_id}"
if container_exists(container_name):
return

container_config_filename = f"{container_name}.yml"
container_config_file_path = clp_config.logs_directory / container_config_filename
with open(container_config_file_path, "w") as f:
yaml.safe_dump(container_clp_config.dump_to_primitive_dict(), f)

logs_dir = clp_config.logs_directory / component_name
validate_log_directory(logs_dir, component_name)
# Create logs directory if necessary
logs_dir.mkdir(parents=True, exist_ok=True)
container_logs_dir = container_clp_config.logs_directory / component_name

clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages"

# fmt: off
container_start_cmd = [
"docker", "run",
"-di",
"--network", "host",
"-w", str(CONTAINER_CLP_HOME),
"--name", container_name,
"--log-driver", "local",
"-u", f"{os.getuid()}:{os.getgid()}",
]
# fmt: on

necessary_env_vars = [
f"PYTHONPATH={clp_site_packages_dir}",
f"CLP_HOME={CONTAINER_CLP_HOME}",
f"CLP_LOGS_DIR={container_logs_dir}",
f"CLP_LOGGING_LEVEL={clp_config.garbage_collector.logging_level}",
]
necessary_mounts = [
mounts.clp_home,
mounts.logs_dir,
]

# Add necessary mounts for archives and streams.
if StorageType.FS == clp_config.archive_output.storage.type:
necessary_mounts.append(mounts.archives_output_dir)
if StorageType.FS == clp_config.stream_output.storage.type:
necessary_mounts.append(mounts.stream_output_dir)

aws_mount, aws_env_vars = generate_container_auth_options(clp_config, component_name)
if aws_mount:
necessary_mounts.append(mounts.aws_config_dir)
if aws_env_vars:
necessary_env_vars.extend(aws_env_vars)

append_docker_options(container_start_cmd, necessary_mounts, necessary_env_vars)
container_start_cmd.append(clp_config.execution_container)

# fmt: off
garbage_collector_cmd = [
"python3", "-u",
"-m", "job_orchestration.garbage_collector.garbage_collector",
"--config", str(container_clp_config.logs_directory / container_config_filename),
]
# fmt: on
cmd = container_start_cmd + garbage_collector_cmd
subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)

logger.info(f"Started {component_name}.")
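`start_garbage_collector` follows the same launch recipe as the other component starters in this file: assemble a base `docker run` prefix, append environment variables and mounts, then append the Python module command. A condensed, docker-free sketch of that command assembly (the helper name and mount formatting below are illustrative, not the package's actual API — the real code delegates to `append_docker_options` and `generate_container_auth_options`):

```python
from typing import List, Tuple


def build_container_cmd(
    container_name: str,
    env_vars: List[str],
    mounts: List[Tuple[str, str]],
    image: str,
    module_cmd: List[str],
) -> List[str]:
    # Base docker-run options mirroring the pattern above:
    # detached, host networking, local log driver.
    cmd = [
        "docker", "run", "-di",
        "--network", "host",
        "--name", container_name,
        "--log-driver", "local",
    ]
    for var in env_vars:
        cmd += ["-e", var]  # one -e flag per environment variable
    for host_path, container_path in mounts:
        cmd += ["--mount", f"type=bind,src={host_path},dst={container_path}"]
    cmd.append(image)       # image comes after all options...
    cmd += module_cmd       # ...and the in-container command comes last
    return cmd


cmd = build_container_cmd(
    "clp-garbage-collector-1",
    ["CLP_LOGGING_LEVEL=INFO"],
    [("/var/clp/logs", "/opt/clp/logs")],
    "clp-execution-image",
    ["python3", "-u", "-m", "job_orchestration.garbage_collector.garbage_collector"],
)
```

Building the command as a list and passing it to `subprocess.run` (as the diff does) avoids shell quoting issues with paths and env-var values.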


def add_num_workers_argument(parser):
parser.add_argument(
"--num-workers",
@@ -1087,6 +1173,7 @@ def main(argv):
reducer_server_parser = component_args_parser.add_parser(REDUCER_COMPONENT_NAME)
add_num_workers_argument(reducer_server_parser)
component_args_parser.add_parser(WEBUI_COMPONENT_NAME)
component_args_parser.add_parser(GARBAGE_COLLECTOR_NAME)

parsed_args = args_parser.parse_args(argv[1:])

@@ -1111,6 +1198,7 @@
ALL_TARGET_NAME,
CONTROLLER_TARGET_NAME,
DB_COMPONENT_NAME,
GARBAGE_COLLECTOR_NAME,
COMPRESSION_SCHEDULER_COMPONENT_NAME,
QUERY_SCHEDULER_COMPONENT_NAME,
WEBUI_COMPONENT_NAME,
@@ -1136,12 +1224,18 @@
QUERY_WORKER_COMPONENT_NAME,
):
validate_and_load_redis_credentials_file(clp_config, clp_home, True)
if target in (
ALL_TARGET_NAME,
COMPRESSION_WORKER_COMPONENT_NAME,
):
validate_logs_input_config(clp_config)
if target in (
ALL_TARGET_NAME,
COMPRESSION_WORKER_COMPONENT_NAME,
QUERY_WORKER_COMPONENT_NAME,
GARBAGE_COLLECTOR_NAME,
):
validate_worker_config(clp_config)
validate_output_storage_config(clp_config)

clp_config.validate_data_dir()
clp_config.validate_logs_dir()
@@ -1210,6 +1304,8 @@
start_reducer(instance_id, clp_config, container_clp_config, num_workers, mounts)
if target in (ALL_TARGET_NAME, WEBUI_COMPONENT_NAME):
start_webui(instance_id, clp_config, container_clp_config, mounts)
if target in (ALL_TARGET_NAME, GARBAGE_COLLECTOR_NAME):
start_garbage_collector(instance_id, clp_config, container_clp_config, mounts)

except Exception as ex:
if type(ex) == ValueError:
Changes to the container stop script:
@@ -11,6 +11,7 @@
COMPRESSION_WORKER_COMPONENT_NAME,
CONTROLLER_TARGET_NAME,
DB_COMPONENT_NAME,
GARBAGE_COLLECTOR_NAME,
QUERY_SCHEDULER_COMPONENT_NAME,
QUERY_WORKER_COMPONENT_NAME,
QUEUE_COMPONENT_NAME,
@@ -84,6 +85,7 @@ def main(argv):
component_args_parser.add_parser(COMPRESSION_WORKER_COMPONENT_NAME)
component_args_parser.add_parser(QUERY_WORKER_COMPONENT_NAME)
component_args_parser.add_parser(WEBUI_COMPONENT_NAME)
component_args_parser.add_parser(GARBAGE_COLLECTOR_NAME)

parsed_args = args_parser.parse_args(argv[1:])

@@ -130,6 +132,9 @@

already_exited_containers = []
force = parsed_args.force
if target in (ALL_TARGET_NAME, GARBAGE_COLLECTOR_NAME):
container_name = f"clp-{GARBAGE_COLLECTOR_NAME}-{instance_id}"
stop_running_container(container_name, already_exited_containers, force)
if target in (ALL_TARGET_NAME, WEBUI_COMPONENT_NAME):
container_name = f"clp-{WEBUI_COMPONENT_NAME}-{instance_id}"
stop_running_container(container_name, already_exited_containers, force)