-
Notifications
You must be signed in to change notification settings - Fork 83
feat(package): Add garbage collectors to delete archives and search results based on a configurable retention policy. #1035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 119 commits
73f0d69
bc4c464
e6238a2
653f3dc
a269a97
1742ff0
efbb47b
5714945
b1fe0d4
a5f5a3b
53d7417
ccd23dc
5bb3685
bc688b9
0c8c6f6
0d186e6
75ac0ff
bb1e5f4
ba7cfe1
68454c6
5eccaaf
c1de746
f08802b
d797198
c5dc9b9
8c39e77
ea4318e
2c97441
2eff448
d570ab6
06332f4
d5e8e28
8a79b9b
8c77119
3a1afb2
73d76ac
5745e65
f3ba8b0
e566e74
db9a508
2f0f95a
e310102
a332799
cb53857
85b7823
398ab5e
3209ddd
3c5b0e4
fb41607
386453b
945c97b
1845462
bed13df
0d8d679
d40e773
7759a7a
271b8b3
e6b8cc7
c0b8563
7a468c3
1dd1cea
a0c3c29
a9bf615
fe05f5f
39a9278
7124828
eb80992
5ed44e7
af6b508
67fb01f
d6ad4de
ff7d700
7ffc77c
0255cbd
1076a3f
71c4d82
6bd9372
84df2e2
983bea1
bdb7817
a82a267
f699496
94e8ca1
90ce0a4
d6f9e5a
dc6a706
76bcb4a
a4e6f83
3c53cb0
66eba87
7b42568
097e47c
8dc8e26
afe43ce
85a3164
af75118
e5e90f7
bac6767
de1c334
9fdb3d5
2245244
b1e5a2c
4e93a30
6719872
f9fa626
450e16a
ade2e27
f1584ff
2c57dd6
5f479c5
8c5fb89
11e695f
1291c3f
f8c7369
9b48c9b
6cff24d
c367c15
e282020
b93bb4b
390333f
a4546cf
2c4821a
9d5d087
74af600
34a06a5
7633a6c
bd48ca5
8ca206e
ee9dd59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -9,9 +9,8 @@ | |||||||||
|
|
||||||||||
| from clp_py_utils.clp_config import Database | ||||||||||
| from clp_py_utils.clp_metadata_db_utils import ( | ||||||||||
| get_archive_tags_table_name, | ||||||||||
| delete_archives_from_metadata_db, | ||||||||||
| get_archives_table_name, | ||||||||||
| get_files_table_name, | ||||||||||
| ) | ||||||||||
| from clp_py_utils.sql_adapter import SQL_Adapter | ||||||||||
|
|
||||||||||
|
|
@@ -325,13 +324,13 @@ def _delete_archives( | |||||||||
|
|
||||||||||
| archive_ids: typing.List[str] | ||||||||||
| logger.info("Starting to delete archives from the database.") | ||||||||||
| try: | ||||||||||
| sql_adapter: SQL_Adapter = SQL_Adapter(database_config) | ||||||||||
| clp_db_connection_params: dict[str, any] = ( | ||||||||||
| database_config.get_clp_connection_params_and_type(True) | ||||||||||
| ) | ||||||||||
| table_prefix = clp_db_connection_params["table_prefix"] | ||||||||||
| sql_adapter: SQL_Adapter = SQL_Adapter(database_config) | ||||||||||
| clp_db_connection_params: dict[str, any] = database_config.get_clp_connection_params_and_type( | ||||||||||
| True | ||||||||||
| ) | ||||||||||
| table_prefix = clp_db_connection_params["table_prefix"] | ||||||||||
haiqi96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
|
|
||||||||||
| try: | ||||||||||
| with closing(sql_adapter.create_connection(True)) as db_conn, closing( | ||||||||||
| db_conn.cursor(dictionary=True) | ||||||||||
| ) as db_cursor: | ||||||||||
|
|
@@ -343,9 +342,8 @@ def _delete_archives( | |||||||||
|
|
||||||||||
| db_cursor.execute( | ||||||||||
| f""" | ||||||||||
| DELETE FROM `{get_archives_table_name(table_prefix, dataset)}` | ||||||||||
| SELECT id FROM `{get_archives_table_name(table_prefix, dataset)}` | ||||||||||
| WHERE {query_criteria} | ||||||||||
| RETURNING id | ||||||||||
| """, | ||||||||||
| query_params, | ||||||||||
| ) | ||||||||||
|
|
@@ -358,21 +356,7 @@ def _delete_archives( | |||||||||
| archive_ids: typing.List[str] = [result["id"] for result in results] | ||||||||||
| delete_handler.validate_results(archive_ids) | ||||||||||
|
|
||||||||||
| ids_list_string: str = ", ".join(["'%s'"] * len(archive_ids)) | ||||||||||
|
|
||||||||||
| db_cursor.execute( | ||||||||||
| f""" | ||||||||||
| DELETE FROM `{get_files_table_name(table_prefix, dataset)}` | ||||||||||
| WHERE archive_id in ({ids_list_string}) | ||||||||||
| """ | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| db_cursor.execute( | ||||||||||
| f""" | ||||||||||
| DELETE FROM `{get_archive_tags_table_name(table_prefix, dataset)}` | ||||||||||
| WHERE archive_id in ({ids_list_string}) | ||||||||||
| """ | ||||||||||
| ) | ||||||||||
| delete_archives_from_metadata_db(db_cursor, archive_ids, table_prefix, dataset) | ||||||||||
| for archive_id in archive_ids: | ||||||||||
| logger.info(f"Deleted archive {archive_id} from the database.") | ||||||||||
|
|
||||||||||
|
Comment on lines
360
to
362
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick (assertive) Consider batching the deletion logging for better performance. When deleting many archives, logging each deletion individually could impact performance. Consider logging a summary instead. - for archive_id in archive_ids:
- logger.info(f"Deleted archive {archive_id} from the database.")
+ logger.info(f"Deleted {len(archive_ids)} archives from the database.")
+ logger.debug(f"Deleted archive IDs: {', '.join(archive_ids)}")📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is unrelated to this PR. will skip.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @haiqi96 Understood! You're absolutely right that logging performance optimization is outside the scope of this retention and garbage collection PR. I'll skip this suggestion to keep the focus on the core functionality being added. 🧠 Learnings used |
||||||||||
|
|
@@ -385,6 +369,8 @@ def _delete_archives( | |||||||||
|
|
||||||||||
| except Exception: | ||||||||||
| logger.exception("Failed to delete archives from the database. Aborting deletion.") | ||||||||||
| if db_conn in locals() and db_conn.is_connected(): | ||||||||||
| db_conn.rollback() | ||||||||||
| return -1 | ||||||||||
|
|
||||||||||
| logger.info(f"Finished deleting archives from the database.") | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| COMPRESSION_WORKER_COMPONENT_NAME, | ||
| CONTROLLER_TARGET_NAME, | ||
| DB_COMPONENT_NAME, | ||
| GARBAGE_COLLECTOR_NAME, | ||
| QUERY_JOBS_TABLE_NAME, | ||
| QUERY_SCHEDULER_COMPONENT_NAME, | ||
| QUERY_WORKER_COMPONENT_NAME, | ||
|
|
@@ -54,17 +55,20 @@ | |
| get_clp_home, | ||
| is_container_exited, | ||
| is_container_running, | ||
| is_retention_configured, | ||
| load_config_file, | ||
| validate_and_load_db_credentials_file, | ||
| validate_and_load_queue_credentials_file, | ||
| validate_and_load_redis_credentials_file, | ||
| validate_db_config, | ||
| validate_log_directory, | ||
| validate_logs_input_config, | ||
| validate_output_storage_config, | ||
| validate_queue_config, | ||
| validate_redis_config, | ||
| validate_reducer_config, | ||
| validate_results_cache_config, | ||
| validate_webui_config, | ||
| validate_worker_config, | ||
| ) | ||
|
|
||
| logger = logging.getLogger(__file__) | ||
|
|
@@ -1051,6 +1055,88 @@ def start_reducer( | |
| logger.info(f"Started {component_name}.") | ||
|
|
||
|
|
||
| def start_garbage_collector( | ||
| instance_id: str, | ||
| clp_config: CLPConfig, | ||
| container_clp_config: CLPConfig, | ||
| mounts: CLPDockerMounts, | ||
| ): | ||
| component_name = GARBAGE_COLLECTOR_NAME | ||
|
Comment on lines
+1064
to
+1070
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick (assertive) Add a concise docstring to start_garbage_collector A brief description of purpose and parameters will help future maintainers. Example: """ Args: 🤖 Prompt for AI Agents |
||
|
|
||
| if not is_retention_configured(clp_config): | ||
| logger.info(f"Retention period is not configured, skipping {component_name} creation...") | ||
| return | ||
|
|
||
| logger.info(f"Starting {component_name}...") | ||
|
|
||
| container_name = f"clp-{component_name}-{instance_id}" | ||
| if container_exists(container_name): | ||
| return | ||
|
|
||
| container_config_filename = f"{container_name}.yml" | ||
| container_config_file_path = clp_config.logs_directory / container_config_filename | ||
| with open(container_config_file_path, "w") as f: | ||
| yaml.safe_dump(container_clp_config.dump_to_primitive_dict(), f) | ||
haiqi96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| logs_dir = clp_config.logs_directory / component_name | ||
| validate_log_directory(logs_dir, component_name) | ||
| # Create logs directory if necessary | ||
| logs_dir.mkdir(parents=True, exist_ok=True) | ||
| container_logs_dir = container_clp_config.logs_directory / component_name | ||
|
|
||
| clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" | ||
|
|
||
| # fmt: off | ||
| container_start_cmd = [ | ||
| "docker", "run", | ||
| "-di", | ||
| "--network", "host", | ||
| "-w", str(CONTAINER_CLP_HOME), | ||
| "--name", container_name, | ||
| "--log-driver", "local", | ||
| "-u", f"{os.getuid()}:{os.getgid()}", | ||
| ] | ||
| # fmt: on | ||
|
|
||
| necessary_env_vars = [ | ||
| f"PYTHONPATH={clp_site_packages_dir}", | ||
| f"CLP_HOME={CONTAINER_CLP_HOME}", | ||
| f"CLP_LOGS_DIR={container_logs_dir}", | ||
| f"CLP_LOGGING_LEVEL={clp_config.garbage_collector.logging_level}", | ||
| ] | ||
| necessary_mounts = [ | ||
| mounts.clp_home, | ||
| mounts.logs_dir, | ||
| ] | ||
|
|
||
| # Add necessary mounts for archives and streams. | ||
| if StorageType.FS == clp_config.archive_output.storage.type: | ||
| necessary_mounts.append(mounts.archives_output_dir) | ||
| if StorageType.FS == clp_config.stream_output.storage.type: | ||
| necessary_mounts.append(mounts.stream_output_dir) | ||
|
|
||
LinZhihao-723 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| aws_mount, aws_env_vars = generate_container_auth_options(clp_config, component_name) | ||
| if aws_mount: | ||
| necessary_mounts.append(mounts.aws_config_dir) | ||
| if aws_env_vars: | ||
| necessary_env_vars.extend(aws_env_vars) | ||
|
|
||
| append_docker_options(container_start_cmd, necessary_mounts, necessary_env_vars) | ||
| container_start_cmd.append(clp_config.execution_container) | ||
|
|
||
| # fmt: off | ||
| garbage_collector_cmd = [ | ||
| "python3", "-u", | ||
| "-m", "job_orchestration.garbage_collector.garbage_collector", | ||
| "--config", str(container_clp_config.logs_directory / container_config_filename), | ||
| ] | ||
| # fmt: on | ||
| cmd = container_start_cmd + garbage_collector_cmd | ||
| subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) | ||
|
|
||
| logger.info(f"Started {component_name}.") | ||
haiqi96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
| def add_num_workers_argument(parser): | ||
| parser.add_argument( | ||
| "--num-workers", | ||
|
|
@@ -1087,6 +1173,7 @@ def main(argv): | |
| reducer_server_parser = component_args_parser.add_parser(REDUCER_COMPONENT_NAME) | ||
| add_num_workers_argument(reducer_server_parser) | ||
| component_args_parser.add_parser(WEBUI_COMPONENT_NAME) | ||
| component_args_parser.add_parser(GARBAGE_COLLECTOR_NAME) | ||
|
|
||
| parsed_args = args_parser.parse_args(argv[1:]) | ||
|
|
||
|
|
@@ -1111,6 +1198,7 @@ def main(argv): | |
| ALL_TARGET_NAME, | ||
| CONTROLLER_TARGET_NAME, | ||
| DB_COMPONENT_NAME, | ||
| GARBAGE_COLLECTOR_NAME, | ||
| COMPRESSION_SCHEDULER_COMPONENT_NAME, | ||
| QUERY_SCHEDULER_COMPONENT_NAME, | ||
| WEBUI_COMPONENT_NAME, | ||
|
|
@@ -1136,12 +1224,18 @@ def main(argv): | |
| QUERY_WORKER_COMPONENT_NAME, | ||
| ): | ||
| validate_and_load_redis_credentials_file(clp_config, clp_home, True) | ||
| if target in ( | ||
| ALL_TARGET_NAME, | ||
| COMPRESSION_WORKER_COMPONENT_NAME, | ||
| ): | ||
| validate_logs_input_config(clp_config) | ||
haiqi96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if target in ( | ||
| ALL_TARGET_NAME, | ||
| COMPRESSION_WORKER_COMPONENT_NAME, | ||
| QUERY_WORKER_COMPONENT_NAME, | ||
| GARBAGE_COLLECTOR_NAME, | ||
| ): | ||
| validate_worker_config(clp_config) | ||
| validate_output_storage_config(clp_config) | ||
|
|
||
| clp_config.validate_data_dir() | ||
| clp_config.validate_logs_dir() | ||
|
|
@@ -1210,6 +1304,8 @@ def main(argv): | |
| start_reducer(instance_id, clp_config, container_clp_config, num_workers, mounts) | ||
| if target in (ALL_TARGET_NAME, WEBUI_COMPONENT_NAME): | ||
| start_webui(instance_id, clp_config, container_clp_config, mounts) | ||
| if target in (ALL_TARGET_NAME, GARBAGE_COLLECTOR_NAME): | ||
| start_garbage_collector(instance_id, clp_config, container_clp_config, mounts) | ||
|
|
||
| except Exception as ex: | ||
| if type(ex) == ValueError: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.