-
Notifications
You must be signed in to change notification settings - Fork 89
feat(job-orchestration): Read compression input metadata from DB for ingestor jobs (addresses #2018) #2082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
feat(job-orchestration): Read compression input metadata from DB for ingestor jobs (addresses #2018) #2082
Changes from 3 commits
ec37878
4322e2f
e7d64de
833c33e
de34c46
89b5d6a
8c77e31
a7cde90
d0fd88f
0e3808e
957dee3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,10 @@ | |
| fetch_existing_datasets, | ||
| ) | ||
| from clp_py_utils.compression import validate_path_and_get_info | ||
| from clp_py_utils.core import read_yaml_config_file | ||
| from clp_py_utils.core import ( | ||
| FileMetadata, | ||
| read_yaml_config_file, | ||
| ) | ||
| from clp_py_utils.s3_utils import s3_get_object_metadata | ||
| from clp_py_utils.sql_adapter import SqlAdapter | ||
| from pydantic import ValidationError | ||
|
|
@@ -38,12 +41,14 @@ | |
| from job_orchestration.scheduler.constants import ( | ||
| CompressionJobStatus, | ||
| CompressionTaskStatus, | ||
| INGESTED_S3_OBJECT_METADATA_TABLE_NAME, | ||
| SchedulerType, | ||
| ) | ||
| from job_orchestration.scheduler.job_config import ( | ||
| ClpIoConfig, | ||
| FsInputConfig, | ||
| InputType, | ||
| LogIngestorSubmittedS3InputConfig, | ||
| S3InputConfig, | ||
| ) | ||
| from job_orchestration.scheduler.scheduler_data import ( | ||
|
|
@@ -183,6 +188,75 @@ def _process_s3_input( | |
| paths_to_compress_buffer.add_file(object_metadata) | ||
|
|
||
|
|
||
def _fetch_ingested_s3_object_metadata(
    metadata_ids: list[int],
    ingestion_job_id: int,
    db_cursor: Any,
) -> list[dict[str, Any]]:
    """
    Retrieves rows from the `INGESTED_S3_OBJECT_METADATA_TABLE_NAME` table matching both the
    requested metadata IDs and the given ingestion job.

    :param metadata_ids: IDs to fetch.
    :param ingestion_job_id: Ingestion job to filter by.
    :param db_cursor: Database cursor for the query.
    :return: List of row dicts with "id", "key", and "size".
    :raises RuntimeError: If no rows are found, or if any requested metadata_id is missing.
    """
    if not metadata_ids:
        return []

    # Parameterized IN-clause: one "%s" placeholder per requested ID, plus one for the job ID.
    id_placeholders = ", ".join("%s" for _ in metadata_ids)
    db_cursor.execute(
        f"SELECT id, `key`, size FROM {INGESTED_S3_OBJECT_METADATA_TABLE_NAME} "
        f"WHERE id IN ({id_placeholders}) AND ingestion_job_id = %s",
        (*metadata_ids, ingestion_job_id),
    )
    rows = db_cursor.fetchall()
    if not rows:
        raise RuntimeError(
            f"No rows found in {INGESTED_S3_OBJECT_METADATA_TABLE_NAME} for the given "
            f"metadata_ids and ingestion_job_id {ingestion_job_id}."
        )

    # Every requested ID must have a matching row; report any that are absent.
    missing_ids = set(metadata_ids) - {row["id"] for row in rows}
    if missing_ids:
        raise RuntimeError(
            f"Missing metadata rows in {INGESTED_S3_OBJECT_METADATA_TABLE_NAME} for "
            f"ingestion_job_id {ingestion_job_id}: {sorted(missing_ids)}."
        )

    return rows
|
|
||
|
|
||
def _process_log_ingestor_submitted_s3_input(
    log_ingestor_submitted_s3_input_config: LogIngestorSubmittedS3InputConfig,
    paths_to_compress_buffer: PathsToCompressBuffer,
    db_context: DbContext,
) -> None:
    """
    Looks up the S3 object metadata rows referenced by the config (by metadata_ids and
    ingestion_job_id) and queues a FileMetadata entry for each row.

    :param log_ingestor_submitted_s3_input_config:
    :param paths_to_compress_buffer:
    :param db_context:
    :raises: Propagates `_fetch_ingested_s3_object_metadata`'s exceptions.
    """
    config = log_ingestor_submitted_s3_input_config
    rows = _fetch_ingested_s3_object_metadata(
        config.metadata_ids, config.ingestion_job_id, db_context.cursor
    )
    for row in rows:
        # NOTE(review): keys are used as-is; confirm whether they should be validated against
        # the expected key prefix before being queued.
        paths_to_compress_buffer.add_file(
            FileMetadata(path=Path(row["key"]), size=int(row["size"]))
        )
|
|
||
|
|
||
| def _write_user_failure_log( | ||
| title: str, | ||
| content: list[str], | ||
|
|
@@ -321,6 +395,24 @@ def search_and_schedule_new_tasks( | |
| }, | ||
| ) | ||
| return | ||
| elif input_type == InputType.INGESTOR.value: | ||
| try: | ||
| _process_log_ingestor_submitted_s3_input( | ||
| input_config, paths_to_compress_buffer, db_context | ||
| ) | ||
| except Exception as err: | ||
| logger.exception( | ||
| "Failed to process log ingestor submitted S3 input for job %s", job_id | ||
| ) | ||
| update_compression_job_metadata( | ||
| db_context, | ||
| job_id, | ||
| { | ||
| "status": CompressionJobStatus.FAILED, | ||
| "status_msg": f"Log ingestor submitted S3 input failure: {err}", | ||
| }, | ||
| ) | ||
| return | ||
| else: | ||
| logger.error(f"Unsupported input type {input_type}") | ||
| update_compression_job_metadata( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |
class InputType(LowercaseStrEnum):
    # Supported compression-input sources. As a LowercaseStrEnum, each member's value is
    # presumably its lowercased name (e.g. "fs", "s3", "ingestor") — confirm against the
    # LowercaseStrEnum definition.
    FS = auto()
    S3 = auto()
    # S3 objects previously recorded by the log ingestor, referenced via DB metadata rows.
    INGESTOR = auto()
|
|
||
|
|
||
| class PathsToCompress(BaseModel): | ||
|
|
@@ -44,6 +45,22 @@ def validate_keys(cls, value): | |
| return value | ||
|
|
||
|
|
||
class LogIngestorSubmittedS3InputConfig(S3Config):
    # S3 input submitted by the log ingestor: objects are referenced by the IDs of
    # previously-ingested metadata rows rather than enumerated inline.
    type: Literal[InputType.INGESTOR.value] = InputType.INGESTOR.value
    ingestion_job_id: int
    dataset: str | None = None
    timestamp_key: str | None = None
    unstructured: bool = False
    metadata_ids: list[int]

    @field_validator("metadata_ids")
    @classmethod
    def validate_metadata_ids_non_empty(cls, value: list[int]) -> list[int]:
        # An empty ID list would make the job a silent no-op; reject it at validation time.
        if value:
            return value
        raise ValueError("metadata_ids cannot be an empty list")
|
|
||
|
|
||
| class OutputConfig(BaseModel): | ||
| target_archive_size: int | ||
| target_dictionaries_size: int | ||
|
|
@@ -53,7 +70,7 @@ class OutputConfig(BaseModel): | |
|
|
||
|
|
||
class ClpIoConfig(BaseModel):
    # Top-level I/O settings for a compression job: the input union is discriminated by each
    # config's `type` field value at validation time.
    input: FsInputConfig | S3InputConfig | LogIngestorSubmittedS3InputConfig
    output: OutputConfig
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is repeated quite a few times - shall we create a helper like