44# stored procedures against one or more datasets in the ingestion workflow.
55
66import logging
7- import subprocess
87import sys
98import traceback
109
@@ -73,6 +72,11 @@ def sp_rollback_exception_handler(exc_type, exc_value, exc_traceback):
7372env = get_stored_procedures_env ()
7473_logger .info (f"Stored procedures execution environment: { env } " )
7574
75+ if env .file_format_mode != ClinVarIngestFileFormat .SP .value :
76+ msg = f"stored-procedure workflow got unexpected file_format_mode: { env .file_format_mode } "
77+ _logger .warning (msg )
78+ raise ValueError (msg )
79+
7680################################################################
7781#
7882processing_history_table = processing_history .ensure_initialized (client = _get_bq_client ())
@@ -85,17 +89,20 @@ def sp_rollback_exception_handler(exc_type, exc_value, exc_traceback):
8589processed_entries_needing_sp_run = processing_history .processed_entries_ready_for_sp_processing (
8690 processing_history_view , client = _get_bq_client ()
8791)
88- msg = f"Found { processed_entries_needing_sp_run .total_rows } datasets to run stored procedures on."
92+ total_rows = processed_entries_needing_sp_run .total_rows
93+ rows_needing_sp_run = list (processed_entries_needing_sp_run )
94+ release_dates_str = ", " .join (r .get ("release_date" ).isoformat () for r in rows_needing_sp_run )
95+ msg = f"Found { total_rows } datasets to run stored procedures on. ({ release_dates_str } )"
8996_logger .info (msg )
9097
91- if not processed_entries_needing_sp_run . total_rows :
98+ if not total_rows :
9299 sys .exit (0 )
93100
94101send_slack_message (msg )
95102
96103# update processing_history.bq_ingest_started for ALL processing_history_view
97104rows_to_ingest = []
98- for row in processed_entries_needing_sp_run :
105+ for row in rows_needing_sp_run :
99106 rows_to_ingest .append (row )
100107 vcv_pipeline_version = row .get ("vcv_pipeline_version" , None )
101108 vcv_xml_release_date = row .get ("vcv_xml_release_date" , None )
@@ -131,12 +138,13 @@ def sp_rollback_exception_handler(exc_type, exc_value, exc_traceback):
131138# Now process individual rows
132139for row in rows_to_ingest :
133140 _logger .info (row )
134- release_date = row .get ("release_date" , None )
135- vcv_pipeline_version = row .get ("vcv_pipeline_version" , None )
136- vcv_release_date = row .get ("vcv_release_date" , None )
137- vcv_xml_release_date = row .get ("vcv_xml_release_date" , None )
138- vcv_bucket_dir = row .get ("vcv_bucket_dir" , None )
139- schema_version = row .get ("vcv_schema_version" , None )
141+ # required
142+ release_date = row ["release_date" ]
143+ vcv_pipeline_version = row ["vcv_pipeline_version" ]
144+ vcv_xml_release_date = row ["vcv_xml_release_date" ]
145+ vcv_bucket_dir = row ["vcv_bucket_dir" ]
146+ # optional
147+ schema_version = row .get ("vcv_schema_version" )
140148
141149 msg = f"Executing stored procedures on dataset dated { release_date } "
142150 _logger .info (msg )
@@ -154,13 +162,13 @@ def sp_rollback_exception_handler(exc_type, exc_value, exc_traceback):
154162 client = _get_bq_client (),
155163 )
156164 msg = f"""
157- Stored procedure execution successful for release dated { vcv_xml_release_date = } { vcv_pipeline_version = } release_tag={ env .release_tag } .
165+ Stored procedure execution successful for release dated vcv_xml_release_date={ vcv_xml_release_date . isoformat () } { vcv_pipeline_version = } release_tag={ env .release_tag } .
158166 """
159167 _logger .info (msg )
160168 send_slack_message (msg )
161169 except Exception as e :
162170 msg = f"""
163- Stored procedure execution failed for release dated { vcv_xml_release_date = } { vcv_pipeline_version = } release_tag={ env .release_tag } .
171+ Stored procedure execution failed for release dated vcv_xml_release_date={ vcv_xml_release_date . isoformat () } { vcv_pipeline_version = } release_tag={ env .release_tag } .
164172 """
165173 _logger .error (msg )
166174 send_slack_message (msg )
@@ -173,22 +181,22 @@ def sp_rollback_exception_handler(exc_type, exc_value, exc_traceback):
173181 if row ["xml_release_date" ] != str (vcv_xml_release_date ) or row ["release_tag" ] != env .release_tag
174182 ]
175183
176- dataset_id = row . get ( "final_dataset_id" )
184+ dataset_id = row [ "final_dataset_id" ]
177185
178- vi_gs_url = f"gs://clinvar-gks/{ release_date } /dev/vi.jsonl.gz"
179- cmd = f"""
180- bq extract \
181- --destination_format NEWLINE_DELIMITED_JSON \
182- --compression GZIP \
183- '{ dataset_id } .variation_identity' \
184- { vi_gs_url }
185- """
186+ vi_gs_url = f"gs://{ env .clinvar_gks_bucket } /{ release_date } /dev/vi.jsonl.gz"
186187 try :
187- subprocess .run (cmd , shell = True , check = True , capture_output = True , text = True )
188+ client = _get_bq_client ()
189+ table_id = f"{ dataset_id } .variation_identity"
190+ job_config = bigquery .ExtractJobConfig (
191+ destination_format = bigquery .DestinationFormat .NEWLINE_DELIMITED_JSON , compression = bigquery .Compression .GZIP
192+ )
193+ extract_job = client .extract_table (table_id , vi_gs_url , job_config = job_config )
194+ extract_job .result (timeout = 1800 ) # Wait for the job to complete (30 minute timeout)
188195 msg = f"Successfully exported variation_identity file to { vi_gs_url } "
189196 _logger .info (msg )
190197 send_slack_message (msg )
191- except subprocess .CalledProcessError as e :
192- raise RuntimeError (
193- f"Command failed: { e .cmd } \n Return code: { e .returncode } \n Stdout: { e .stdout } \n Stderr: { e .stderr } "
194- )
198+ except Exception as e :
199+ error_msg = f"BigQuery extract job to { vi_gs_url } failed: { e } "
200+ _logger .error (error_msg )
201+ send_slack_message (error_msg )
202+ raise e
0 commit comments