@@ -50,6 +50,10 @@ workflow GvsImportGenomes {
5050 # Dump these Parquet files to a bucket.
5151 String ? parquet_output_gcs_dir
5252
53+ # Delete parquet files from GCS after successfully loading them into BigQuery
54+ Boolean delete_parquet_files_after_loading = true
55+ Boolean use_alternate_parquet_delete_strategy = false
56+
5357 Boolean is_wgs = true
5458 }
5559
@@ -293,6 +297,16 @@ workflow GvsImportGenomes {
293297 go = LoadParquetFilesToBQ .done ,
294298 variants_docker = effective_variants_docker ,
295299 }
300+
301+ if (delete_parquet_files_after_loading && VerifyParquetLoading .all_loaded ) {
302+ call DeleteParquetFiles {
303+ input :
304+ output_gcs_dir = defined_parquet_output_dir ,
305+ use_alternate_delete_strategy = use_alternate_parquet_delete_strategy ,
306+ billing_project_id = billing_project_id ,
307+ cloud_sdk_docker = effective_cloud_sdk_docker ,
308+ }
309+ }
296310 }
297311
298312 call SetIsLoadedColumn {
@@ -1311,3 +1325,54 @@ task VerifyParquetLoading {
13111325 Boolean done = true
13121326 }
13131327}
1328+
# Deletes the Parquet files found under `output_gcs_dir` in GCS. The workflow
# calls this only after the Parquet files have been loaded into BigQuery.
#
# Two deletion strategies are supported:
#   * default: a single recursive `gcloud storage rm` over every object
#     matching `<output_gcs_dir>/**/*.parquet`.
#   * alternate (`use_alternate_delete_strategy = true`): list the entries
#     under `vet/` and `ref_ranges/`, add `sample_chromosome_ploidy/`, and
#     recursively delete each listed path one at a time.
task DeleteParquetFiles {
    input {
        # GCS directory holding the Parquet output; one trailing slash is tolerated.
        String output_gcs_dir
        # When true, delete path-by-path instead of with a single wildcard delete.
        Boolean use_alternate_delete_strategy = false

        # Optional project to bill for the GCS requests; omitted from the
        # gcloud command line when not supplied.
        String? billing_project_id
        # Docker image providing the `gcloud` CLI.
        String cloud_sdk_docker
    }

    command <<<
        PS4='\D{+%F %T} \w $ '
        set -o errexit -o nounset -o xtrace -o pipefail

        # Normalize GCS path by removing any trailing slash
        OUTPUT_GCS_DIR="~{output_gcs_dir}"
        OUTPUT_GCS_DIR="${OUTPUT_GCS_DIR%/}"

        if [ "~{use_alternate_delete_strategy}" = "false" ]; then
            # The wildcard is kept inside quotes so that it is expanded by gcloud, not by the shell.
            gcloud storage rm --recursive ~{"--billing-project " + billing_project_id} "${OUTPUT_GCS_DIR}/**/*.parquet"
        else
            # List the contents of the vet and ref_ranges directories for subsequent deletion in the loop below.
            # sample_chromosome_ploidy is not listed; the directory itself is appended as a single entry.
            echo "Listing directories under ${OUTPUT_GCS_DIR}/vet/ and ${OUTPUT_GCS_DIR}/ref_ranges/ ${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/ for deletion..."
            gcloud storage ls ~{"--billing-project " + billing_project_id} \
                "${OUTPUT_GCS_DIR}/vet/" "${OUTPUT_GCS_DIR}/ref_ranges/" > parquet_dirs.txt
            echo "${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/" >> parquet_dirs.txt

            # Iterate over all Google Cloud paths in parquet_dirs.txt and delete all objects therein
            echo "Deleting Parquet files..."
            while IFS= read -r gcs_path; do
                # Skip blank lines in the listing output.
                if [ -n "$gcs_path" ]; then
                    echo "Deleting objects in: $gcs_path"
                    gcloud storage rm ~{"--billing-project " + billing_project_id} "$gcs_path" --recursive
                fi
            done < parquet_dirs.txt
        fi

        echo "✓ Completed deletion of Parquet files."

    >>>
    output {
        # Constant completion marker that downstream steps can depend on.
        Boolean done = true
    }

    # NOTE(review): with `preemptible` retries, a retry after a partially completed
    # deletion may find nothing left to delete; confirm how `gcloud storage rm`
    # reports that case under `errexit`.
    runtime {
        docker: cloud_sdk_docker
        memory: "3 GB"
        disks: "local-disk 100 HDD"
        preemptible: 3
        cpu: 1
    }
}
0 commit comments