Skip to content

Commit 71a586d

Browse files
authored
VS-1779 clean up parquet files immediatement (#9338)
This PR adds a task that deletes the Parquet files once they are no longer needed. Because there was disagreement about the best way to delete large numbers of files, it also supports an alternate deletion strategy.
1 parent 6f1c838 commit 71a586d

File tree

2 files changed

+65
-1
lines changed

2 files changed

+65
-1
lines changed

.dockstore.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,6 @@ workflows:
233233
branches:
234234
- master
235235
- ah_var_store
236-
- gg_VS-1794_ParquetRemovalStrategy
237236
- vs_1799_fix_parquet_exome_npe
238237
- vs_1809_parquet_ploidy
239238
tags:

scripts/variantstore/wdl/GvsImportGenomes.wdl

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ workflow GvsImportGenomes {
5050
# Dump these Parquet files to a bucket.
5151
String? parquet_output_gcs_dir
5252

53+
# Delete parquet files from GCS after successfully loading them into BigQuery
54+
Boolean delete_parquet_files_after_loading = true
55+
Boolean use_alternate_parquet_delete_strategy = false
56+
5357
Boolean is_wgs = true
5458
}
5559

@@ -293,6 +297,16 @@ workflow GvsImportGenomes {
293297
go = LoadParquetFilesToBQ.done,
294298
variants_docker = effective_variants_docker,
295299
}
300+
301+
if (delete_parquet_files_after_loading && VerifyParquetLoading.all_loaded) {
302+
call DeleteParquetFiles {
303+
input:
304+
output_gcs_dir = defined_parquet_output_dir,
305+
use_alternate_delete_strategy = use_alternate_parquet_delete_strategy,
306+
billing_project_id = billing_project_id,
307+
cloud_sdk_docker = effective_cloud_sdk_docker,
308+
}
309+
}
296310
}
297311
298312
call SetIsLoadedColumn {
@@ -1311,3 +1325,54 @@ task VerifyParquetLoading {
13111325
Boolean done = true
13121326
}
13131327
}
1328+
1329+
task DeleteParquetFiles {
1330+
input {
1331+
String output_gcs_dir
1332+
Boolean use_alternate_delete_strategy = false
1333+
1334+
String? billing_project_id
1335+
String cloud_sdk_docker
1336+
}
1337+
1338+
command <<<
1339+
PS4='\D{+%F %T} \w $ '
1340+
set -o errexit -o nounset -o xtrace -o pipefail
1341+
1342+
# Normalize GCS path by removing any trailing slash
1343+
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')
1344+
1345+
if [ "~{use_alternate_delete_strategy}" = "false" ]; then
1346+
gcloud storage rm --recursive ~{"--billing-project " + billing_project_id} "${OUTPUT_GCS_DIR}/"'**/*.parquet'
1347+
else
1348+
# List the contents of the vet and ref_ranges directories for subsequent deletion in the loop below
1349+
echo "Listing directories under ${OUTPUT_GCS_DIR}/vet/ and ${OUTPUT_GCS_DIR}/ref_ranges/ ${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/ for deletion..."
1350+
gcloud storage ls ~{"--billing-project " + billing_project_id} \
1351+
"${OUTPUT_GCS_DIR}/vet/" "${OUTPUT_GCS_DIR}/ref_ranges/" > parquet_dirs.txt
1352+
echo "${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/" >> parquet_dirs.txt
1353+
1354+
# Iterate over all Google Cloud paths in parquet_dirs.txt and delete all objects therein
1355+
echo "Deleting Parquet files..."
1356+
while IFS= read -r gcs_path; do
1357+
if [ -n "$gcs_path" ]; then
1358+
echo "Deleting objects in: $gcs_path"
1359+
gcloud storage rm ~{"--billing-project " + billing_project_id} "$gcs_path" --recursive
1360+
fi
1361+
done < parquet_dirs.txt
1362+
fi
1363+
1364+
echo "✓ Completed deletion of Parquet files."
1365+
1366+
>>>
1367+
output {
1368+
Boolean done = true
1369+
}
1370+
1371+
runtime {
1372+
docker: cloud_sdk_docker
1373+
memory: "3 GB"
1374+
disks: "local-disk 100 HDD"
1375+
preemptible: 3
1376+
cpu: 1
1377+
}
1378+
}

0 commit comments

Comments
 (0)