Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
41853ff
WIP
gbggrant Feb 25, 2026
a59371b
Initial cruft
gbggrant Feb 25, 2026
e15571d
Test before live fire
gbggrant Feb 26, 2026
e215f37
slight update
gbggrant Feb 26, 2026
8981039
The real show.
gbggrant Feb 26, 2026
1c9a790
Merge remote-tracking branch 'origin/VS-1736' into gg_VS-1779_CleanUp…
gbggrant Feb 26, 2026
3471d89
Address copilot's code review suggestions.
gbggrant Feb 26, 2026
d977fcf
Another thing
gbggrant Feb 26, 2026
9006292
some more
gbggrant Feb 26, 2026
070fc29
Merge remote-tracking branch 'origin/VS-1736' into gg_VS-1779_CleanUp…
gbggrant Mar 4, 2026
255c487
Code review suggestions.
gbggrant Mar 4, 2026
ad3204d
Point to new branch in dockstore.
gbggrant Mar 4, 2026
6a20fe4
Last
gbggrant Mar 4, 2026
5c92904
Update scripts/variantstore/wdl/GvsImportGenomes.wdl
gbggrant Mar 5, 2026
f6c7380
Update scripts/variantstore/wdl/GvsImportGenomes.wdl
gbggrant Mar 5, 2026
a4a0134
Merge remote-tracking branch 'origin/VS-1736' into gg_VS-1779_CleanUp…
gbggrant Mar 6, 2026
7727358
Merge remote-tracking branch 'origin/VS-1736' into gg_VS-1779_CleanUp…
gbggrant Mar 9, 2026
a3eeb1d
Change to doing a simple rm -r on the whole directory
gbggrant Mar 9, 2026
a2e1833
Allow for alternate deletion strategy
gbggrant Mar 9, 2026
27ed4b9
Push up the flag to allow for alternate deletion strategy.
gbggrant Mar 9, 2026
8f10595
Push up the flag to allow for alternate deletion strategy.
gbggrant Mar 9, 2026
db066cc
I missed this one.
gbggrant Mar 9, 2026
1845747
MOre
gbggrant Mar 9, 2026
a114eb8
Last details
gbggrant Mar 9, 2026
e8faf9a
One last little thing
gbggrant Mar 10, 2026
38ad4ce
Cleanup
gbggrant Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1794_ParquetRemovalStrategy
- vs_1799_fix_parquet_exome_npe
- vs_1809_parquet_ploidy
tags:
Expand Down
65 changes: 65 additions & 0 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ workflow GvsImportGenomes {
# Dump these Parquet files to a bucket.
String? parquet_output_gcs_dir

# Delete parquet files from GCS after successfully loading them into BigQuery
Boolean delete_parquet_files_after_loading = true
Boolean use_alternate_parquet_delete_strategy = false

Boolean is_wgs = true
}

Expand Down Expand Up @@ -293,6 +297,16 @@ workflow GvsImportGenomes {
go = LoadParquetFilesToBQ.done,
variants_docker = effective_variants_docker,
}

if (delete_parquet_files_after_loading && VerifyParquetLoading.all_loaded) {
call DeleteParquetFiles {
input:
output_gcs_dir = defined_parquet_output_dir,
use_alternate_delete_strategy = use_alternate_parquet_delete_strategy,
billing_project_id = billing_project_id,
cloud_sdk_docker = effective_cloud_sdk_docker,
}
}
}

call SetIsLoadedColumn {
Expand Down Expand Up @@ -1311,3 +1325,54 @@ task VerifyParquetLoading {
Boolean done = true
}
}

# Deletes Parquet files found under `output_gcs_dir` in GCS.
#
# Inputs:
#   output_gcs_dir                - GCS directory holding the Parquet output; a single trailing slash is tolerated.
#   use_alternate_delete_strategy - false (default): one `gcloud storage rm` over every `*.parquet` object
#                                   beneath the directory. true: list the entries directly under `vet/` and
#                                   `ref_ranges/`, add `sample_chromosome_ploidy/`, and recursively remove each
#                                   listed path one at a time.
#   billing_project_id            - optional; when defined it is passed to gcloud as `--billing-project`.
#   cloud_sdk_docker              - image providing `gcloud`.
#
# Output:
#   done - always true; exists only so downstream calls can sequence on completion of this task.
#
# Any failing gcloud command aborts the task because of `errexit`/`pipefail`.
task DeleteParquetFiles {
  input {
    String output_gcs_dir
    Boolean use_alternate_delete_strategy = false

    String? billing_project_id
    String cloud_sdk_docker
  }

  meta {
    # This task exists purely for its side effect (deleting objects) and its only output is a constant.
    # A call-cache hit would skip the deletion while still reporting success, so never cache it.
    volatile: true
  }

  command <<<
    # Prefix xtrace output with a timestamp and the working directory.
    PS4='\D{+%F %T} \w $ '
    set -o errexit -o nounset -o xtrace -o pipefail

    # Normalize GCS path by removing any trailing slash.
    # The interpolated value is quoted so the path is never subject to word splitting or globbing.
    OUTPUT_GCS_DIR="~{output_gcs_dir}"
    OUTPUT_GCS_DIR="${OUTPUT_GCS_DIR%/}"

    if [ "~{use_alternate_delete_strategy}" = "false" ]; then
      # The wildcard is single-quoted so the shell leaves it alone and gcloud performs the expansion.
      gcloud storage rm --recursive ~{"--billing-project " + billing_project_id} "${OUTPUT_GCS_DIR}/"'**/*.parquet'
    else
      # List the contents of the vet and ref_ranges directories for subsequent deletion in the loop below
      echo "Listing directories under ${OUTPUT_GCS_DIR}/vet/ and ${OUTPUT_GCS_DIR}/ref_ranges/ ${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/ for deletion..."
      gcloud storage ls ~{"--billing-project " + billing_project_id} \
        "${OUTPUT_GCS_DIR}/vet/" "${OUTPUT_GCS_DIR}/ref_ranges/" > parquet_dirs.txt
      # sample_chromosome_ploidy/ is not listed; the directory itself is appended and removed as a whole.
      echo "${OUTPUT_GCS_DIR}/sample_chromosome_ploidy/" >> parquet_dirs.txt

      # Iterate over all Google Cloud paths in parquet_dirs.txt and delete all objects therein
      echo "Deleting Parquet files..."
      while IFS= read -r gcs_path; do
        # Skip blank lines in the listing output.
        if [ -n "$gcs_path" ]; then
          echo "Deleting objects in: $gcs_path"
          gcloud storage rm ~{"--billing-project " + billing_project_id} "$gcs_path" --recursive
        fi
      done < parquet_dirs.txt
    fi

    echo "✓ Completed deletion of Parquet files."

  >>>
  output {
    Boolean done = true
  }

  runtime {
    docker: cloud_sdk_docker
    memory: "3 GB"
    disks: "local-disk 100 HDD"
    preemptible: 3
    cpu: 1
  }
}
Loading