Skip to content

Commit a572d76

Browse files
authored
Merge pull request #25 from MPI-EVA-Archaeogenetics/dev
1.6.0
2 parents 7033abf + 5b38564 commit a572d76

12 files changed

+468
-124
lines changed

CHANGELOG.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,43 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## [1.6.0] - 07/03/2025
7+
8+
### `Added`
9+
10+
- Processing of YC data. (Y + mtDNA capture (YMCA))
11+
- Processing of IM data. (Immunocapture)
12+
- `conf/Autorun.config`:
13+
- Use hard links when publishing results, instead of copying files.
14+
- Add YC profile for processing YMCA data.
15+
- Add IM profile for processing Immunocapture data.
16+
- `scripts/create_poseidon_release.sh`: New script to create large releases of the entire TF processed data in Poseidon format.
17+
- Now compatible with Pandora Site IDs longer than 3 letters.
18+
- The following scripts can now infer Site_ID of varied lengths from the Ind_ID (pyPandoraHelper):
19+
- `scripts/clear_results.sh`
20+
- `scripts/clear_work_dirs.sh`
21+
- `scripts/ethical_sample_scrub.sh`
22+
- `scripts/run_Eager.sh`
23+
- `scripts/update_poseidon_packages.sh`
24+
- The following scripts can now infer Site_ID of varied lengths from the Ind_ID (rPandoraHelper):
25+
- `scripts/prepare_eager_tsv.R`
26+
- `scripts/fill_in_janno.R`
27+
- Refactor how valid analysis types are determined in shell scripts, to make more easily extendable.
28+
- `scripts/prepare_eager_tsv.R`:
29+
- Now uses Main_Individual_ID instead of Full_Individual_ID as the Sample_Name when one is provided.
30+
- Now excludes sequencing entries with the `Exclude` flag set to `Yes`.
31+
- `scripts/create_processed_ind_list.sh`: Script to create a list of processed individuals across all analysis types, and a count of individuals in each analysis type.
32+
33+
### `Fixed`
34+
- `scripts/run_Eager.sh`: Java garbage collector now limited to one thread to avoid memory issues and hanging spawner jobs.
35+
36+
### `Dependencies`
37+
38+
- pyPandoraHelper=0.2.1
39+
- rPandoraHelper=0.2.0
40+
41+
### `Deprecated`
42+
643
## [1.5.0] - 30/09/2024
744

845
### `Added`
@@ -15,6 +52,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1552
- `scripts/ethical_sample_scrub.sh`: Add RP analysis type for ethical sample scrubbing.
1653
- `scripts/clear_work_dirs.sh`: Add RP analysis type for work directory clearing.
1754
- `scripts/clear_results.sh`: Add RP analysis type for results directory clearing.
55+
- `scripts/update_poseidon_packages.sh`: Bump version for new release.
56+
- `README.md`: Updated to list new state of the pipeline.
1857

1958
### `Fixed`
2059

conf/Autorun.config

Lines changed: 189 additions & 65 deletions
Large diffs are not rendered by default.

scripts/clear_results.sh

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,31 @@
11
#!/usr/bin/env bash
22

3-
## This script removes the results for an individiaul while maintaining the nextflow process cache for them.
3+
4+
## This script removes the results for an individual while maintaining the nextflow process cache for them.
45
## It is intended as a way to refresh the results directories of an individual. This can be useful either
56
## to remove older files after additional libraries appear and are therefore merged, or to remove results
67
## with misleading names in cases where Pandora entries get updated (e.g. protocol mixup leading to changes
78
## in strandedness for a library).
89

10+
## DEPENDENCY
11+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
12+
13+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
14+
15+
## Join array elements by separator given as $1
16+
function join_array_elements() {
17+
local IFS="$1"
18+
shift
19+
echo "$*"
20+
}
21+
922
## Helptext function
1023
function Helptext() {
11-
echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
12-
echo -ne "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.\n\n"
13-
echo -ne "Options:\n"
14-
echo -ne "-h, --help\t\tPrint this text and exit.\n"
15-
echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG, RP, RM.\n"
24+
errecho "\t usage: $0 [options] <ind_id_list>\n"
25+
errecho "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changed the merging of libraries, thus making the directory structure inconsistent.\n"
26+
errecho "Options:"
27+
errecho "-h, --help\t\tPrint this text and exit."
28+
errecho "-a, --analysis_type\t\tSet the analysis type. Options: $(join_array_elements , ${valid_analysis_types[@]})."
1629
}
1730

1831
## Print messages to stderr, optionally with colours
@@ -33,6 +46,8 @@ function errecho() {
3346
elif [[ ${1} == '-r' ]]; then
3447
colour="${Red}"
3548
shift 1
49+
else
50+
colour="${Normal}"
3651
fi
3752
echo -e ${colour}$*${Normal} 1>&2
3853
}
@@ -65,9 +80,11 @@ fi
6580
if [[ ${analysis_type} == '' ]]; then
6681
errecho "No --analysis_type was provided.\n"
6782
Helptext
68-
elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" && ${analysis_type} != "RM" ]]; then
69-
errecho "analysis_type must be SG, TF, RP, or RM. You provided: ${analysis_type}\n"
83+
exit 2
84+
elif [[ ! " ${valid_analysis_types[*]} " =~ " ${analysis_type} " ]]; then
85+
errecho "analysis_type must be one of: $(join_array_elements , ${valid_analysis_types[@]}). You provided: ${analysis_type}\n"
7086
Helptext
87+
exit 2
7188
fi
7289

7390
root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
@@ -79,7 +96,7 @@ input_iids=($(cat ${ind_id_list_fn}))
7996
## Both needed for caching.
8097
## Also leave '1240k.imputed' and 'GTL_output' alone.
8198
for ind_id in ${input_iids[@]}; do
82-
site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
99+
site_id=`${pandora_helper} -g site_id ${ind_id}` ## Site inferred by pyPandoraHelper
83100
dirs_to_delete=$(ls -1 -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/* | grep -vw -e 'work' -e '1240k.imputed' -e 'GTL_output' -e 'pipeline_info')
84101
for dir in ${dirs_to_delete}; do
85102
errecho "Deleting results in: ${dir}"

scripts/clear_work_dirs.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22

33
## This script accepts a list of individual IDs and clears the nextflow work directories for both SG and TF data processing of each ID.
44

5+
## DEPENDENCY
6+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
7+
8+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
9+
510
## Helptext function
611
function Helptext() {
712
echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
8-
echo -ne "This script clears the work directories of individuals in a specified individual ID list from both the SG and TF results directories.\n\n"
13+
echo -ne "This script clears the work directories of individuals in a specified individual ID list from all results directories.\n\n"
914
echo -ne "Options:\n"
1015
echo -ne "-h, --help\t\tPrint this text and exit.\n"
1116
}
@@ -40,9 +45,9 @@ root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should in
4045
input_iids=($(cat ${ind_id_list_fn}))
4146

4247
for ind_id in ${input_iids[@]}; do
43-
site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
48+
site_id=`${pandora_helper} -g site_id ${ind_id}` ## Site inferred by pyPandoraHelper
4449
errecho -ne "Clearing work directories for ${ind_id}..."
45-
for analysis_type in "SG" "TF" "RP" "RM"; do
50+
for analysis_type in ${valid_analysis_types[@]}; do
4651
if [[ -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work ]]; then
4752
errecho -ne " ${analysis_type}..."
4853
# ls -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work

scripts/create_poseidon_release.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env bash
2+
3+
VERSION="1.0.0"
4+
5+
## Colours for printing to terminal
6+
Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
7+
Red=$(tput sgr0)'\033[1;31m' ## Red normal face
8+
Normal=$(tput sgr0)
9+
10+
## Helptext function
11+
function Helptext() {
12+
echo -ne "\t usage: $0 [options] <release_name>\n\n"
13+
echo -ne "This creates a dated release of all poseidon packages.\n\n"
14+
echo -ne "Options:\n"
15+
echo -ne "-h, --help\t\tPrint this text and exit.\n"
16+
echo -ne "-v, --version \t\tPrint version and exit.\n"
17+
}
18+
19+
## Print messages to stderr
20+
function errecho() { echo -e $* 1>&2 ;}
21+
22+
23+
## Parse CLI args.
24+
TEMP=`getopt -q -o hv --long help,version -n 'create_poseidon_release.sh' -- "$@"`
25+
eval set -- "$TEMP"
26+
27+
## parameter defaults
28+
trident_path="/r1/people/srv_autoeager/bin/trident-1.5.7.0"
29+
## In the future, maybe multiple releases, for each data type?
30+
poseidon_pacakges="/mnt/archgen/Autorun_eager/poseidon_packages/TF/Sites/"
31+
release_dir="/mnt/archgen/Autorun_eager/poseidon_packages/releases/"
32+
33+
## Read in CLI arguments
34+
while true ; do
35+
case "$1" in
36+
-h|--help) Helptext; exit 0 ;;
37+
-v|--version) echo ${VERSION}; exit 0;;
38+
--) release_name="${2}"; break ;;
39+
*) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
40+
esac
41+
done
42+
43+
## All poseidon packages have the population name "Unknown". This can be used to make a mega release easily.
44+
## Once the large dataset is created, the population name can be changed to the site name.
45+
## TODO: a) Submit to scheduler, b) First forge each site, then forge across sites. That limits open file handles and speeds things up considerably.
46+
CMD="${trident_path} forge \
47+
-d ${poseidon_pacakges} \
48+
--forgeString Unknown \
49+
--outFormat EIGENSTRAT \
50+
--outPackagePath ${release_dir}/${release_name} \
51+
--outPackageName ${release_name} \
52+
--logMode SimpleLog"
53+
54+
errecho "${CMD}" | tr -s ' '
55+
${CMD} 2>&1 > ${release_dir}/${release_name}.creation_log
56+
57+
if [[ $? -ne 0 ]]; then
58+
errecho "${Red}Error${Normal}: Trident failed to create the release. Check the log file for more information."
59+
exit 1
60+
fi
61+
62+
## Update Group_Name column in ind file
63+
awk -F "\t" -v OFS="\t" '{if ($1 ~ /_ss$/) {$3 = substr($1, 1,length($1)-6)} else {$3 = substr($1, 1,length($1)-3)}; print $0}' ${release_dir}/${release_name}.ind > ${release_dir}/${release_name}.ind.tmp
64+
mv ${release_dir}/${release_name}.ind ${release_dir}/.${release_name}.ind.original
65+
mv ${release_dir}/${release_name}.ind.tmp ${release_dir}/${release_name}.ind
66+
67+
## Update Group_Name column in janno file
68+
## janno has a header line, so add NR==1; NR > 1 to only apply the transformation after the first line.
69+
awk -F "\t" -v OFS="\t" 'NR==1; NR > 1{if ($1 ~ /_ss$/) {$3 = substr($1, 1,length($1)-6)} else {$3 = substr($1, 1,length($1)-3)}; print $0}' ${release_dir}/${release_name}.janno > ${release_dir}/${release_name}.janno.tmp
70+
mv ${release_dir}/${release_name}.janno ${release_dir}/.${release_name}.janno.original
71+
mv ${release_dir}/${release_name}.janno.tmp ${release_dir}/${release_name}.janno
72+
73+
## Rectify the package to add checksums
74+
CMD="${trident_path} rectify \
75+
-d ${release_dir}/${release_name} \
76+
--packageVersion Minor \
77+
--logText 'Added checksums to package' \
78+
--checksumAll"
79+
80+
errecho "${CMD}" | tr -s ' '
81+
${CMD}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env bash
2+
3+
date=$(date +'%y%m%d_%H%M')
4+
5+
cd /mnt/archgen/Autorun_eager/
6+
7+
if [[ ! -d stats/${date} ]]; then
8+
mkdir stats/${date}/
9+
fi
10+
11+
find /mnt/archgen/Autorun_eager/eager_outputs -maxdepth 4 -mindepth 4 -path '*/*/*/multiqc' -type d | rev | cut -d "/" -f 2 | rev > stats/${date}/all_processed_inds_${date}.tsv
12+
13+
sort -u stats/${date}/all_processed_inds_${date}.tsv > stats/${date}/all_processed_inds_${date}_unique.txt
14+
15+
(for a in `ls eager_outputs/`; do echo -n "${a} individuals processed: "; find /mnt/archgen/Autorun_eager/eager_outputs/${a} -maxdepth 4 -mindepth 4 -path '*/*/*/multiqc/multiqc_report.html' -type f | wc -l ; done ; echo "Date: ${date}") > stats/${date}/n_processed_inds_${date}.txt

scripts/cron_daily_prepare.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,17 @@ find /mnt/archgen/Autorun/Results/Human_RM/2* -name '*.bam' -mtime -1 2>/dev/nul
3333
echo "Processing RM data from run: ${RUN}"
3434
scripts/prepare_eager_tsv.R -s $RUN -a RM -o eager_inputs/ -d .eva_credentials
3535
done
36+
37+
# Y + mtDNA capture (YMCA)
38+
# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'.
39+
find /mnt/archgen/Autorun/Results/Human_Y/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
40+
echo "Processing YC data from run: ${RUN}"
41+
scripts/prepare_eager_tsv.R -s $RUN -a YC -o eager_inputs/ -d .eva_credentials
42+
done
43+
44+
# Immunocapture
45+
# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'.
46+
find /mnt/archgen/Autorun/Results/Human_IM/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
47+
echo "Processing IM data from run: ${RUN}"
48+
scripts/prepare_eager_tsv.R -s $RUN -a IM -o eager_inputs/ -d .eva_credentials
49+
done

scripts/ethical_sample_scrub.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
#!/usr/bin/env bash
22

3+
## DEPENDENCY
4+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
5+
6+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
7+
38
## Helptext function
49
function Helptext() {
510
echo -ne "\t usage: $0 [options] <sensitive_seqIds_list>\n\n"
@@ -63,12 +68,14 @@ else
6368

6469
## If the individuals were flagged as sensitive AFTER processing started, both the inputs and outputs should be made inaccessible.
6570
for raw_iid in ${scrub_me[@]}; do
66-
for analysis_type in "SG" "TF" "RP" "RM"; do
71+
for analysis_type in ${valid_analysis_types[@]}; do
6772
## EAGER_INPUTS
68-
site_id="${raw_iid:0:3}"
73+
site_id=`${pandora_helper} -g site_id ${raw_iid}` ## Site inferred by pyPandoraHelper
6974
eager_input_tsv="${root_input_dir}/${analysis_type}/${site_id}/${raw_iid}/${raw_iid}.tsv"
7075
## If the eager input exists, hide the entire directory and make it inaccessible
7176
if [[ -f ${eager_input_tsv} ]]; then
77+
errecho "Scrubbing ${raw_iid} from ${analysis_type}"
78+
errecho " ${raw_iid} ${eager_input_tsv}"
7279
old_name=$(dirname ${eager_input_tsv})
7380
new_name=$(dirname ${old_name})/.${raw_iid}
7481
mv -v ${old_name} ${new_name} ## Hide the input directory
@@ -78,6 +85,7 @@ else
7885
## EAGER_OUTPUTS
7986
eager_output_dir="${root_output_dir}/${analysis_type}/${site_id}/${raw_iid}/"
8087
if [[ -d ${eager_output_dir} ]]; then
88+
errecho "  ${raw_iid} ${eager_output_dir}"
8189
new_outdir_name=$(dirname ${eager_output_dir})/.${raw_iid}
8290
mv -v ${eager_output_dir} ${new_outdir_name} ## Hide the output directory
8391
chmod 0700 ${new_outdir_name} ## Restrict the directory contents

scripts/fill_in_janno.R

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ if (!require('poseidonR')) {
1818
remotes::install_github('poseidon-framework/poseidonR')
1919
# require(poseidonR)
2020
} else {require(poseidonR)}
21+
if (!require('rPandoraHelper')) {
22+
write("Installing required local package 'rPandoraHelper'...", file=stderr())
23+
install.packages("/mnt/archgen/tools/helper_scripts/r_helpers/rPandoraHelper/", repos = NULL, type = "source")
24+
# require(rPandoraHelper)
25+
} else {require(rPandoraHelper)}
2126

2227
## Parse arguments ----------------------------
2328
parser <- OptionParser()
@@ -73,8 +78,11 @@ if (args$output_fn == "") {
7378
input_janno_table <- eager2poseidon::standardise_janno(args$janno_fn)
7479

7580
## Create new column `Pandora_ID` that removes the ss_suffix (if present) from the Poseidon ID to infer the Pandora_ID of the individual.
81+
## Uses rPandoraHelper::get_ind_id to infer the Pandora ID of the individual.
7682
sample_ids <- dplyr::select(input_janno_table, Poseidon_ID) %>%
77-
dplyr::mutate(Pandora_ID=sub(paste0(args$ss_suffix,"$"), '', .data$Poseidon_ID))
83+
rowwise() %>%
84+
dplyr::mutate(Pandora_ID=rPandoraHelper::get_ind_id(Poseidon_ID, keep_ss_suffix=F )) %>%
85+
ungroup()
7886

7987
##################
8088
## Pandora info ##
@@ -86,13 +94,16 @@ pandora_results <- eager2poseidon::import_pandora_data(sample_ids %>% dplyr::sel
8694
## drop Pandora_ID column. not needed anymore
8795
dplyr::select(-Pandora_ID)
8896

97+
## Use rPandoraHelper to infer Pandora IDs from input ind_id
98+
pandora_site_id <- rPandoraHelper::get_site_id(args$ind_id, keep_ss_suffix=F)
99+
pandora_ind_id <- rPandoraHelper::get_ind_id(args$ind_id, keep_ss_suffix=F)
89100
## Infer locations of different JSONs to read results in with eagerR. (More flexible than e2p and can pull results from SG runs if present)
90101
base_dir <- "/mnt/archgen/Autorun_eager"
91102
# base_dir <- "/Users/lamnidis/mount"
92-
eager_tsv_fn <- paste0(base_dir, "/eager_inputs/TF/", substr(args$ind_id,0,3), "/", args$ind_id,"/", args$ind_id, ".tsv")
93-
eager_tf_results_dir <- paste0(base_dir, "/eager_outputs/TF/", substr(args$ind_id,0,3), "/", args$ind_id,"/")
94-
eager_sg_endorspy_dir <- paste0(base_dir, "/eager_outputs/SG/", substr(args$ind_id,0,3), "/", args$ind_id,"/endorspy/")
95-
eager_sg_damageprofiler_dir <- paste0(base_dir, "/eager_outputs/SG/", substr(args$ind_id,0,3), "/", args$ind_id,"/damageprofiler/")
103+
eager_tsv_fn <- paste0(base_dir, "/eager_inputs/TF/", pandora_site_id, "/", pandora_ind_id,"/", pandora_ind_id, ".tsv")
104+
eager_tf_results_dir <- paste0(base_dir, "/eager_outputs/TF/", pandora_site_id, "/", pandora_ind_id,"/")
105+
eager_sg_endorspy_dir <- paste0(base_dir, "/eager_outputs/SG/", pandora_site_id, "/", pandora_ind_id,"/endorspy/")
106+
eager_sg_damageprofiler_dir <- paste0(base_dir, "/eager_outputs/SG/", pandora_site_id, "/", pandora_ind_id,"/damageprofiler/")
96107

97108
##############
98109
## TSV info ##
@@ -194,6 +205,7 @@ updated_columns <- eager2poseidon::compile_eager_result_tables(
194205
"Capture_Type"
195206
))) %>%
196207
## Remove ss_suffix from library names, so they match Pandora Library IDs
208+
## NOTE: Should this be changed to use rPandoraHelper? Would require some tweaking to work with list columns, as it currently expects a single ID.
197209
dplyr::mutate(
198210
Library_Names=gsub('_ss','',.data$Library_Names) %>% vctrs::vec_unique()
199211
) %>%

0 commit comments

Comments
 (0)