Skip to content

Commit a572d76

Browse files
authored
Merge pull request #25 from MPI-EVA-Archaeogenetics/dev
1.6.0
2 parents 7033abf + 5b38564 commit a572d76

12 files changed

+468
-124
lines changed

CHANGELOG.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,43 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## [1.6.0] - 07/03/2025
7+
8+
### `Added`
9+
10+
- Processing of YC data. (Y + mtDNA capture (YMCA))
11+
- Processing of IM data. (Immunocapture)
12+
- `conf/Autorun.config`:
13+
- Use hard links when publishing results, instead of copying files.
14+
- Add YC profile for processing YMCA data.
15+
- Add IM profile for processing Immunocapture data.
16+
- `scripts/create_poseidon_release.sh`: New script to create large releases of the entire TF processed data in Poseidon format.
17+
- Now compatible with Pandora Site IDs longer than 3 letters.
18+
- The following scripts can now infer Site_ID of varied lengths from the Ind_ID (pyPandoraHelper):
19+
- `scripts/clear_results.sh`
20+
- `scripts/clear_work_dirs.sh`
21+
- `scripts/ethical_sample_scrub.sh`
22+
- `scripts/run_Eager.sh`
23+
- `scripts/update_poseidon_packages.sh`
24+
- The following scripts can now infer Site_ID of varied lengths from the Ind_ID (rPandoraHelper):
25+
- `scripts/prepare_eager_tsv.R`
26+
- `scripts/fill_in_janno.R`
27+
- Refactor how valid analysis types are determined in shell scripts, to make more easily extendable.
28+
- `scripts/prepare_eager_tsv.R`:
29+
- Now uses Main_Individual_ID instead of Full_Individual_ID as the Sample_Name when one is provided.
30+
- Now excludes sequencing entries with the `Exclude` flag set to `Yes`.
31+
- `scripts/create_processed_ind_list.sh`: Script to create a list of processed individuals across all analysis types, and a count of individuals in each analysis type.
32+
33+
### `Fixed`
34+
- `scripts/run_Eager.sh`: Java garbage collector now limited to one thread to avoid memory issues and hanging spawner jobs.
35+
36+
### `Dependencies`
37+
38+
- pyPandoraHelper=0.2.1
39+
- rPandoraHelper=0.2.0
40+
41+
### `Deprecated`
42+
643
## [1.5.0] - 30/09/2024
744

845
### `Added`
@@ -15,6 +52,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1552
- `scripts/ethical_sample_scrub.sh`: Add RP analysis type for ethical sample scrubbing.
1653
- `scripts/clear_work_dirs.sh`: Add RP analysis type for work directory clearing.
1754
- `scripts/clear_results.sh`: Add RP analysis type for results directory clearing.
55+
- `scripts/update_poseidon_packages.sh`: Bump version for new release.
56+
- `README.md`: Updated to list new state of the pipeline.
1857

1958
### `Fixed`
2059

conf/Autorun.config

Lines changed: 189 additions & 65 deletions
Large diffs are not rendered by default.

scripts/clear_results.sh

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,31 @@
11
#!/usr/bin/env bash
22

3-
## This script removes the results for an individiaul while maintaining the nextflow process cache for them.
3+
4+
## This script removes the results for an individual while maintaining the nextflow process cache for them.
45
## It is intended as a way to refresh the results directories of an individual. This can be useful either
56
## to remove older files after additional libraries appear and are therefore merged, or to remove results
67
## with misleading names in cases where Pandora entries get updated (e.g. protocol mixup leading to changes
78
## in strandedness for a library).
89

10+
## DEPENDENCY
11+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
12+
13+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
14+
15+
## Join array elements by separator given as $1
16+
function join_array_elements() {
17+
local IFS="$1"
18+
shift
19+
echo "$*"
20+
}
21+
922
## Helptext function
1023
function Helptext() {
11-
echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
12-
echo -ne "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.\n\n"
13-
echo -ne "Options:\n"
14-
echo -ne "-h, --help\t\tPrint this text and exit.\n"
15-
echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG, RP, RM.\n"
24+
errecho "\t usage: $0 [options] <ind_id_list>\n"
25+
errecho "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changed the merging of libraries, thus making the directory structure inconsistent.\n"
26+
errecho "Options:"
27+
errecho "-h, --help\t\tPrint this text and exit."
28+
errecho "-a, --analysis_type\t\tSet the analysis type. Options: $(join_array_elements , ${valid_analysis_types[@]})."
1629
}
1730

1831
## Print messages to stderr, optionally with colours
@@ -33,6 +46,8 @@ function errecho() {
3346
elif [[ ${1} == '-r' ]]; then
3447
colour="${Red}"
3548
shift 1
49+
else
50+
colour="${Normal}"
3651
fi
3752
echo -e ${colour}$*${Normal} 1>&2
3853
}
@@ -65,9 +80,11 @@ fi
6580
if [[ ${analysis_type} == '' ]]; then
6681
errecho "No --analysis_type was provided.\n"
6782
Helptext
68-
elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" && ${analysis_type} != "RM" ]]; then
69-
errecho "analysis_type must be SG, TF, RP, or RM. You provided: ${analysis_type}\n"
83+
exit 2
84+
elif [[ ! " ${valid_analysis_types[*]} " =~ " ${analysis_type} " ]]; then
85+
errecho "analysis_type must be one of: $(join_array_elements , ${valid_analysis_types[@]}). You provided: ${analysis_type}\n"
7086
Helptext
87+
exit 2
7188
fi
7289

7390
root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
@@ -79,7 +96,7 @@ input_iids=($(cat ${ind_id_list_fn}))
7996
## Both needed for caching.
8097
## Also leave '1240k.imputed' and 'GTL_output' alone.
8198
for ind_id in ${input_iids[@]}; do
82-
site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
99+
site_id=`${pandora_helper} -g site_id ${ind_id}` ## Site inferred by pyPandoraHelper
83100
dirs_to_delete=$(ls -1 -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/* | grep -vw -e 'work' -e '1240k.imputed' -e 'GTL_output' -e 'pipeline_info')
84101
for dir in ${dirs_to_delete}; do
85102
errecho "Deleting results in: ${dir}"

scripts/clear_work_dirs.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22

33
## This script accepts a list of individual IDs and clears the nextflow work directories for both SG and TF data processing of each ID.
44

5+
## DEPENDENCY
6+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
7+
8+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
9+
510
## Helptext function
611
function Helptext() {
712
echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
8-
echo -ne "This script clears the work directories of individuals in a specified individual ID list from both the SG and TF results directories.\n\n"
13+
echo -ne "This script clears the work directories of individuals in a specified individual ID list from all results directories.\n\n"
914
echo -ne "Options:\n"
1015
echo -ne "-h, --help\t\tPrint this text and exit.\n"
1116
}
@@ -40,9 +45,9 @@ root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should in
4045
input_iids=($(cat ${ind_id_list_fn}))
4146

4247
for ind_id in ${input_iids[@]}; do
43-
site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
48+
site_id=`${pandora_helper} -g site_id ${ind_id}` ## Site inferred by pyPandoraHelper
4449
errecho -ne "Clearing work directories for ${ind_id}..."
45-
for analysis_type in "SG" "TF" "RP" "RM"; do
50+
for analysis_type in ${valid_analysis_types[@]}; do
4651
if [[ -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work ]]; then
4752
errecho -ne " ${analysis_type}..."
4853
# ls -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work

scripts/create_poseidon_release.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env bash
2+
3+
VERSION="1.0.0"
4+
5+
## Colours for printing to terminal
6+
Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
7+
Red=$(tput sgr0)'\033[1;31m' ## Red normal face
8+
Normal=$(tput sgr0)
9+
10+
## Helptext function
11+
function Helptext() {
12+
echo -ne "\t usage: $0 [options] <release_name>\n\n"
13+
echo -ne "This creates a dated release of all poseidon packages.\n\n"
14+
echo -ne "Options:\n"
15+
echo -ne "-h, --help\t\tPrint this text and exit.\n"
16+
echo -ne "-v, --version \t\tPrint version and exit.\n"
17+
}
18+
19+
## Print messages to stderr
20+
function errecho() { echo -e $* 1>&2 ;}
21+
22+
23+
## Parse CLI args.
24+
TEMP=`getopt -q -o hv --long help,version -n 'create_poseidon_release.sh' -- "$@"`
25+
eval set -- "$TEMP"
26+
27+
## parameter defaults
28+
trident_path="/r1/people/srv_autoeager/bin/trident-1.5.7.0"
29+
## In the future, maybe multiple releases, for each data type?
30+
poseidon_pacakges="/mnt/archgen/Autorun_eager/poseidon_packages/TF/Sites/"
31+
release_dir="/mnt/archgen/Autorun_eager/poseidon_packages/releases/"
32+
33+
## Read in CLI arguments
34+
while true ; do
35+
case "$1" in
36+
-h|--help) Helptext; exit 0 ;;
37+
-v|--version) echo ${VERSION}; exit 0;;
38+
--) release_name="${2}"; break ;;
39+
*) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
40+
esac
41+
done
42+
43+
## All poseidon packages have the population name "Unknown". This can be used to make a mega release easily.
44+
## Once the large dataset is created, the population name can be changed to the site name.
45+
## TODO: a) Submit to scheduler, b) First forge each site, then forge across sites. That limits open file handles and speeds things up considerably.
46+
CMD="${trident_path} forge \
47+
-d ${poseidon_pacakges} \
48+
--forgeString Unknown \
49+
--outFormat EIGENSTRAT \
50+
--outPackagePath ${release_dir}/${release_name} \
51+
--outPackageName ${release_name} \
52+
--logMode SimpleLog"
53+
54+
errecho "${CMD}" | tr -s ' '
55+
${CMD} 2>&1 > ${release_dir}/${release_name}.creation_log
56+
57+
if [[ $? -ne 0 ]]; then
58+
errecho "${Red}Error${Normal}: Trident failed to create the release. Check the log file for more information."
59+
exit 1
60+
fi
61+
62+
## Update Group_Name column in ind file
63+
awk -F "\t" -v OFS="\t" '{if ($1 ~ /_ss$/) {$3 = substr($1, 1,length($1)-6)} else {$3 = substr($1, 1,length($1)-3)}; print $0}' ${release_dir}/${release_name}.ind > ${release_dir}/${release_name}.ind.tmp
64+
mv ${release_dir}/${release_name}.ind ${release_dir}/.${release_name}.ind.original
65+
mv ${release_dir}/${release_name}.ind.tmp ${release_dir}/${release_name}.ind
66+
67+
## Update Group_Name column in janno file
68+
## janno has a header line, so add NR==1; NR > 1 to only apply the transformation after the first line.
69+
awk -F "\t" -v OFS="\t" 'NR==1; NR > 1{if ($1 ~ /_ss$/) {$3 = substr($1, 1,length($1)-6)} else {$3 = substr($1, 1,length($1)-3)}; print $0}' ${release_dir}/${release_name}.janno > ${release_dir}/${release_name}.janno.tmp
70+
mv ${release_dir}/${release_name}.janno ${release_dir}/.${release_name}.janno.original
71+
mv ${release_dir}/${release_name}.janno.tmp ${release_dir}/${release_name}.janno
72+
73+
## Rectify the package to add checksums
74+
CMD="${trident_path} rectify \
75+
-d ${release_dir}/${release_name} \
76+
--packageVersion Minor \
77+
--logText 'Added checksums to package' \
78+
--checksumAll"
79+
80+
errecho "${CMD}" | tr -s ' '
81+
${CMD}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env bash
2+
3+
date=$(date +'%y%m%d_%H%M')
4+
5+
cd /mnt/archgen/Autorun_eager/
6+
7+
if [[ ! -d stats/${date} ]]; then
8+
mkdir stats/${date}/
9+
fi
10+
11+
find /mnt/archgen/Autorun_eager/eager_outputs -maxdepth 4 -mindepth 4 -path '*/*/*/multiqc' -type d | rev | cut -d "/" -f 2 | rev > stats/${date}/all_processed_inds_${date}.tsv
12+
13+
sort -u stats/${date}/all_processed_inds_${date}.tsv > stats/${date}/all_processed_inds_${date}_unique.txt
14+
15+
(for a in `ls eager_outputs/`; do echo -n "${a} individuals processed: "; find /mnt/archgen/Autorun_eager/eager_outputs/${a} -maxdepth 4 -mindepth 4 -path '*/*/*/multiqc/multiqc_report.html' -type f | wc -l ; done ; echo "Date: ${date}") > stats/${date}/n_processed_inds_${date}.txt

scripts/cron_daily_prepare.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,17 @@ find /mnt/archgen/Autorun/Results/Human_RM/2* -name '*.bam' -mtime -1 2>/dev/nul
3333
echo "Processing RM data from run: ${RUN}"
3434
scripts/prepare_eager_tsv.R -s $RUN -a RM -o eager_inputs/ -d .eva_credentials
3535
done
36+
37+
# Y + mtDNA capture (YMCA)
38+
# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'.
39+
find /mnt/archgen/Autorun/Results/Human_Y/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
40+
echo "Processing YC data from run: ${RUN}"
41+
scripts/prepare_eager_tsv.R -s $RUN -a YC -o eager_inputs/ -d .eva_credentials
42+
done
43+
44+
# Immunocapture
45+
# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'.
46+
find /mnt/archgen/Autorun/Results/Human_IM/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
47+
echo "Processing IM data from run: ${RUN}"
48+
scripts/prepare_eager_tsv.R -s $RUN -a IM -o eager_inputs/ -d .eva_credentials
49+
done

scripts/ethical_sample_scrub.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
#!/usr/bin/env bash
22

3+
## DEPENDENCY
4+
pandora_helper="/mnt/archgen/tools/helper_scripts/py_helpers/pyPandoraHelper/pyPandoraHelper.py"
5+
6+
valid_analysis_types=("TF" "SG" "RP" "RM" "IM" "YC")
7+
38
## Helptext function
49
function Helptext() {
510
echo -ne "\t usage: $0 [options] <sensitive_seqIds_list>\n\n"
@@ -63,12 +68,14 @@ else
6368

6469
## If the individuals were flagged as sensitive AFTER processing started, both the inputs and outputs should be made inaccessible.
6570
for raw_iid in ${scrub_me[@]}; do
66-
for analysis_type in "SG" "TF" "RP" "RM"; do
71+
for analysis_type in ${valid_analysis_types[@]}; do
6772
## EAGER_INPUTS
68-
site_id="${raw_iid:0:3}"
73+
site_id=`${pandora_helper} -g site_id ${raw_iid}` ## Site inferred by pyPandoraHelper
6974
eager_input_tsv="${root_input_dir}/${analysis_type}/${site_id}/${raw_iid}/${raw_iid}.tsv"
7075
## If the eager input exists, hide the entire directory and make it inaccessible
7176
if [[ -f ${eager_input_tsv} ]]; then
77+
errecho "Scrubbing ${raw_iid} from ${analysis_type}"
78+
errecho " ${raw_iid} ${eager_input_tsv}"
7279
old_name=$(dirname ${eager_input_tsv})
7380
new_name=$(dirname ${old_name})/.${raw_iid}
7481
mv -v ${old_name} ${new_name} ## Hide the input directory
@@ -78,6 +85,7 @@ else
7885
## EAGER_OUTPUTS
7986
eager_output_dir="${root_output_dir}/${analysis_type}/${site_id}/${raw_iid}/"
8087
if [[ -d ${eager_output_dir} ]]; then
88+
errecho "  ${raw_iid} ${eager_output_dir}"
8189
new_outdir_name=$(dirname ${eager_output_dir})/.${raw_iid}
8290
mv -v ${eager_output_dir} ${new_outdir_name} ## Hide the output directory
8391
chmod 0700 ${new_outdir_name} ## Restrict the directory contents

scripts/fill_in_janno.R

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ if (!require('poseidonR')) {
1818
remotes::install_github('poseidon-framework/poseidonR')
1919
# require(poseidonR)
2020
} else {require(poseidonR)}
21+
if (!require('rPandoraHelper')) {
22+
write("Installing required local package 'rPandoraHelper'...", file=stderr())
23+
install.packages("/mnt/archgen/tools/helper_scripts/r_helpers/rPandoraHelper/", repos = NULL, type = "source")
24+
# require(rPandoraHelper)
25+
} else {require(rPandoraHelper)}
2126

2227
## Parse arguments ----------------------------
2328
parser <- OptionParser()
@@ -73,8 +78,11 @@ if (args$output_fn == "") {
7378
input_janno_table <- eager2poseidon::standardise_janno(args$janno_fn)
7479

7580
## Create new column `Pandora_ID` that removes the ss_suffix (if present) from the Poseidon ID to infer the Pandora_ID of the individual.
81+
## Uses rPandoraHelper::get_ind_id to infer the Pandora ID of the individual.
7682
sample_ids <- dplyr::select(input_janno_table, Poseidon_ID) %>%
77-
dplyr::mutate(Pandora_ID=sub(paste0(args$ss_suffix,"$"), '', .data$Poseidon_ID))
83+
rowwise() %>%
84+
dplyr::mutate(Pandora_ID=rPandoraHelper::get_ind_id(Poseidon_ID, keep_ss_suffix=F )) %>%
85+
ungroup()
7886

7987
##################
8088
## Pandora info ##
@@ -86,13 +94,16 @@ pandora_results <- eager2poseidon::import_pandora_data(sample_ids %>% dplyr::sel
8694
## drop Pandora_ID column. not needed anymore
8795
dplyr::select(-Pandora_ID)
8896

97+
## Use rPandoraHelper to infer Pandora IDs from input ind_id
98+
pandora_site_id <- rPandoraHelper::get_site_id(args$ind_id, keep_ss_suffix=F)
99+
pandora_ind_id <- rPandoraHelper::get_ind_id(args$ind_id, keep_ss_suffix=F)
89100
## Infer locations of different JSONs to read results in with eagerR. (More flexible than e2p and can pull results from SG runs if present)
90101
base_dir <- "/mnt/archgen/Autorun_eager"
91102
# base_dir <- "/Users/lamnidis/mount"
92-
eager_tsv_fn <- paste0(base_dir, "/eager_inputs/TF/", substr(args$ind_id,0,3), "/", args$ind_id,"/", args$ind_id, ".tsv")
93-
eager_tf_results_dir <- paste0(base_dir, "/eager_outputs/TF/", substr(args$ind_id,0,3), "/", args$ind_id,"/")
94-
eager_sg_endorspy_dir <- paste0(base_dir, "/eager_outputs/SG/", substr(args$ind_id,0,3), "/", args$ind_id,"/endorspy/")
95-
eager_sg_damageprofiler_dir <- paste0(base_dir, "/eager_outputs/SG/", substr(args$ind_id,0,3), "/", args$ind_id,"/damageprofiler/")
103+
eager_tsv_fn <- paste0(base_dir, "/eager_inputs/TF/", pandora_site_id, "/", pandora_ind_id,"/", pandora_ind_id, ".tsv")
104+
eager_tf_results_dir <- paste0(base_dir, "/eager_outputs/TF/", pandora_site_id, "/", pandora_ind_id,"/")
105+
eager_sg_endorspy_dir <- paste0(base_dir, "/eager_outputs/SG/", pandora_site_id, "/", pandora_ind_id,"/endorspy/")
106+
eager_sg_damageprofiler_dir <- paste0(base_dir, "/eager_outputs/SG/", pandora_site_id, "/", pandora_ind_id,"/damageprofiler/")
96107

97108
##############
98109
## TSV info ##
@@ -194,6 +205,7 @@ updated_columns <- eager2poseidon::compile_eager_result_tables(
194205
"Capture_Type"
195206
))) %>%
196207
## Remove ss_suffix from library names, so they match Pandora Library IDs
208+
## NOTE: Should this be changed to use rPandoraHelper? Would require some tweaking to work with list columns, as it currently expects a single ID.
197209
dplyr::mutate(
198210
Library_Names=gsub('_ss','',.data$Library_Names) %>% vctrs::vec_unique()
199211
) %>%

0 commit comments

Comments
 (0)