1.2.0 (#18)

TCLamnidis · web-flow · commit ed588a739bae · 2023-03-21T15:21:11.000+01:00
* prep changelog

* Fix PR template

* WIP parallelise eager job submission

* Correct syntax error

* No printing to screen for arrays. fix whitespace

* fix number of jobs

* make executable

* Add array qsub command

* update .gitignore

* print qsub command before submission

* Fix job naming

* Initial commit of poseidon package creation

* Rscript to fill in janno and overwrite columns

* Bugfixes

* Add suffix option and correct output janno path

* move script

* Add janno recreation. Other minor changes

* Add pandora results to janno

* Add log info. New package creation completed.

* Minor changes. Add Library_Names column

* Add script to mirror Population and Sex from janno to fam/ind

* Update CHANGELOG.md

* Update package updating.

* Add debug option. Add AE version in poseidon pkgs

* Remove debug cause of clash. Error when update fails.

* Update CHANGELOG.md

* Only delete temp files if validation passed.

* Bugfix. Runs are now updated only if a change in the data occurs.

* move update script to scripts/

* Server-side testing paths

* Add path to trident executable

* server-paths

* Bump version

* Add environment yml file

* Update CHANGELOG.md

* Update output folder to live

* increase resources for AE_spawner jobs

* More resource tweaking for array jobs

* Increase memory further

* Remove path from environment yml

* Bump version

* Match Run_ID, not Batch_ID

* Array log subdir

* Update CHANGELOG.md

* prep CHANGELOG.md

* 40G memory max for array job

* indentation fix

* correct column naming

* document changes

* correct Nr_libs in column selection

* correct paths

* Update .gitignore

* Optimisation. Version bump. Distinct iids used for joining.

* Bump version

* Update CHANGELOG.md

* bump version

* Add mention of memory changes

* Prep Changelog

* Apply stash

* non-local paths

* Unique lib names

* Keep both lowercase and uppercase version of pandora analysis IDs

* Add whitelist option to update only specific TSVs

* pre-release version bump.

* document whitelist option

* fix formatting
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,4 @@ test_data/
 .tmp/
 eager_inputs_old/
 eager_outputs_old/
-array_Logs/
+array_Logs/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,19 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.0] - 21/03/2023
+
+### `Added`
+ - `prepare_eager_tsv.R`: Added `-w/--whitelist` option. A whitelist of Pandora Individual IDs can be provided. Only the TSVs of individuals in the whitelist will be updated.
+
+### `Fixed`
+ - `update_poseidon_packages.sh`: `Library_Names` field now includes only unique library names.
+ - `prepare_eager_tsv.R`: Camel_Case versions of Pandora Analysis IDs are no longer filtered out.
+
+### `Dependencies`
+
+### `Deprecated`
+
 ## [1.1.3] - 17/03/2023
 
 ### `Added`
diff --git a/README.md b/README.md
@@ -67,6 +67,10 @@ Options:
             Some tools used in nf-core/eager will strip everything after the first dot (.)
             from the name of the input file, which can cause naming conflicts in rare cases.
 
+    -w WHITELIST, --whitelist=WHITELIST
+            An optional file that includes the IDs of whitelisted individuals,
+                    one per line. Only the TSVs for these individuals will be updated.
+
     -o OUTDIR/, --outDir=OUTDIR/
         The desired output directory. Within this directory, one subdirectory will be 
             created per analysis type, within that one subdirectory per individual ID,
diff --git a/scripts/fill_in_janno.R b/scripts/fill_in_janno.R
@@ -195,7 +195,7 @@ updated_columns <- eager2poseidon::compile_eager_result_tables(
   ))) %>%
   ## Remove ss_suffix from library names, so they match Pandora Library IDs
   dplyr::mutate(
-    Library_Names=gsub('_ss','',.data$Library_Names)
+    Library_Names=gsub('_ss','',.data$Library_Names) %>% vctrs::vec_unique()
   ) %>%
   ## Keep distinct rows, now that Library_ID has been dropped
   dplyr::distinct() %>%
diff --git a/scripts/prepare_eager_tsv.R b/scripts/prepare_eager_tsv.R
@@ -46,20 +46,20 @@ save_ind_tsv <- function(data, rename, output_dir, ...) {
   data %>% select(-individual.Full_Individual_Id) %>%  readr::write_tsv(file=paste0(ind_dir,"/",ind_id,".tsv")) ## Output structure can be changed here.
 
   ## Print Autorun_eager version to file
-  AE_version <- "1.1.3"
+  AE_version <- "1.2.0"
   cat(AE_version, file=paste0(ind_dir,"/autorun_eager_version.txt"), fill=T, append = F)
 }
 
 ## Correspondance between '-a' analysis type and the name of Kay's pipeline.
 ##    Only bams from the output autorun_name will be included in the output
-autorun_name_from_analysis_type <- function(analysis_type) {
-  autorun_name <- case_when(
-    analysis_type == "TF" ~ "HUMAN_1240K",
-    analysis_type == "SG" ~ "HUMAN_SHOTGUN",
+autorun_names_from_analysis_type <- function(analysis_type) {
+  autorun_names <- case_when(
+    analysis_type == "TF" ~ c( "HUMAN_1240K", "Human_1240k" ), 
+    analysis_type == "SG" ~ c( "HUMAN_SHOTGUN", "Human_Shotgun" ),
     ## Future analyses can be added here to pull those bams for eager processsing.
     TRUE ~ NA_character_
   )
-  return(autorun_name)
+  return(autorun_names)
 }
 
 ## MAIN ##
@@ -81,6 +81,11 @@ parser <- add_option(parser, c("-r", "--rename"), type = 'logical',
 			Some tools used in nf-core/eager will strip everything after the first dot (.)
 			from the name of the input file, which can cause naming conflicts in rare cases."
                     )
+parser <- add_option(parser, c("-w", "--whitelist"), type = 'character',
+                    action = 'store', dest = 'whitelist_fn', default=NA_character_,
+                    help = "An optional file that includes the IDs of whitelisted individuals,
+			one per line. Only the TSVs for these individuals will be updated."
+                    )
 parser <- add_option(parser, c("-o", "--outDir"), type = 'character',
                     action = "store", dest = "outdir",
                     help= "The desired output directory. Within this directory, one subdirectory will be 
@@ -99,6 +104,7 @@ opts <- arguments$options
 cred_file <- arguments$args
 sequencing_batch_id <- opts$sequencing_batch_id
 analysis_type <- opts$analysis_type
+whitelist_fn <- opts$whitelist_fn
 
 if (is.na(analysis_type)) {
   stop(call.=F, "\n[prepare_eager_tsv.R] error: No analysis type provided with -a. Please see --help for more information.\n")
@@ -128,9 +134,9 @@ tibble_input_iids <- complete_pandora_table %>% filter(sequencing.Run_Id == sequ
 
 ## Pull information from pandora, keeping only matching IIDs and requested Sequencing types.
 results <- inner_join(complete_pandora_table, tibble_input_iids, by=c("individual.Full_Individual_Id"="individual.Full_Individual_Id")) %>%
-  filter(grepl(paste0("\\.", analysis_type), sequencing.Full_Sequencing_Id), analysis.Analysis_Id == autorun_name_from_analysis_type(analysis_type)) %>%
+  filter(grepl(paste0("\\.", analysis_type), sequencing.Full_Sequencing_Id), analysis.Analysis_Id %in% autorun_names_from_analysis_type(analysis_type)) %>%
   select(individual.Full_Individual_Id,individual.Organism,library.Full_Library_Id,library.Protocol,analysis.Result_Directory,sequencing.Sequencing_Id,sequencing.Full_Sequencing_Id,sequencing.Single_Stranded) %>%
-  distinct() %>% ## Need distinct() call because of hoe analysis tab is read in, which created one copy of each row per analysis field.
+  distinct() %>% ## Need distinct() call because of how analysis tab is read in, which created one copy of each row per analysis field.
   group_by(individual.Full_Individual_Id) %>%
   filter(!is.na(analysis.Result_Directory)) %>% ## Exclude individuals with no results directory (seem to mostly be controls)
   mutate(
@@ -183,5 +189,13 @@ results <- inner_join(complete_pandora_table, tibble_input_iids, by=c("individua
 ## Save results into single file for debugging
 if ( opts$debug ) { write_tsv(results, file=paste0(sequencing_batch_id, ".", analysis_type, ".results.txt")) }
 
+## Read in the whitelist if any, and filter the results table
+if (! is.na(whitelist_fn) ){
+  whitelist <- read_tsv(whitelist_fn, col_types='c', col_names='Pandora_ID')
+  
+  results <- results %>% filter(individual.Full_Individual_Id %in% whitelist$Pandora_ID)
+  # write_tsv(results, file=paste0(sequencing_batch_id, ".", analysis_type, ".whitelist.results.txt"))
+}
+
 ## Group by individual IDs and save each chunk as TSV
 results %>% group_by(individual.Full_Individual_Id) %>% group_walk(~save_ind_tsv(., rename=F, output_dir=output_dir), .keep=T)
diff --git a/scripts/update_poseidon_package.sh b/scripts/update_poseidon_package.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-VERSION="1.1.3"
+VERSION="1.2.0"
 
 ## Colours for printing to terminal
 Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
@@ -43,7 +43,7 @@ done
 
 autorun_root_dir='/mnt/archgen/Autorun_eager/'
 root_input_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
-root_output_dir='/mnt/archgen/Autorun_eager/poseidon_packages' ## Directory that includes data type, site ID and ind ID subdirs.
+root_output_dir='/mnt/archgen/Autorun_eager/dev/poseidon_packages' ## Directory that includes data type, site ID and ind ID subdirs.
 input_dir="${root_input_dir}/TF/${ind_id:0:3}/${ind_id}/genotyping/"
 output_dir="${root_output_dir}/TF/${ind_id:0:3}/${ind_id}/"
 cred_file="${autorun_root_dir}/.eva_credentials"