// brioche/params.config (plantinformatics/brioche)

// User-editable parameters for a brioche run, grouped by the section banners below.
// NOTE(review): the quoting and casing of values is inconsistent across keys
// ('TRUE', 'FALSE', 'Yes', 'No', 'yes', true, false, '20000' vs 100). The values are left
// untouched because the code that reads them is not visible from this file - confirm
// how each key is compared before normalising any of them.
params {
    //============================Required parameters=======================================================================
    mode              = 'prod' //Set mode to 'prod' to run on a new dataset; otherwise brioche will run on the test data
    notifications     =  true //Enable or disable email notifications; set to false to disable
    emailaddress      = "" //Replace with your e-mail address to receive notifications
    resultsdir        = "" //Path where brioche results are saved; default is the launch-directory/brioche-results
    genomefasta       = "${projectDir}/Data/CDCFrontier_GA_v10_Chrom_3_7.fa" //Replace with absolute path to the fasta file for your assembly
    restrict2chrom    =  "" //Restrict search to this chromosome
    chromstoexclude   = "" //List of chromosomes to exclude, separated by commas - e.g. "chr1,chr2,chrun"
    genomename        = 'CDCFrontier_GA_v10' //Name of genome that is used in naming output files
    probename         = 'AVRGRDC_Pulses_v1' //Name of Chip/Marker set used in the naming of the output file
    istarget3primeend = 'TRUE' //Is the target at the 3' end of the probe (only relevant for SNP chip data): 'TRUE' or 'FALSE'
    targetdesign      = "${projectDir}/Data/AVRGRDC_Pulses_v1_20006795X370754_A2_Chickpea-target-new-format.tsv" //csv or tsv table with ID,ProbeSequence,Target bp position, Target
    markercharacter   = 'D' //Character used to replace target marker in probe sequence
    buildblastdbonly  = false //Set to true if all you want to do is build a BLAST database
    usetargetchrom    = 'No' // Set to either Yes or No. Set to Yes to use a list of known target chromosomes to give preference to markers which fall on certain chromosomes. When set to Yes, will use data from chromchrommatch and markertargetsites to assist with optimal marker location identification
    usesharedmarkersmap = 'No' // Set to either Yes or No. Set to Yes to use a list of known closely located markers per marker (similarmarkersmap) to identify whether target markers are aligned properly (target must be on the same chromosome as X fraction of the close markers given)
    useldedgemap     = 'Yes' // Set to either Yes or No. Set to Yes to incorporate raw pairwise LD outputs to inform on top marker site.
    usegeneticmap     = 'Yes' // Set to either Yes or No. Set to Yes to incorporate genetic mapping outputs to inform on top marker chromosome. Requires chromchrommatch to be provided.
    sharedmarkersmapcutoff = '0.5' // (not in use yet) Set to the fraction of markers required from similarmarkersmap to allow for a given marker map to be correct, e.g., 1 is 100%, 0.5 is 50% of provided markers
    chromchrommatch   = "Chromesome-chromosome_mappings.csv" // Absolute file path to a pairwise table mapping the chromosome names of known target chromosomes to the names of the chromosomes on the reference genome, e.g. ChrA,Chr1   Scaffold2,Chr2. NOTE(review): the default filename is spelled "Chromesome" and is relative, not absolute; it is a runtime value so it is left as-is - confirm it matches the real file.
    markertargetsites = "targetchromsknownmarkers.csv" // Absolute path to a 2-column table of marker and target chromosome of each marker to give preference to in filtering, e.g., marker1, ChrA.
    similarmarkersmap = "linkedSNPs.csv" // Absolute path to a multicolumn table with the first column being the target marker and all subsequent columns being known nearby markers
    geneticmap = "genetic_mapping_input.csv" // Absolute path to a multicolumn table with the first column being the target marker, the second the number of unique chromosomes a marker was mapped to genetically, and the following 10 being the name and hits for each chromosome amongst an arbitrary number of genetic mappings which were tested
    ldedgemap        = "examplelinkagedisequalibriumfile.ld" // Absolute path to a pairwise LD edge output table (PLINK, TASSEL, Haploview long tables) of linkage disequilibrium scores, e.g. CHR_A  BP_A   SNP_A      CHR_B  BP_B   SNP_B      R2     Dprime  P
    //============================Options for BLAST=======================================================================
    evalue            = 0.05 //The BLAST E-value is the number of expected hits of similar quality (score) that could be found just by chance.
    dust              = 'no' //Filtering option to filter query sequence with DUST (Format: 'yes', 'level window linker', or 'no' to disable)
    otherblastoptions = '' //Other parameters that need to be passed to BLAST, should be of the format "-param1 val1 -param2 val2". Here you can play around with dropping the word size (-word_size) to more reliably return hits for smaller and more diverged markers
    query_chunk_size  = '20000' // Maximum size of dataset to be input into blastn at a time. fasta sequences will be split into subprocesses and run. This also acts as the split size for downstream R scripts (ideal is < 50,000 for blastn and downstream processes).
    //================================XT data with target SNP at end of probe===============================================
    minlength         = 40 //minimum length of HSP to be considered fully hybridized
    extendablebps     = 3 //number of matching base pairs from the 3 prime end for a probe to be considered as extendable
    maxgaps           = 0 // New option to implement
    //================================Filtering options====================================================================
    coverage          = 80 //Option to filter any hits with coverage less than the provided percentage
    pident            = 92  //Option to filter any hits with pident less than the provided percentage
    maximumhits       = 10 //Filter probes with more hits than maximumhits - change in logic
    localdupdist      = '100000'  // length in bp of region (both u/stream and d/stream, each way) to consider for local duplication, e.g., if value is set to 100000, search 100000 bp upstream and 100000 bp downstream of the top marker hit for additional high identity matches indicating a local duplication of this region in the reference genome being mapped to. This can be reported as additional data when performing insilico genotyping
    keeplocalduppos   = 'yes' // Keep markers where a local duplicate was detected if the allele states are identical. Will report the lowest sstart location as the Marker position.
    keepduplicates    = 'FALSE' // REDUNDANT FEATURE!!!! SUPERSEDED BY SEVERAL MORE EXPANSIVE SYSTEMS AND OPTIONS. Whether to retain or filter markers with multiple hits, maximumhits
    //================================Internal parameters=============================================================
    mappingstate     = "Onset" // Do not change this parameter! This is an internal carry-over parameter to assist with insilico genotyping. Onset=first round so use user provided orientations, Ongoing=Second or later round so use internal ProjectDIR orientationfile.
    //================================slurm options====================================================================
    hpcaccount       =  'user' //account to be used on basc, change this to the account associated with the project etc
    shortqueue       =  'shortrun' //queueing option for jobs with short run time
    longqueue        =  'batch' //queueing option for jobs with long run times
    queuesize        = 100 //Number of jobs submitted to slurm queue at any given time
}