# Source: distiller-nf/crick_project.yml at master · luslab/distiller-nf · GitHub
#########################################
# TEST HiC project yaml
# SRA: SRR5339787
# Accession no: GSM2533822
#########################################

#######################################
# paths to raw input data (fastq-s):
#######################################
input:
    raw_reads_paths:
        # Keyed by the SRA run accession, then by lane; each lane lists its
        # fastq files.
        SRR5339787:
            lane1:
                - '/camp/home/westc/westc/data/neural_hic_data/sub_sample_test/fastq/sub1.fastq.gz'
                - '/camp/home/westc/westc/data/neural_hic_data/sub_sample_test/fastq/sub2.fastq.gz'

    # For semi-dry test runs, cut every input fastq down to a small number of
    # reads (e.g. 10000).
    # NOTE(review): 0 presumably disables truncation -- confirm against the
    # pipeline.
    truncate_fastq_reads: 0

    # Reference genome against which the sequenced reads are aligned.
    # Three things are required: the genome assembly name, a wildcard path
    # matching the BWA index files of that genome, and a tab-separated table
    # of contig sizes (known as "chrom.sizes"). The chrom.sizes table selects
    # the subset of contigs, and their order, in the resulting contact map.
    genome:
        assembly_name: 'mm10_chr6'
        # Disabled alternative paths (mm10 rather than mm10_chr6):
        # bwa_index_wildcard_path: 'hic_data/genome/mm10.fa.*'
        # chrom_sizes_path: 'hic_data/genome/mm10.chrom.sizes'
        bwa_index_wildcard_path: '/camp/home/westc/westc/data/neural_hic_data/sub_sample_test/small_genome/mm10_chr6.fa.*'
        chrom_sizes_path: '/camp/home/westc/westc/data/neural_hic_data/sub_sample_test/small_genome/mm10_chr6.chrom.sizes'

# Whether to run FastQC on the input files.
do_fastqc: true

# Control how reads are mapped to the reference genomes.
map:
    # If 'chunksize' is non-zero, each input file gets split into multiple chunks,
    # each mapped separately. Useful for mapping on clusters with many
    # relatively weak nodes.
    # The optimal chunk size is defined by the balance between mapping and merging.
    # Smaller chunks (~30M) are better for clusters with many weak nodes,
    # however, having >~10 chunks per run slows down merging.
    # 'chunksize' is left commented out here, so this file sets no value for it.
    # chunksize: 0

    # Specify extra BWA mapping options.
    mapping_options: '-v 3'

    # Specify fastp trim options,
    # e.g. the parameters
    # --detect_adapter_for_pe -q 15
    trim_options: '-q 20 --json /dev/null'

    # A more technical option: use a custom script to split fastq files from SRA
    # into two files, one per read side. By default it is true, which is
    # faster (because we can use multi-threaded compression), but less
    # stable. Set to false if you download files from SRA and bwa complains
    # about unpaired reads.
    use_custom_split: true

# Control how read alignments are converted ('parsed') into Hi-C pairs.
parse:
    # If 'make_pairsam' is true, parsed Hi-C pairs will store complete
    # alignment records in the SAM format (the resulting hybrid between the
    # .pairs and .sam formats is called '.pairsam'). Such files can be useful for
    # thorough investigation of Hi-C data. Downstream of parsing, pairsams
    # are split into .pairs and .bam, and .bam alignments are tagged with
    # Hi-C related information. 'make_pairsam' roughly doubles the storage
    # and I/O requirements and should be used only when absolutely needed.
    # NOTE: when 'make_pairsam' is false, the initial output of parsing is still
    # called '.pairsam' despite missing SAM alignments, for technical reasons.
    make_pairsam: true

    # When 'make_pairsam' is true, enabling 'drop_seq' erases sequences and
    # Phred scores from the SAM alignments in .pairsam and .bam output files.
    # Enable to make lightweight .pairsam/.bam output.
    # NOTE: when 'make_pairsam' is false, 'drop_seq' is ignored.
    drop_seq: false

    # Enable 'drop_readid' to drop readID from .pairs files to create
    # lightweight .pairs files.
    # NOTE: does not affect alignment records in the .pairsam files and
    # subsequently .bam files after .pairsam splitting.
    drop_readid: false

    # When 'keep_unparsed_bams' is true, distiller preserves the _immediate_
    # output of bwa in a .bam format. Could be used as a faster alternative
    # to 'make_pairsam' when alignments are needed, but tagging them with Hi-C
    # related information is not necessary.
    keep_unparsed_bams: true

    # Pass extra options to pairtools parse, on top of the ones specified by
    # flags 'make_pairsam', 'drop_readid', 'drop_seq'. The default value
    # enables storing MAPQ scores in the .pairsam/.pairs output, which are
    # used later for filtering/binning. The default walks-policy is 'mask'
    # which masks complex walks in long reads.
    parsing_options: '--add-columns mapq'

# Control how PCR/optical duplicates are detected in the data.
dedup:
    # PCR/optical duplicates are detected as Hi-C pairs with matching locations
    # on both sides. 'max_mismatch_bp' controls the maximal allowed mismatch
    # (in bp, per the key name) in mapped locations on either side for two
    # pairs to still be considered duplicates.
    max_mismatch_bp: 3

# Control how Hi-C pairs are binned into contact maps, stored in .cool files.
bin:
    # Specify which resolutions should be included in the multi-resolution .cool file.
    # The base resolution (the smallest bin size listed, 10000 here) _must_
    # evenly divide all other resolutions.
    resolutions:
        - 1000000
        - 500000
        - 250000
        - 100000
        - 50000
        - 20000
        - 10000

    # Specify if the multi-resolution .cool output files should be balanced.
    balance: true

    # Pass additional parameters to cooler balance:
    # balance_options: ''

    # Specify additional filters applied to pairs during binning.
    # Multiple filters are allowed; for each filter, all pairs satisfying the
    # given filter expression will be binned into a separate cooler.
    # Filters are specified using the following syntax:
    # {filter_name}: '{a valid Python expression}'
    # NOTE(review): 'mapq1'/'mapq2' presumably rely on the MAPQ columns added
    # by parse.parsing_options ('--add-columns mapq') -- confirm.
    filters:
        no_filter: ''
        mapq_30: '(mapq1>=30) and (mapq2>=30)'

########################################
# folder structure for storing results
########################################
output:
    # Directories in which results are stored; all paths are written relative
    # (under 'results/').
    # NOTE(review): 'mapped_parsed_sorted_chunks' and 'pairs_library' have no
    # trailing slash while the other entries do -- confirm the pipeline joins
    # paths in a way that tolerates both forms.
    dirs:
        processed_fastqs: 'results/processed_fastqs/'
        mapped_parsed_sorted_chunks: 'results/mapped_parsed_sorted_chunks'
        fastqc: 'results/fastqc/'
        pairs_library: 'results/pairs_library'
        coolers_library: 'results/coolers_library/'
        coolers_library_group: 'results/coolers_library_group/'
        stats_library_group: 'results/stats_library_group/'