Skip to content

Commit 89e60b0

Browse files
Merge branch 'ibis-add-preprocessing' into add_center_around_peak
2 parents 5c321e7 + ab33e8d commit 89e60b0

File tree

6 files changed

+257
-4
lines changed

6 files changed

+257
-4
lines changed

conf/modules.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ process {
3232
ext.prefix = { "${meta.id}.background_aliens" }
3333
}
3434

35+
withName: "BEDTOOLS_SUBTRACT" {
36+
publishDir = [
37+
enabled: false
38+
]
39+
ext.prefix = { "${meta.id}.background.clean" }
40+
3541
withName: "CENTER_AROUND_PEAK" {
3642
publishDir = [
3743
enabled: false
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
dependencies:
6+
- "bioconda::bedtools=2.31.1"

modules/local/awk/shade/main.nf

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
process AWK_SHADE {
    label 'process_low'
    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--h13024bc_3':
        'biocontainers/bedtools:2.31.1--h13024bc_3' }"
    // bedtools is not actually needed here, we only use awk;
    // the bedtools container is just a convenient awk-capable image.

    input:
    tuple path(bed), val(length), val(gap)
    path genome_index

    output:
    path "${prefix}.bed", emit: shade_background
    //path "versions.yml" , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    prefix = task.ext.prefix ?: 'shade'
    """
    # For every interval in the BED file, emit a "shade" window of width
    # ${length} bp upstream and downstream of the interval, separated from it
    # by a gap of ${gap} bp. Windows that would fall outside chromosome
    # bounds (below 0 or past the chromosome end) are dropped.
    #
    # BUGFIX: the previous version piped the genome index through an awk
    # program that only assigned into an array and never printed, producing
    # an empty chrom_sizes.txt — so the sizes lookup was always empty and
    # the output BED was always empty. We now read the index directly using
    # the standard NR==FNR two-file idiom (true only while reading the
    # first file), which also avoids the cumulative-NR pitfall of the old
    # FILENAME/NR>1 guards.
    awk -v w=${length} -v s=${gap} '
    # First file: genome index (.fai-style), column 1 = chrom, column 2 = size
    NR==FNR {
        sizes[\$1]=\$2
        next
    }
    # Second file: the BED records
    {
        # Carry along any columns beyond chrom/start/end
        extra=""
        for(i=4;i<=NF;i++) extra=extra"\\t"\$i

        # Only emit windows for chromosomes present in the index
        if (\$1 in sizes) {
            # Upstream window, only if it does not underflow position 0
            if (\$2-s-w >= 0) {
                print \$1, \$2-s-w, \$2-s extra
            }
            # Downstream window, only if it stays within the chromosome
            if (\$3+s+w <= sizes[\$1]) {
                print \$1, \$3+s, \$3+s+w extra
            }
        }
    }' OFS='\\t' ${genome_index} ${bed} > ${prefix}.bed
    """

    stub:
    prefix = task.ext.prefix ?: 'shade'
    """
    touch ${prefix}.bed

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bedtools: \$(bedtools --version |& sed '1!d ; s/bedtools v//')
    END_VERSIONS
    """
}
66+
67+
workflow {
    // Standalone entry point so the module can be exercised directly, e.g.:
    //   nextflow run main.nf --b peaks.bed --i genome.fa.fai --l 200 --g 50
    def bed_ch    = Channel.fromPath(params.b)
    def genome_ch = Channel.fromPath(params.i)

    // Pair the BED file with the window length and gap scalars to match the
    // process's tuple input: (bed, length, gap)
    def shade_input = bed_ch
        .combine(Channel.of(params.l))
        .combine(Channel.of(params.g))

    // The genome index travels on its own channel
    AWK_SHADE(shade_input, genome_ch)
}

modules/local/awk/shade/meta.yml

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "awk_shade"
description: |
  Build background "shade" regions from a BED file of peaks by emitting a
  fixed-width window upstream and downstream of each interval, separated from
  the interval by a configurable gap. Windows falling outside chromosome
  bounds (as given by the genome index) are discarded.
keywords:
  - bed
  - background
  - genomics
tools:
  - "awk":
      description: "AWK is a pattern scanning and text-processing language."
      homepage: "https://www.gnu.org/software/gawk/"
      documentation: "https://www.gnu.org/software/gawk/manual/"
      tool_dev_url: "https://savannah.gnu.org/projects/gawk/"
      doi: ""
      licence: ["GPL-3.0-or-later"]
      identifier: ""

input:
  - bed:
      type: file
      description: BED file of foreground peak intervals
      pattern: "*.{bed}"
  - length:
      type: integer
      description: Width (bp) of each shade window created up/downstream of a peak
  - gap:
      type: integer
      description: Distance (bp) between the peak boundary and the shade window
  - genome_index:
      type: file
      description: Genome index (fai-style) with chromosome names and sizes
      pattern: "*.{fai,txt,tsv}"

output:
  - shade_background:
      - "*.bed":
          type: file
          description: BED file of shade windows flanking the input peaks
          pattern: "*.{bed}"

authors:
  - "@mathysgrapotte"
maintainers:
  - "@mathysgrapotte"
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// Built from the nf-core module test template; regenerate snapshots with:
// nf-core modules test awk/shade
nextflow_process {

    name "Test Process AWK_SHADE"
    script "../main.nf"
    // BUGFIX: the template said process "AWK", which does not match the
    // process name AWK_SHADE declared in main.nf.
    process "AWK_SHADE"

    tag "modules"
    tag "modules_local"
    tag "awk"
    tag "awk/shade"

    test("sarscov2 - bed") {

        when {
            process {
                """
                // Module signature is tuple(bed, length, gap) + genome index —
                // the template's single BAM input (with a stray trailing comma)
                // was a Groovy syntax error and did not match the process.
                input[0] = [
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true),
                    10,
                    5
                ]
                input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }

    }

    test("sarscov2 - bed - stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = [
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true),
                    10,
                    5
                ]
                input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }

    }

}

subworkflows/local/preprocess_bedfile_to_fasta/main.nf

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ TODO write the meta.yaml file for this workflow
44
Basically, this subworkflow expects to get a bed file as input,
55
and a configuration channel that contains the target and background.
66
7+
It first centers the peaks and cuts them to a fixed size.
8+
79
Then it extracts the foreground and the background from the bed file.
810
Alternatively, it can build the background from random regions or from
9-
the foreground peaks.
11+
regions near the foreground peaks.
1012
11-
The extracted peaks are then extended and overlapping peaks are removed.
13+
The peaks in background overlapping the foreground will be removed.
1214
Finally, the peaks are converted to fasta format.
1315
1416
In this way, by knowing the target and background peaks, we can build
@@ -18,7 +20,9 @@ the dataset for stimulus with sequences as input and foreground/background
1820
*/
1921
include { EXTRACT_DATA_CONTENT_BY_COLUMN_VALUES as EXTRACT_FOREGROUND } from '../../../modules/local/extract_data_content_by_column_values'
2022
include { EXTRACT_DATA_CONTENT_BY_COLUMN_VALUES as EXTRACT_BACKGROUND_ALIENS } from '../../../modules/local/extract_data_content_by_column_values'
23+
2124
include { GAWK as CENTER_AROUND_PEAK } from '../../../modules/nf-core/gawk'
25+
include { BEDTOOLS_SUBTRACT } from '../../../modules/nf-core/bedtools/subtract'
2226

2327

2428
workflow PREPROCESS_BEDFILE_TO_FASTA {
@@ -28,6 +32,12 @@ workflow PREPROCESS_BEDFILE_TO_FASTA {
2832

2933
main:
3034

35+
// TODO: it would be nice to check that the input file is actually a bed file
36+
37+
// ==============================================================================
38+
// align peaks
39+
// ==============================================================================
40+
3141
// TODO the following is just a proof of concept and usage example
3242
// on the usage of the GAWK nf-core module for modifying
3343
// bed start and end values based on distance from peak (centering).
@@ -41,6 +51,11 @@ workflow PREPROCESS_BEDFILE_TO_FASTA {
4151
CENTER_AROUND_PEAK(ch_center_input, ch_awk_program)
4252
*/
4353

54+
// ==============================================================================
55+
// extract foreground
56+
// ==============================================================================
57+
58+
4459
// extract foreground
4560

4661
ch_foreground_ids = ch_config
@@ -53,6 +68,10 @@ workflow PREPROCESS_BEDFILE_TO_FASTA {
5368
)
5469
ch_foreground = EXTRACT_FOREGROUND.out.extracted_data
5570

71+
// ==============================================================================
72+
// extract background
73+
// ==============================================================================
74+
5675
// extract background - aliens
5776

5877
ch_background_ids = ch_config
@@ -66,11 +85,27 @@ workflow PREPROCESS_BEDFILE_TO_FASTA {
6685
)
6786
ch_background_aliens = EXTRACT_BACKGROUND_ALIENS.out.extracted_data
6887

69-
ch_background_aliens.view()
88+
// extract background - shades
89+
90+
// extract background - random
7091

7192
// merge different background if needed
93+
// TODO: implement this
94+
// for the moment use aliens background
95+
96+
ch_background = ch_background_aliens
97+
98+
// run bedtools to remove overlapping peaks
99+
// this creates a clean background with no overlapping peaks with the foreground
100+
101+
BEDTOOLS_SUBTRACT(
102+
ch_background.join(ch_foreground)
103+
)
104+
ch_background = BEDTOOLS_SUBTRACT.out.bed
72105

73-
// run bedtools to extend and remove overlapping peaks
106+
// ==============================================================================
107+
// extract fasta sequences
108+
// ==============================================================================
74109

75110
// run bedtools to convert to fasta
76111

0 commit comments

Comments
 (0)