-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmain.nf
More file actions
134 lines (107 loc) · 2.69 KB
/
main.nf
File metadata and controls
134 lines (107 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/*
 * ANNOTATE_PROCESS
 *
 * Reduces one per-sample, tab-separated count table to its CDR3 columns.
 * The embedded Python reads only junction_aa, v_call, j_call and
 * duplicate_count, drops rows whose junction_aa is missing, renames the
 * columns to CDR3b, TRBV, TRBJ and counts, appends a `sample` column holding
 * sample_meta.sample, and writes the result to <sample>_cdr3.tsv.
 *
 * input : tuple of (sample_meta, count_table)
 * output: tuple of (sample_meta, <sample>_cdr3.tsv), emitted as "process"
 */
process ANNOTATE_PROCESS {
    tag "${sample_meta.sample}"
    label 'process_low'
    publishDir enabled: false

    input:
    tuple val(sample_meta), path(count_table)

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_cdr3.tsv"), emit: "process"

    script:
    def sample_id  = sample_meta.sample
    def cdr3_table = "${sample_id}_cdr3.tsv"
    """
    python - <<EOF
    import pandas as pd

    # Source column -> output column, listed in output order.
    RENAMES = {
        "junction_aa": "CDR3b",
        "v_call": "TRBV",
        "j_call": "TRBJ",
        "duplicate_count": "counts",
    }
    SOURCE_COLUMNS = list(RENAMES)
    OUTPUT_COLUMNS = list(RENAMES.values())

    # Explicit dtypes so the text columns stay strings and the count stays an integer.
    COLUMN_TYPES = {
        "junction_aa": "string",
        "v_call": "string",
        "j_call": "string",
        "duplicate_count": "int",
    }

    raw = pd.read_csv("${count_table}", sep='\t', usecols=SOURCE_COLUMNS, dtype=COLUMN_TYPES)

    # Keep only rows that actually carry a junction_aa value.
    has_cdr3 = raw["junction_aa"].notna()
    cdr3 = raw[has_cdr3].rename(columns=RENAMES)[OUTPUT_COLUMNS]

    cdr3["sample"] = "${sample_id}"
    cdr3.to_csv("${cdr3_table}", sep="\t", index=False)
    EOF
    """
}
/*
 * ANNOTATE_CONCATENATE
 *
 * Runs compare_concatenate.py on the samplesheet and collects
 * concatenated_cdr3.tsv from the task directory.
 *
 * input : samplesheet_utf8   - passed to compare_concatenate.py as its only argument
 *         all_sample_files   - only named in a shell no-op inside the script
 * output: concatenated_cdr3.tsv, emitted as concat_cdr3
 */
process ANNOTATE_CONCATENATE {
    label 'process_low'

    input:
    path samplesheet_utf8
    path all_sample_files

    output:
    path "concatenated_cdr3.tsv", emit: concat_cdr3

    script:
    """
    # Concatenate the input Adaptive files and process the metadata.
    # 'all_sample_files' acts as an implicit dependency that controls scheduling;
    # it is only named in the no-op below.
    : ${all_sample_files}

    compare_concatenate.py "${samplesheet_utf8}"
    """
}
/*
 * ANNOTATE_SORT_CDR3
 *
 * Sorts the concatenated table while keeping its header in place: the first
 * line is copied through unchanged and the remaining lines are sorted in byte
 * order (LC_ALL=C) on tab-separated field 1, then on fields 2 through 5.
 *
 * input : concat_cdr3 - tab-separated table with a one-line header
 * output: concatenated_cdr3_sorted.tsv, emitted as concat_cdr3_sorted
 */
process ANNOTATE_SORT_CDR3 {
    label 'process_medium'

    input:
    path concat_cdr3

    output:
    path 'concatenated_cdr3_sorted.tsv', emit: concat_cdr3_sorted

    script:
    // GNU sort interprets "-S 50%" as half of the machine's physical memory,
    // which can be far more than the memory granted to this task. Size the
    // buffer from the task allocation instead so sort spills to temporary
    // files rather than outgrowing the task limit. When no memory directive
    // is configured, keep the previous "50%" setting.
    def sort_buffer = task.memory ? "${Math.max(1, task.memory.toMega().intdiv(2))}M" : '50%'
    """
    head -n 1 ${concat_cdr3} > concatenated_cdr3_sorted.tsv
    tail -n +2 ${concat_cdr3} \
        | LC_ALL=C sort \
            -t \$'\t' \
            -k1,1 -k2,5 \
            --parallel=${task.cpus} \
            -S ${sort_buffer} \
        >> concatenated_cdr3_sorted.tsv
    """
}
/*
 * ANNOTATE_DEDUPLICATE_CDR3_TRBV
 *
 * Builds the set of distinct (column 1, column 2) pairs from the concatenated
 * table, skipping its header line and upper-casing both values, then writes a
 * second copy restricted to rows whose second column starts with "TRBV".
 *
 * input : concat_cdr3 - tab-separated table with a one-line header
 * output: unique_cdr3_trbv.tsv            (emit: unique_cdr3_trbv)
 *         unique_cdr3_trbv_with_vcall.tsv (emit: unique_cdr3_trbv_with_vcall)
 */
process ANNOTATE_DEDUPLICATE_CDR3_TRBV {
    label 'process_low'

    input:
    path concat_cdr3

    output:
    path 'unique_cdr3_trbv.tsv', emit: unique_cdr3_trbv
    path 'unique_cdr3_trbv_with_vcall.tsv', emit: unique_cdr3_trbv_with_vcall

    script:
    """
    # Upper-case the first two columns of every data row and keep one copy of each pair.
    tail -n +2 ${concat_cdr3} \
        | awk 'BEGIN { FS = OFS = "\t" } { print toupper(\$1), toupper(\$2) }' \
        | LC_ALL=C sort -u \
        > unique_cdr3_trbv.tsv

    # Additional file with blank TRBV calls removed for GIANA.
    awk -F'\t' 'NF >= 2 && \$2 ~ /^TRBV/' unique_cdr3_trbv.tsv \
        > unique_cdr3_trbv_with_vcall.tsv
    """
}
/*
 * ANNOTATE_DEDUPLICATE_CDR3
 *
 * Extracts the first tab-separated column of the input table and writes its
 * distinct values, sorted in byte order (LC_ALL=C), to unique_cdr3.txt.
 *
 * input : unique_cdr3_trbv - tab-separated table
 * output: unique_cdr3.txt, emitted as unique_cdr3
 */
process ANNOTATE_DEDUPLICATE_CDR3 {
    label 'process_single'

    input:
    path unique_cdr3_trbv

    output:
    path 'unique_cdr3.txt', emit: unique_cdr3

    script:
    """
    # First column only; repeated values are collapsed by the unique sort.
    cut -f 1 ${unique_cdr3_trbv} | LC_ALL=C sort -u > unique_cdr3.txt
    """
}