TCRtoolkit/modules/local/annotate/main.nf at 2f30cbe5e2fb6c73555a4c39e6d94bd7fc55bd15 · KarchinLab/TCRtoolkit

History

134 lines (107 loc) · 2.69 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

// ANNOTATE_PROCESS
// Reduces one sample's count table to four renamed columns
// (CDR3b, TRBV, TRBJ, counts), drops rows without a junction_aa value,
// adds a constant "sample" column and writes "<sample>_cdr3.tsv".
//
// input : tuple(sample_meta, count_table) - count_table is read as a
//         tab-separated file containing the columns junction_aa, v_call,
//         j_call and duplicate_count.
// output: tuple(sample_meta, "<sample>_cdr3.tsv"), emitted as "process".
process ANNOTATE_PROCESS {
    tag "${sample_meta.sample}"
    label 'process_low'
    publishDir enabled: false

    input:
    tuple val(sample_meta), path(count_table)

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_cdr3.tsv"), emit: "process"

    script:
    // The heredoc delimiter is quoted so that the shell hands the Python
    // source to the interpreter verbatim. Nextflow has already substituted
    // its own variables by the time the shell sees the script, so nothing in
    // the body needs shell expansion.
    """
    python - <<'EOF'
    import pandas as pd

    # Input column name -> output column name.
    # The insertion order here is also the column order of the output file.
    COLMAP = {
        "junction_aa": "CDR3b",
        "v_call": "TRBV",
        "j_call": "TRBJ",
        "duplicate_count": "counts",
    }

    df = pd.read_csv(
        "${count_table}",
        sep='\t',
        usecols=list(COLMAP),
        dtype={
            "junction_aa": "string",
            "v_call": "string",
            "j_call": "string",
            "duplicate_count": "int",
        },
    )

    # Keep rows that have a junction_aa value, rename and order the columns,
    # then append the sample name. assign() returns a new frame, so no column
    # is set on an object derived from a filtered selection.
    result = (
        df[df["junction_aa"].notna()]
        .rename(columns=COLMAP)
        [list(COLMAP.values())]
        .assign(sample="${sample_meta.sample}")
    )

    result.to_csv("${sample_meta.sample}_cdr3.tsv", sep="\t", index=False)
    EOF
    """
}

// ANNOTATE_CONCATENATE
// Runs compare_concatenate.py on the samplesheet and emits
// "concatenated_cdr3.tsv" as concat_cdr3.
// NOTE(review): the helper script is not visible here; it presumably reads
// the per-sample files from the task directory and writes the output file -
// confirm against compare_concatenate.py.
process ANNOTATE_CONCATENATE {
    label 'process_low'

    input:
    path samplesheet_utf8
    path all_sample_files

    output:
    path "concatenated_cdr3.tsv", emit: concat_cdr3

    script:
    """
    # The per-sample files are only named in a no-op command: declaring them
    # as an input makes this task wait for them (implicit scheduling dependency).
    : ${all_sample_files}

    # Concatenate the input Adaptive files and process the metadata.
    compare_concatenate.py "${samplesheet_utf8}"
    """
}

// ANNOTATE_SORT_CDR3
// Sorts the concatenated table while keeping its first line (the header)
// in place, and emits "concatenated_cdr3_sorted.tsv" as concat_cdr3_sorted.
process ANNOTATE_SORT_CDR3 {
    label 'process_medium'

    input:
    path concat_cdr3

    output:
    path 'concatenated_cdr3_sorted.tsv', emit: concat_cdr3_sorted

    script:
    """
    # Header first, then every remaining line sorted bytewise (C locale) on
    # tab-separated field 1, then on fields 2 through 5. Both commands write
    # to the same output file through a single redirection.
    {
        head -n 1 ${concat_cdr3}
        tail -n +2 ${concat_cdr3} \
            | LC_ALL=C sort \
                --field-separator=\$'\t' \
                --key=1,1 --key=2,5 \
                --parallel=${task.cpus} \
                --buffer-size=50%
    } > concatenated_cdr3_sorted.tsv
    """
}

// ANNOTATE_DEDUPLICATE_CDR3_TRBV
// Builds the set of distinct, upper-cased (column 1, column 2) pairs from the
// concatenated table, skipping its first line.
// Emits:
//   unique_cdr3_trbv            - all distinct pairs
//   unique_cdr3_trbv_with_vcall - only pairs whose second column starts with TRBV
process ANNOTATE_DEDUPLICATE_CDR3_TRBV {
    label 'process_low'

    input:
    path concat_cdr3

    output:
    path 'unique_cdr3_trbv.tsv', emit: unique_cdr3_trbv
    path 'unique_cdr3_trbv_with_vcall.tsv', emit: unique_cdr3_trbv_with_vcall

    script:
    """
    # Drop the header, upper-case the first two tab-separated columns and
    # keep one copy of each resulting line (bytewise ordering).
    tail -n +2 ${concat_cdr3} \
        | awk 'BEGIN { FS = OFS = "\t" } { print toupper(\$1), toupper(\$2) }' \
        | LC_ALL=C sort -u \
        > unique_cdr3_trbv.tsv

    # Second file for GIANA: rows with a blank (or non-TRBV) V call are left out.
    awk 'BEGIN { FS = "\t" } NF >= 2 && index(\$2, "TRBV") == 1' \
        unique_cdr3_trbv.tsv \
        > unique_cdr3_trbv_with_vcall.tsv
    """
}

// ANNOTATE_DEDUPLICATE_CDR3
// Extracts the first tab-separated column of the input file and writes its
// distinct values, one per line, to "unique_cdr3.txt" (emitted as unique_cdr3).
process ANNOTATE_DEDUPLICATE_CDR3 {
    label 'process_single'

    input:
    path unique_cdr3_trbv

    output:
    path 'unique_cdr3.txt', emit: unique_cdr3

    script:
    """
    # First column only, de-duplicated with bytewise (C locale) ordering.
    cut -f 1 ${unique_cdr3_trbv} | LC_ALL=C sort -u > unique_cdr3.txt
    """
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

main.nf

Latest commit

History

main.nf

File metadata and controls