Skip to content

Commit 2f1e2fe

Browse files
authored
Merge pull request #420 from abotlp/feat/rf2na-fasta-input
RF2NA: use multi-chain FASTA input and drop interactions sheet
2 parents ddf2d1a + a94cd25 commit 2f1e2fe

File tree

15 files changed

+238
-124
lines changed

15 files changed

+238
-124
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7676
- [[PR #399](https://github.com/nf-core/proteinfold/pulls/399)] - Update alphafold2 and alphafold2_pred Dockerfiles.
7777
- [[PR #404](https://github.com/nf-core/proteinfold/pulls/404)] - Boltz cache files moved to workdir, fixed version checks and Boltz stubRun.
7878
- [[#401](https://github.com/nf-core/proteinfold/issues/401)] - Get rid of symlinking in the prediction tools processes when using "PREPARE_DBS" subworkflows
79+
- [[#410](https://github.com/nf-core/proteinfold/issues/410)] - Switch RosettaFold2NA to Boltz-style multi-chain FASTA inputs and drop the interactions sheet.
7980
- [[PR #407](https://github.com/nf-core/proteinfold/pulls/407)] - Several changes to meet nf-core standards.
8081
- [[PR #409](https://github.com/nf-core/proteinfold/pulls/409)] - Force single pdb workflow outputs to return as a list
8182
- [[PR #396](https://github.com/nf-core/proteinfold/pulls/396)] - Split ColabFold into separate optimised containers with version pinning and significant size reduction.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ The pipeline takes care of downloading the databases and parameters required by
186186
-profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
187187
```
188188

189-
- The RosettaFold2NA mode can be run using the command below:
189+
- The RosettaFold2NA mode can be run using the command below:
190190

191191
```console
192192
nextflow run nf-core/proteinfold \

assets/schema_interactions.json

Lines changed: 0 additions & 31 deletions
This file was deleted.

bin/fasta_to_rosettafold.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import re
4+
import sys
5+
from pathlib import Path
6+
7+
8+
def read_fasta(path, sample_id):
    """Parse a (multi-)FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        path: Path of the FASTA file to read.
        sample_id: Sample identifier, used to synthesise a header of the form
            ``<sample_id>_chain_<n>`` for records whose ``>`` line is empty.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. Headers have
        the leading ``>`` and surrounding whitespace removed; sequences are
        upper-cased with all whitespace removed. A record may have an empty
        sequence. Lines that appear before the first ``>`` header are ignored.
    """
    entries = []
    header = None
    seq_parts = []
    # Explicit encoding so parsing does not depend on the container's locale.
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the record that was being collected.
                if header is not None:
                    entries.append((header, "".join(seq_parts)))
                header = line[1:].strip() or f"{sample_id}_chain_{len(entries) + 1}"
                seq_parts = []
            elif header is not None:
                # split() with no argument drops *any* whitespace (spaces and
                # tabs), so column-formatted sequence lines are joined cleanly.
                seq_parts.append("".join(line.split()).upper())
    if header is not None:
        entries.append((header, "".join(seq_parts)))
    return entries
27+
28+
29+
# Lower-case header keywords mapped to RF2NA entity codes:
# P = protein, R = RNA, D = double-stranded DNA, S = single-stranded DNA.
# Insertion order matters: the loose keyword scan returns the first match.
_TYPE_ALIASES = {
    "protein": "P",
    "prot": "P",
    "aa": "P",
    "pep": "P",
    "peptide": "P",
    "p": "P",
    "rna": "R",
    "r": "R",
    "double": "D",
    "ds": "D",
    "dsdna": "D",
    "double_dna": "D",
    "single": "S",
    "ss": "S",
    "ssdna": "S",
    "single_dna": "S",
    "single-strand": "S",
    "singlestrand": "S",
}
# Explicit tag such as "type=protein" or "molecule: rna".
_TYPE_TAG_RE = re.compile(r"(?:type|entity|molecule|mol)[:=]\s*([A-Za-z0-9_-]+)")
# One pre-compiled whole-word pattern per alias, in alias-table order.
_ALIAS_PATTERNS = tuple(
    (re.compile(r"\b" + re.escape(alias) + r"\b"), code)
    for alias, code in _TYPE_ALIASES.items()
)
_NUCLEIC_CODES = frozenset("RDS")
# One-letter codes that denote amino acids but are not IUPAC nucleotide codes.
_PROTEIN_ONLY_LETTERS = frozenset("EFIJLOPQZ")


def infer_type(header, sequence):
    """Return the RF2NA entity code for one FASTA record.

    Resolution order:

    1. An explicit ``type=`` / ``entity=`` / ``molecule=`` / ``mol=`` tag
       (``:`` is accepted instead of ``=``) whose value is a known alias.
    2. The first known alias found as a whole word anywhere in the header.
       A nucleic-acid keyword is skipped when the sequence contains residues
       that only exist in the amino-acid alphabet, so a protein described as
       e.g. "T7 RNA polymerase" is not mistaken for RNA.
    3. Sequence composition: only ``ACUGN`` -> RNA, only ``ACTGN`` ->
       double-stranded DNA, anything else -> protein.

    Args:
        header: FASTA header text without the leading ``>``.
        sequence: Upper-case residue string.

    Returns:
        ``"P"``, ``"R"``, ``"D"`` or ``"S"``, or ``None`` when no header hint
        is found and the sequence is empty.
    """
    header_lower = header.lower()
    seq_set = set(sequence)

    tag = _TYPE_TAG_RE.search(header_lower)
    if tag and tag.group(1) in _TYPE_ALIASES:
        return _TYPE_ALIASES[tag.group(1)]

    cannot_be_nucleic = bool(seq_set & _PROTEIN_ONLY_LETTERS)
    for pattern, code in _ALIAS_PATTERNS:
        if not pattern.search(header_lower):
            continue
        if code in _NUCLEIC_CODES and cannot_be_nucleic:
            # The keyword is part of a description, not the molecule type.
            continue
        return code

    if not sequence:
        return None
    # RNA is tested first, so a sequence with neither U nor T counts as RNA.
    if seq_set <= set("ACUGN"):
        return "R"
    # Default DNA to double-stranded unless explicitly marked single-strand.
    if seq_set <= set("ACTGN"):
        return "D"
    return "P"
72+
73+
74+
def main():
    """Split a multi-chain FASTA into per-chain RF2NA inputs.

    Expects ``sys.argv`` to be ``[prog, sample_id, fasta_path]``. Writes one
    ``chain_<NNN>_<name>.fa`` file per record into ``rf2na_input/`` plus a
    ``chain_map.tsv`` table with columns ``type``, ``filename``, ``header``.

    Returns:
        ``0`` on success. ``1`` after printing a message to stderr when the
        arguments are wrong, the input is not an existing FASTA file, the
        file holds no records, a record has no sequence, or a record's
        entity type cannot be determined.
    """
    if len(sys.argv) != 3:
        sys.stderr.write("Usage: fasta_to_rosettafold.py <sample_id> <fasta_path>\n")
        return 1

    sample_id, fasta_path = sys.argv[1], sys.argv[2]
    allowed_ext = (".fa", ".fasta", ".fas", ".faa", ".fna")
    if not fasta_path.lower().endswith(allowed_ext):
        sys.stderr.write(
            f"[ROSETTAFOLD2NA_FASTA] Input file '{fasta_path}' must be a FASTA file.\n"
        )
        return 1

    if not os.path.exists(fasta_path):
        sys.stderr.write(
            f"[ROSETTAFOLD2NA_FASTA] Input FASTA '{fasta_path}' does not exist.\n"
        )
        return 1

    entries = read_fasta(fasta_path, sample_id)
    if not entries:
        sys.stderr.write(
            f"[ROSETTAFOLD2NA_FASTA] No sequences found in '{fasta_path}'.\n"
        )
        return 1

    output_dir = Path("rf2na_input")
    output_dir.mkdir(parents=True, exist_ok=True)

    chain_records = []
    for idx, (header, sequence) in enumerate(entries, start=1):
        # Fail fast on a header with no residues: a type keyword in the
        # header would otherwise let an empty chain file through.
        if not sequence:
            sys.stderr.write(
                f"[ROSETTAFOLD2NA_FASTA] Entry '{header}' in '{fasta_path}' has no sequence.\n"
            )
            return 1
        chain_type = infer_type(header, sequence)
        if chain_type is None:
            sys.stderr.write(
                f"[ROSETTAFOLD2NA_FASTA] Unable to determine entity type for entry '{header}'. "
                "Please include a token such as 'type=protein', 'type=double_dna', or 'type=single_dna'.\n"
            )
            return 1
        if chain_type not in {"P", "R", "D", "S"}:
            sys.stderr.write(
                f"[ROSETTAFOLD2NA_FASTA] Unable to determine entity type for entry '{header}'. "
                "Allowed types: protein (P), rna (R), double_dna (D), single_dna (S).\n"
            )
            return 1
        safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", header) or f"chain_{idx}"
        # The zero-padded index makes every filename unique, so no separate
        # collision handling is needed.
        filename = f"chain_{idx:03d}_{safe_name[:40]}.fa"
        with open(output_dir / filename, "w", encoding="utf-8") as fh:
            fh.write(f">{header}\n")
            for start in range(0, len(sequence), 80):
                fh.write(sequence[start : start + 80] + "\n")
        chain_records.append((chain_type, filename, header))

    with open(output_dir / "chain_map.tsv", "w", encoding="utf-8") as mapping:
        mapping.write("type\tfilename\theader\n")
        for chain_type, filename, header in chain_records:
            # A tab inside the header would add a bogus column to the TSV.
            tsv_header = header.replace("\t", " ")
            mapping.write(f"{chain_type}\t{filename}\t{tsv_header}\n")

    return 0
136+
137+
138+
if __name__ == "__main__":
    # Hand main()'s status code back to the calling shell / Nextflow task.
    raise SystemExit(main())

bin/generate_report.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ def generate_output_images(msa_path, plddt_data, name, out_dir, in_type, generat
5757
for line in in_file:
5858
msa.append([int(x) for x in line.strip().split()])
5959

60+
# Pad jagged MSAs to avoid shape errors in downstream plotting
61+
if msa:
62+
max_len = max(len(row) for row in msa)
63+
if any(len(row) != max_len for row in msa):
64+
msa = [row + [21] * (max_len - len(row)) for row in msa]
65+
6066
seqid = []
6167
for sequence in msa:
6268
matches = [

conf/test_rosettafold2na.config

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ params {
2626
// Input data to test rosettafold2na
2727
mode = 'rosettafold2na'
2828
rosettafold2na_db = "${projectDir}/assets/dummy_db_dir"
29-
input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.2/samplesheet.csv'
30-
interactions = params.pipelines_testdata_base_path + 'proteinfold/testdata/interactions/v1.2/interactions.csv'
29+
input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v2.0/rna_complex_samplesheet.csv'
3130
}
3231

3332
process {

docs/usage.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,9 @@ nextflow run nf-core/proteinfold \
427427
-profile <docker/singularity/.../institute>
428428
```
429429

430+
> [!NOTE]
431+
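> RosettaFold2NA now expects each samplesheet row to reference a multi-chain FASTA that includes every interacting molecule. Add a `type=` hint to each header (for example `type=protein`, `type=rna`, `type=double_dna`, or `type=single_dna`) so the adaptor can tag chains with the correct RF2NA entity codes (`P`, `R`, `D`, `S`). If no `type=` hint is present, the adaptor first looks for a standalone keyword in the header (for example `protein`, `peptide`, `rna`, `dsdna`, or `ssdna`); if none is found, the chain type is inferred from sequence composition (only `ACUGN` → RNA; only `ACTGN` → DNA, which defaults to `D` unless explicitly tagged single-strand; anything else → protein). Note that a sequence containing neither `U` nor `T` (only `A`, `C`, `G`, `N`) is therefore classified as RNA, so tag such DNA chains explicitly.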
432+
430433
Note that the pipeline will create the following files in your working directory:
431434

432435
```bash

main.nf

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,9 @@ workflow NFCORE_PROTEINFOLD {
6262

6363
take:
6464
samplesheet // channel: samplesheet read in from --input
65-
interactions // channel: interactions read in from --interactions
6665

6766
main:
6867
ch_samplesheet = samplesheet
69-
ch_interactions = interactions
7068
ch_multiqc = channel.empty()
7169
ch_versions = channel.empty()
7270
ch_report_input = channel.empty()
@@ -489,14 +487,12 @@ workflow NFCORE_PROTEINFOLD {
489487
//
490488
ROSETTAFOLD2NA (
491489
ch_samplesheet,
492-
ch_interactions,
493490
ch_versions,
494491
PREPARE_ROSETTAFOLD2NA_DBS.out.bfd,
495492
PREPARE_ROSETTAFOLD2NA_DBS.out.uniref30,
496493
PREPARE_ROSETTAFOLD2NA_DBS.out.pdb100,
497494
PREPARE_ROSETTAFOLD2NA_DBS.out.rna,
498-
PREPARE_ROSETTAFOLD2NA_DBS.out.rosettafold2na_weights,
499-
ch_dummy_file
495+
PREPARE_ROSETTAFOLD2NA_DBS.out.rosettafold2na_weights
500496
)
501497
ch_multiqc = ch_multiqc.mix(ROSETTAFOLD2NA.out.multiqc_report.collect())
502498
ch_versions = ch_versions.mix(ROSETTAFOLD2NA.out.versions)
@@ -631,7 +627,6 @@ workflow {
631627
args,
632628
params.outdir,
633629
params.input,
634-
params.interactions,
635630
params.help,
636631
params.help_full,
637632
params.show_hidden
@@ -641,8 +636,7 @@ workflow {
641636
// WORKFLOW: Run main workflow
642637
//
643638
NFCORE_PROTEINFOLD (
644-
PIPELINE_INITIALISATION.out.samplesheet,
645-
PIPELINE_INITIALISATION.out.interactions
639+
PIPELINE_INITIALISATION.out.samplesheet
646640
)
647641

648642
//
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: rosettafold2na_fasta
2+
channels:
3+
- conda-forge
4+
- defaults
5+
dependencies:
6+
- python=3.8
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Converts one multi-chain FASTA into the per-chain layout consumed by
// RosettaFold2NA: a `rf2na_input/` directory holding one FASTA per chain
// plus `chain_map.tsv` (written by bin/fasta_to_rosettafold.py).
process ROSETTAFOLD2NA_FASTA {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
        'biocontainers/python:3.8.3' }"

    input:
    tuple val(meta), path(fasta)

    output:
    tuple val(meta), path("rf2na_input", type: "dir"), emit: rf2na_input
    path "versions.yml"                              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // The heredoc delimiter must stay unquoted: quoting it disables the
    // $(...) command substitution and versions.yml would contain the literal
    // command text instead of the Python version.
    """
    fasta_to_rosettafold.py "${meta.id}" "${fasta}"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | sed 's/Python //g')
    END_VERSIONS
    """

    stub:
    // Creates the same output layout without running the adaptor script.
    """
    mkdir -p rf2na_input
    touch rf2na_input/chain_map.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | sed 's/Python //g')
    END_VERSIONS
    """
}

0 commit comments

Comments
 (0)