Skip to content

Commit 2c1eabd

Browse files
authored
Merge pull request #228 from rlibouba/functional_annot_eggNOG_Interproscan
Functional annotation of protein sequences - Workflow
2 parents a294182 + b7e91c0 commit 2c1eabd

File tree

5 files changed

+255
-0
lines changed

5 files changed

+255
-0
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
version: 1.2
2+
workflows:
3+
- name: main
4+
subclass: Galaxy
5+
publish: true
6+
primaryDescriptorPath: /Functional_annotation_of_protein_sequences.ga
7+
testParameterFiles:
8+
- /Functional_annotation_of_protein_sequences-tests.yml
9+
authors:
10+
- name: Romane Libouban
11+
email: romane.libouban@irisa.fr
12+
orcid: 0009-0001-4920-9951
13+
- name: Anthony Bretaudeau
14+
email: anthony.bretaudeau@irisa.fr
15+
orcid: 0000-0003-0914-2470
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Changelog
2+
3+
## [0.1]
4+
5+
Initial version of the Functional annotation of protein sequences workflow.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
- doc: Test outline for Functional_annotation_of_protein_sequences.ga
2+
job:
3+
input:
4+
class: File
5+
location: https://zenodo.org/record/8414802/files/protein_sequences.fasta?download=1
6+
filetype: fasta
7+
outputs:
8+
eggNOG Mapper seed_orthologs:
9+
location: https://zenodo.org/records/13951790/files/eggNOG_Mapper_seed_orthologs.tabular?download=1&preview=1
10+
compare: sim_size
11+
delta: 50000
12+
eggNOG Mapper annotations:
13+
location: https://zenodo.org/records/13951790/files/eggNOG_Mapper_annot.tabular?download=1&preview=1
14+
compare: sim_size
15+
delta: 100000
16+
interproscan xml:
17+
location: https://zenodo.org/records/13951790/files/interProScan.xml?download=1&preview=1
18+
compare: sim_size
19+
delta: 7000000
20+
interproscan tabular:
21+
location: https://zenodo.org/records/13951790/files/interProScan.tabular?download=1&preview=1
22+
compare: sim_size
23+
delta: 2000000
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
{
2+
"a_galaxy_workflow": "true",
3+
"annotation": "This workflow uses eggNOG mapper and InterProScan for functional annotation of protein sequences.",
4+
"comments": [
5+
{
6+
"child_steps": [
7+
1,
8+
2
9+
],
10+
"color": "green",
11+
"data": {
12+
"title": "Functional annotation"
13+
},
14+
"id": 0,
15+
"position": [
16+
300,
17+
0
18+
],
19+
"size": [
20+
240,
21+
496
22+
],
23+
"type": "frame"
24+
}
25+
],
26+
"creator": [
27+
{
28+
"class": "Person",
29+
"email": "mailto:romane.libouban@irisa.fr",
30+
"identifier": "https://orcid.org/0009-0001-4920-9951",
31+
"name": "Romane Libouban"
32+
},
33+
{
34+
"class": "Person",
35+
"email": "mailto:anthony.bretaudeau@irisa.fr",
36+
"identifier": "https://orcid.org/0000-0003-0914-2470",
37+
"name": "Anthony Bretaudeau"
38+
}
39+
],
40+
"format-version": "0.1",
41+
"license": "MIT",
42+
"release": "0.1",
43+
"name": "Functional annotation of protein sequences",
44+
"steps": {
45+
"0": {
46+
"annotation": "This workflow uses eggNOG mapper and Interproscan for functional annotation of protein sequences.",
47+
"content_id": null,
48+
"errors": null,
49+
"id": 0,
50+
"input_connections": {},
51+
"inputs": [
52+
{
53+
"description": "This workflow uses eggNOG mapper and Interproscan for functional annotation of protein sequences.",
54+
"name": "input"
55+
}
56+
],
57+
"label": "input",
58+
"name": "Input dataset",
59+
"outputs": [],
60+
"position": {
61+
"left": 0,
62+
"top": 0
63+
},
64+
"tool_id": null,
65+
"tool_state": "{\"optional\": false, \"tag\": null}",
66+
"tool_version": null,
67+
"type": "data_input",
68+
"uuid": "fb78bb38-ab6a-4676-98c5-5d3be83e7474",
69+
"when": null,
70+
"workflow_outputs": []
71+
},
72+
"1": {
73+
"annotation": "InterProScan is a tool that analyses each protein sequence from our annotation to determine if they contain one or several of the signatures from InterPro.",
74+
"content_id": "toolshed.g2.bx.psu.edu/repos/bgruening/interproscan/interproscan/5.59-91.0+galaxy3",
75+
"errors": null,
76+
"id": 1,
77+
"input_connections": {
78+
"input": {
79+
"id": 0,
80+
"output_name": "output"
81+
}
82+
},
83+
"inputs": [],
84+
"label": "InterProScan",
85+
"name": "InterProScan",
86+
"outputs": [
87+
{
88+
"name": "outfile_tsv",
89+
"type": "tabular"
90+
},
91+
{
92+
"name": "outfile_xml",
93+
"type": "xml"
94+
}
95+
],
96+
"position": {
97+
"left": 162.5,
98+
"top": 279.5
99+
},
100+
"post_job_actions": {},
101+
"tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/interproscan/interproscan/5.59-91.0+galaxy3",
102+
"tool_shed_repository": {
103+
"changeset_revision": "74810db257cc",
104+
"name": "interproscan",
105+
"owner": "bgruening",
106+
"tool_shed": "toolshed.g2.bx.psu.edu"
107+
},
108+
"tool_state": "{\"__input_ext\": \"input\", \"applications\": [\"TIGRFAM\", \"FunFam\", \"SFLD\", \"SUPERFAMILY\", \"PANTHER\", \"Gene3D\", \"Hamap\", \"PrositeProfiles\", \"Coils\", \"SMART\", \"CDD\", \"PRINTS\", \"PIRSR\", \"PrositePatterns\", \"AntiFam\", \"Pfam\", \"MobiDBLite\", \"PIRSF\"], \"chromInfo\": \"/shared/ifbstor1/galaxy/mutable-config/tool-data/shared/ucsc/chrom/?.len\", \"database\": \"5.59-91.0\", \"goterms\": true, \"input\": {\"__class__\": \"ConnectedValue\"}, \"iprlookup\": false, \"licensed\": {\"use\": \"false\", \"__current_case__\": 1, \"applications_licensed\": [\"Phobius\", \"SignalP_EUK\", \"TMHMM\"]}, \"oformat\": [\"TSV\", \"XML\"], \"pathways\": true, \"seqtype\": \"p\", \"__page__\": null, \"__rerun_remap_job_id__\": null}",
109+
"tool_version": "5.59-91.0+galaxy3",
110+
"type": "tool",
111+
"uuid": "36d72511-ef8c-42ab-8944-b7aef340a9bc",
112+
"when": null,
113+
"workflow_outputs": [
114+
{
115+
"label": "interproscan xml",
116+
"output_name": "outfile_xml",
117+
"uuid": "bef32b2c-0065-4854-9e5a-898e689d559c"
118+
},
119+
{
120+
"label": "interproscan tabular",
121+
"output_name": "outfile_tsv",
122+
"uuid": "d58da4b2-4c05-491f-8509-609184241715"
123+
}
124+
]
125+
},
126+
"2": {
127+
"annotation": "EggNOG Mapper compares each protein sequence of the annotation to a huge set of ortholog groups from the EggNOG database.",
128+
"content_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/eggnog_mapper/eggnog_mapper/2.1.8+galaxy4",
129+
"errors": null,
130+
"id": 2,
131+
"input_connections": {
132+
"ortho_method|input": {
133+
"id": 0,
134+
"output_name": "output"
135+
}
136+
},
137+
"inputs": [],
138+
"label": "eggNOG Mapper",
139+
"name": "eggNOG Mapper",
140+
"outputs": [
141+
{
142+
"name": "seed_orthologs",
143+
"type": "tabular"
144+
},
145+
{
146+
"name": "annotations",
147+
"type": "tabular"
148+
}
149+
],
150+
"position": {
151+
"left": 340,
152+
"top": 52
153+
},
154+
"post_job_actions": {},
155+
"tool_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/eggnog_mapper/eggnog_mapper/2.1.8+galaxy4",
156+
"tool_shed_repository": {
157+
"changeset_revision": "d9c3016f7283",
158+
"name": "eggnog_mapper",
159+
"owner": "galaxyp",
160+
"tool_shed": "toolshed.g2.bx.psu.edu"
161+
},
162+
"tool_state": "{\"__input_ext\": \"input\", \"annotation_options\": {\"no_annot\": \"\", \"__current_case__\": 0, \"seed_ortholog_evalue\": \"0.001\", \"seed_ortholog_score\": null, \"tax_scope\": null, \"target_orthologs\": \"all\", \"go_evidence\": \"non-electronic\"}, \"chromInfo\": \"/shared/ifbstor1/galaxy/mutable-config/tool-data/shared/ucsc/chrom/?.len\", \"eggnog_data\": \"5.0.2\", \"ortho_method\": {\"m\": \"diamond\", \"__current_case__\": 0, \"input\": {\"__class__\": \"ConnectedValue\"}, \"input_trans\": {\"itype\": \"proteins\", \"__current_case__\": 0}, \"matrix_gapcosts\": {\"matrix\": \"BLOSUM62\", \"__current_case__\": 2, \"gap_costs\": \"--gapopen 11 --gapextend 1\"}, \"sensmode\": \"sensitive\", \"dmnd_iterate\": false, \"dmnd_ignore_warnings\": false, \"query_cover\": null, \"subject_cover\": null, \"pident\": null, \"evalue\": null, \"score\": \"0.001\"}, \"output_options\": {\"no_file_comments\": false, \"report_orthologs\": false, \"md5\": false}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
163+
"tool_version": "2.1.8+galaxy4",
164+
"type": "tool",
165+
"uuid": "76b27114-c49f-41b8-9333-91953729dee3",
166+
"when": null,
167+
"workflow_outputs": [
168+
{
169+
"label": "eggNOG Mapper annotations",
170+
"output_name": "annotations",
171+
"uuid": "25a1e387-baa5-48b6-b142-198bec95463e"
172+
},
173+
{
174+
"label": "eggNOG Mapper seed_orthologs",
175+
"output_name": "seed_orthologs",
176+
"uuid": "47ac8226-b800-479f-a6f0-f882bafc33d7"
177+
}
178+
]
179+
}
180+
},
181+
"tags": [],
182+
"uuid": "4cbba315-c9bc-4895-aeeb-57dadef3542a",
183+
"version": 2
184+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Functional annotation of protein sequences Workflow
2+
3+
This workflow uses eggNOG mapper and InterProScan for functional annotation of protein sequences.
4+
It can be used on proteins from any organism.
5+
6+
EggNOG Mapper compares each protein sequence of the annotation to a huge set of ortholog groups from the EggNOG database. In this database, each ortholog group is associated with functional annotation like Gene Ontology (GO) terms or KEGG pathways. When the protein sequence of a new gene is found to be very similar to one of these ortholog groups, the corresponding functional annotation is transferred to this new gene.
7+
8+
InterProScan is a tool that analyses each protein sequence from our annotation to determine if they contain one or several of the signatures from InterPro. When a protein contains a known signature, the corresponding functional annotation will be assigned to it by InterProScan.
9+
10+
## Input dataset
11+
This workflow requires only one input file: a protein sequences file in FASTA format.
12+
13+
14+
## Outputs for eggNOG Mapper
15+
The output of this tool is a tabular file, where each line represents a gene from our annotation, with the functional annotation that was found by EggNOG-mapper. It includes a predicted protein name, GO terms, EC numbers, KEGG identifiers, etc.
16+
17+
## Outputs for InterProScan
18+
The output of this tool is both a tabular file and an XML file. Both contain the same information, but the tabular one is more readable for a human: each line represents a gene from our annotation, with the different domains and motifs that were found by InterProScan.
19+
20+
Each line corresponds to a motif found in one of the annotated proteins. The most interesting columns are:
21+
- Column 1: the protein identifier
22+
- Column 5: the identifier of the signature that was found in the protein sequence
23+
- Column 4: the databank where this signature comes from (InterProScan regroups several motifs databanks)
24+
- Column 6: the human readable description of the motif
25+
- Columns 7 and 8: the position where the motif was found
26+
- Column 9: a score for the match (if available)
27+
- Columns 12 and 13: identifier of the signature integrated in InterPro (if available). Have a look at an example webpage for IPR036859 on InterPro.
28+
- The following columns contain various identifiers that were assigned to the protein based on the match with the signature (Gene ontology term, Reactome, …)

0 commit comments

Comments
 (0)