Skip to content

Commit 8e77aca

Browse files
authored
Merge pull request #54 from haddocking/interaction-partners
Interaction partners
2 parents 1d44ca5 + 9bcd1b4 commit 8e77aca

File tree

7 files changed

+152
-22
lines changed

7 files changed

+152
-22
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Python package to detect proteins in EM density maps.
1010

1111
It uses
1212

13-
- [Uniprot Sparql endpoint](https://sparql.uniprot.org/) to search for proteins and their measured or predicted 3D structures.
13+
- [protein-quest](https://github.com/haddocking/protein-quest) to search, retrieve and filter protein structures from Uniprot, PDBe and AlphaFold DB.
1414
- [powerfit](https://pypi.org/project/powerfit-em/) to fit protein structures in an Electron Microscopy (EM) density map.
1515

1616
An example workflow:
@@ -60,6 +60,26 @@ protein-detective search \
6060

6161
In the `./mysession` directory, you will find a `session.db` file, which is a [DuckDB](https://duckdb.org/) database with search results.
6262

63+
<details>
64+
<summary>You can also include interaction partners in the search</summary>
65+
66+
```shell
67+
protein-detective --log-level INFO search \
68+
--taxon-id 9606 \
69+
--reviewed \
70+
--subcellular-location-uniprot nucleus \
71+
--subcellular-location-go GO:0005634 \
72+
--molecular-function-go GO:0003677 \
73+
--interaction-partner-seed A8MT69 \
74+
--interaction-partner-exclude B1APH4 \
75+
--limit 100 \
76+
./mysession2
77+
```
78+
79+
This will add `Q96H22`, which is an interaction partner of `A8MT69` in a macromolecular complex.
80+
81+
</details>
82+
6383
### To retrieve a bunch of structures
6484

6585
```shell

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies = [
1717
"molviewspec>=1.6.0",
1818
"pandas>=2.3.0",
1919
"powerfit-em[opencl]>=3.0.6",
20-
"protein-quest>=0.3.3",
20+
"protein-quest>=0.4.0",
2121
"psutil>=7.0.0",
2222
"rich>=14.0.0",
2323
"rich-argparse>=1.7.1",
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.0"
1+
__version__ = "0.5.0"

src/protein_detective/cli.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from protein_quest.alphafold.fetch import downloadable_formats
99
from protein_quest.converter import converter
1010
from protein_quest.ss import SecondaryStructureFilterQuery
11-
from protein_quest.uniprot import Query
1211
from rich import print as rprint
1312
from rich.logging import RichHandler
1413
from rich_argparse import RawDescriptionRichHelpFormatter, RichHelpFormatter
@@ -19,6 +18,7 @@
1918
add_powerfit_parser,
2019
handle_powerfit,
2120
)
21+
from protein_detective.search import UniprotQuery
2222
from protein_detective.workflow import (
2323
filter_structures,
2424
retrieve_structures,
@@ -50,6 +50,23 @@ def add_search_parser(subparsers):
5050
action="append",
5151
help="Molecular function (GO term, e.g. GO:0003677). Can be specified multiple times.",
5252
)
53+
parser.add_argument(
54+
"--interaction-partner-seed",
55+
type=str,
56+
action="append",
57+
help=dedent("""\
58+
UniProt ID to use as interaction partner seed.
59+
The search will be expanded to include structures identifiers of the found interaction partners.
60+
Can be specified multiple times.
61+
"""),
62+
)
63+
parser.add_argument(
64+
"--interaction-partner-exclude",
65+
type=str,
66+
action="append",
67+
help="UniProt ID to exclude as found interaction partners. Can be specified multiple times.",
68+
)
69+
5370
parser.add_argument("--limit", type=int, default=10_000, help="Limit number of results")
5471

5572

@@ -131,20 +148,27 @@ def add_filter_parser(subparsers: argparse._SubParsersAction):
131148

132149

133150
def handle_search(args):
134-
query = Query(
135-
taxon_id=args.taxon_id,
136-
reviewed=args.reviewed,
137-
subcellular_location_uniprot=args.subcellular_location_uniprot,
138-
subcellular_location_go=args.subcellular_location_go,
139-
molecular_function_go=args.molecular_function_go,
151+
query = converter.structure(
152+
{
153+
"taxon_id": args.taxon_id,
154+
"reviewed": args.reviewed,
155+
"subcellular_location_uniprot": args.subcellular_location_uniprot,
156+
"subcellular_location_go": args.subcellular_location_go,
157+
"molecular_function_go": args.molecular_function_go,
158+
"interaction_partner_seeds": args.interaction_partner_seed or [],
159+
"interaction_partner_excludes": args.interaction_partner_exclude or [],
160+
},
161+
UniprotQuery,
140162
)
141163
session_dir = Path(args.session_dir)
142-
nr_uniprot, nr_pdbes, nr_prot2pdbes, nr_afs = search_structures_in_uniprot(query, session_dir, limit=args.limit)
164+
result = search_structures_in_uniprot(query, session_dir, limit=args.limit)
143165
rprint(
144-
f"Search completed: {nr_uniprot} UniProt entries found, "
145-
f"{nr_pdbes} PDBe structures, {nr_prot2pdbes} UniProt to PDB mappings, "
146-
f"{nr_afs} AlphaFold structures."
166+
f"Search completed: {result.nr_uniprot_accessions} UniProt entries found, "
167+
f"{result.nr_pdbs} PDBe structures, {result.nr_prot2pdb} UniProt to PDB mappings, "
168+
f"{result.nr_afs} AlphaFold structures."
147169
)
170+
if query.interaction_partner_seeds:
171+
rprint(f"Included {result.nr_interaction_partners} Uniprot entries found as interaction partners.")
148172

149173

150174
def handle_retrieve(args):

src/protein_detective/db.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@
77
from importlib.resources import read_text
88
from pathlib import Path
99

10-
from cattrs import unstructure
1110
from duckdb import ConstraintException, DuckDBPyConnection, InvalidInputException
1211
from duckdb import connect as duckdb_connect
1312
from pandas import DataFrame
1413
from protein_quest.alphafold.entry_summary import EntrySummary
1514
from protein_quest.alphafold.fetch import AlphaFoldEntry
1615
from protein_quest.converter import converter
17-
from protein_quest.uniprot import PdbResult, Query
16+
from protein_quest.uniprot import PdbResult
1817

1918
from protein_detective.filter import FilteredStructure, FilterOptions
2019
from protein_detective.powerfit.options import PowerfitOptions
20+
from protein_detective.search import UniprotQuery
2121

2222
logger = logging.getLogger(__name__)
2323

@@ -111,14 +111,15 @@ def connect(session_dir: Path, read_only: bool = False) -> Iterator[DuckDBPyConn
111111
con.close()
112112

113113

114-
def save_query(query: Query, con: DuckDBPyConnection):
114+
def save_query(query: UniprotQuery, con: DuckDBPyConnection):
115115
"""Save a UniProt search query to the database.
116116
117117
Args:
118118
query: The UniProt search query to save.
119119
con: The DuckDB connection to use for saving the data.
120120
"""
121-
con.execute("INSERT INTO uniprot_searches (query) VALUES (?)", (unstructure(query),))
121+
value = converter.dumps(query).decode()
122+
con.execute("INSERT INTO uniprot_searches (query) VALUES (?)", (value,))
122123

123124

124125
def save_uniprot_accessions(uniprot_accessions: Iterable[str], con: DuckDBPyConnection) -> int:

src/protein_detective/search.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""Module with search logic."""
2+
3+
import logging
4+
from dataclasses import dataclass, field
5+
6+
from protein_quest.uniprot import Query, search4macromolecular_complexes
7+
8+
logger = logging.getLogger(__name__)
9+
10+
11+
@dataclass
12+
class UniprotQuery(Query):
13+
"""A UniProt search query with interaction partner options.
14+
15+
Parameters:
16+
interaction_partner_seeds: A set of UniProt accessions to search for interaction partners.
17+
interaction_partners_excludes: A set of UniProt accessions to exclude from interaction partner results.
18+
"""
19+
20+
interaction_partner_seeds: set[str] = field(default_factory=set)
21+
interaction_partner_excludes: set[str] = field(default_factory=set)
22+
23+
24+
def search_for_interaction_partners(query: UniprotQuery, limit: int) -> set[str]:
    """Search for interaction partners of the seed accessions in a query.

    Looks up the macromolecular complexes for the seeds and collects the
    members of those complexes. The seeds themselves and any explicitly
    excluded accessions are removed from the result.

    Args:
        query: The search query containing seeds and excludes.
        limit: The maximum number of results, passed on to the macromolecular complex search.

    Returns:
        A set of unique UniProt accessions of interaction partners found.
        An empty set when the query has no interaction partner seeds.
    """
    if not query.interaction_partner_seeds:
        logger.info("No interaction partner seeds provided; skipping search for interaction partners.")
        return set()
    logger.info("Searching for interaction partners of seeds %s", query.interaction_partner_seeds)

    complexes = search4macromolecular_complexes(query.interaction_partner_seeds, limit)
    uniprot_accessions_of_partners: set[str] = set()
    for complex_entry in complexes:
        uniprot_accessions_of_partners.update(complex_entry.members)

    # Exclude seeds and excludes from results; difference_update accepts several iterables at once.
    uniprot_accessions_of_partners.difference_update(
        query.interaction_partner_seeds,
        query.interaction_partner_excludes,
    )

    logger.info(
        "Found %d unique interaction partners in %d macromolecular complexes after excluding %d accessions",
        len(uniprot_accessions_of_partners),
        len(complexes),
        len(query.interaction_partner_excludes),
    )
    return uniprot_accessions_of_partners

src/protein_detective/workflow.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import logging
5+
from dataclasses import dataclass
56
from pathlib import Path
67
from typing import Literal
78

@@ -10,7 +11,7 @@
1011
from protein_quest.alphafold.fetch import fetch_many_async as af_fetch
1112
from protein_quest.alphafold.fetch import relative_to as af_relative_to
1213
from protein_quest.pdbe.fetch import fetch as pdbe_fetch
13-
from protein_quest.uniprot import Query, search4af, search4pdb, search4uniprot
14+
from protein_quest.uniprot import search4af, search4pdb, search4uniprot
1415

1516
from protein_detective.db import (
1617
connect,
@@ -33,11 +34,31 @@
3334
filter_alphafold_structures,
3435
filter_pdbe_structures,
3536
)
37+
from protein_detective.search import UniprotQuery, search_for_interaction_partners
3638

3739
logger = logging.getLogger(__name__)
3840

3941

40-
def search_structures_in_uniprot(query: Query, session_dir: Path, limit: int = 10_000) -> tuple[int, int, int, int]:
42+
@dataclass
class UniprotSearchResult:
    """Result of a UniProt search.

    A plain record of counts, one per kind of item found by the search.

    Parameters:
        nr_uniprot_accessions: Number of UniProt accessions found.
        nr_pdbs: Number of PDB structures found.
        nr_prot2pdb: Number of UniProt to PDB mappings found.
        nr_afs: Number of AlphaFold structures found.
        nr_interaction_partners: Number of interaction partners found.
    """

    nr_uniprot_accessions: int
    nr_pdbs: int
    nr_prot2pdb: int
    nr_afs: int
    nr_interaction_partners: int
59+
60+
61+
def search_structures_in_uniprot(query: UniprotQuery, session_dir: Path, limit: int = 10_000) -> UniprotSearchResult:
4162
"""Searches for protein structures in UniProt database.
4263
4364
Args:
@@ -51,8 +72,12 @@ def search_structures_in_uniprot(query: Query, session_dir: Path, limit: int = 1
5172
and the number of AlphaFold structures found.
5273
"""
5374
session_dir.mkdir(parents=True, exist_ok=True)
54-
5575
uniprot_accessions = search4uniprot(query, limit)
76+
logger.debug(uniprot_accessions)
77+
uniprot_accessions_of_partners = search_for_interaction_partners(query, limit)
78+
logger.debug(uniprot_accessions_of_partners)
79+
nr_interaction_partners = len(uniprot_accessions_of_partners)
80+
uniprot_accessions.update(uniprot_accessions_of_partners)
5681
pdbs = search4pdb(uniprot_accessions, limit=limit)
5782
af_result = search4af(uniprot_accessions, limit=limit)
5883

@@ -62,7 +87,13 @@ def search_structures_in_uniprot(query: Query, session_dir: Path, limit: int = 1
6287
nr_pdbs, nr_prot2pdb = save_pdbs(pdbs, con)
6388
nr_afs = save_alphafolds(af_result, con)
6489

65-
return len(uniprot_accessions), nr_pdbs, nr_prot2pdb, nr_afs
90+
return UniprotSearchResult(
91+
nr_uniprot_accessions=len(uniprot_accessions),
92+
nr_pdbs=nr_pdbs,
93+
nr_prot2pdb=nr_prot2pdb,
94+
nr_afs=nr_afs,
95+
nr_interaction_partners=nr_interaction_partners,
96+
)
6697

6798

6899
WhatRetrieve = Literal["pdbe", "alphafold"]

0 commit comments

Comments
 (0)