|
4 | 4 | Authors: |
5 | 5 | Thomas A. Hopf |
6 | 6 | """ |
7 | | - |
| 7 | +import io |
8 | 8 | from collections import OrderedDict |
9 | 9 | from collections.abc import Iterable |
10 | 10 | import gzip |
11 | | -from io import BytesIO |
| 11 | +from io import BytesIO, TextIOWrapper |
12 | 12 | from os import path |
13 | 13 | from urllib.error import HTTPError |
14 | 14 |
|
|
20 | 20 | from Bio.PDB.MMCIF2Dict import MMCIF2Dict |
21 | 21 |
|
22 | 22 | from evcouplings.utils.config import InvalidParameterError |
23 | | -from evcouplings.utils.constants import AA3_to_AA1 |
| 23 | +from evcouplings.utils.constants import AA3_to_AA1, AA3_to_AA1_FULL |
24 | 24 | from evcouplings.utils.helpers import DefaultOrderedDict |
25 | 25 | from evcouplings.utils.system import ( |
26 | 26 | valid_file, ResourceError, tempdir |
27 | 27 | ) |
28 | 28 |
|
29 | 29 | PDB_BCIF_DOWNLOAD_URL = "https://models.rcsb.org/{pdb_id}.bcif.gz" |
| 30 | +PDB_CIF_DOWNLOAD_URL = "https://files.rcsb.org/download/{pdb_id}.cif.gz" |
30 | 31 |
|
31 | 32 |
|
32 | 33 | # Mapping from MMTF secondary structure codes to DSSP symbols |
@@ -676,7 +677,7 @@ def from_file(cls, filename, keep_full_data=False): |
676 | 677 | if binary: |
677 | 678 | mode = "rb" |
678 | 679 | else: |
679 | | - mode = "r" |
| 680 | + mode = "rt" |
680 | 681 |
|
681 | 682 | with openfunc(filename, mode=mode) as f: |
682 | 683 | return cls(f, binary=binary, keep_full_data=keep_full_data) |
@@ -704,22 +705,36 @@ def from_id(cls, pdb_id, keep_full_data=False): |
704 | 705 | """ |
705 | 706 | # TODO: add proper retry logic and timeouts |
706 | 707 | # TODO: add better exception handling |
| 708 | + |
| 709 | + # structures are fetched as text mmCIF; to switch back to bCIF, use PDB_BCIF_DOWNLOAD_URL in the request below and the commented-out binary reader further down |
707 | 710 | try: |
708 | 711 | r = requests.get( |
709 | | - PDB_BCIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower()) |
| 712 | + PDB_CIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower()) |
710 | 713 | ) |
711 | 714 | except requests.exceptions.RequestException as e: |
712 | 715 | raise ResourceError( |
713 | | - "Error fetching bCIF data for {}".format(pdb_id) |
| 716 | + "Error fetching CIF data for {}".format(pdb_id) |
714 | 717 | ) from e |
715 | 718 |
|
716 | 719 | if not r.ok: |
717 | 720 | raise ResourceError( |
718 | 721 | "Did not receive valid response fetching {}".format(pdb_id) |
719 | 722 | ) |
720 | 723 |
|
721 | | - with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: |
722 | | - return cls(f, keep_full_data=keep_full_data) |
| 724 | + # bCIF: |
| 725 | + # with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: |
| 726 | + # return cls(f, binary=True, keep_full_data=keep_full_data) |
| 727 | + |
| 728 | + # following gzip.open() from https://github.com/python/cpython/blob/3.13/Lib/gzip.py |
| 729 | + with TextIOWrapper( |
| 730 | + gzip.GzipFile(fileobj=BytesIO(r.content), mode="r"), # noqa |
| 731 | + encoding="utf-8", |
| 732 | + errors=None, |
| 733 | + newline=None, |
| 734 | + ) as f: |
| 735 | + return cls( |
| 736 | + f, binary=False, keep_full_data=keep_full_data |
| 737 | + ) |
723 | 738 |
|
724 | 739 | def get_chain(self, chain, model=0, is_author_id=True): |
725 | 740 | """ |
@@ -773,7 +788,7 @@ def get_chain(self, chain, model=0, is_author_id=True): |
773 | 788 | # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) |
774 | 789 | coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code, |
775 | 790 | seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA), |
776 | | - one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"), |
| 791 | + one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1_FULL, na_action="ignore"), |
777 | 792 | # note that MSE will now be labeled as HETATM, which was not the case with MMTF |
778 | 793 | hetatm=lambda df: df.record_type == "HETATM", |
779 | 794 | ).reset_index( |
|
0 commit comments