lucidrains
diff --git a/alphafold3_pytorch/data/mmcif_parsing.py b/alphafold3_pytorch/data/mmcif_parsing.py
Lines changed: 9 additions & 3 deletions
diff --git a/alphafold3_pytorch/data/msa_parsing.py b/alphafold3_pytorch/data/msa_parsing.py
Lines changed: 18 additions & 0 deletions
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
Lines changed: 21 additions & 12 deletions
diff --git a/data/pdb_data/data_caches/msa/train_msas/209d-assembly1C_protein.a3m b/data/pdb_data/data_caches/msa/train_msas/209d-assembly1C_protein.a3m
Lines changed: 0 additions & 4 deletions
diff --git a/data/pdb_data/data_caches/msa/train_msas/209d-assembly1C_protein.a3m.gz b/data/pdb_data/data_caches/msa/train_msas/209d-assembly1C_protein.a3m.gz
63 Bytes
@@ -5,6 +5,7 @@
 import io
 import itertools
 import logging
+import gzip
 from collections import defaultdict
 from operator import itemgetter
 from beartype.typing import Any, Mapping, Optional, Sequence, Set, Tuple
@@ -14,7 +15,7 @@
 from Bio.Data import PDBData
 
 from alphafold3_pytorch.utils.data_utils import is_polymer, is_water, matrix_rotate
-
+from alphafold3_pytorch.data.msa_parsing import is_gzip_file
 # Type aliases:
 ChainId = str
 PdbHeader = Mapping[str, Any]
@@ -763,8 +764,13 @@ def parse_mmcif_object(
     filepath: str, file_id: str, auth_chains: bool = True, auth_residues: bool = True
 ) -> MmcifObject:
     """Parse an mmCIF file into an `MmcifObject` containing a BioPython `Structure` object as well as associated metadata."""
-    with open(filepath, "r") as f:
-        mmcif_string = f.read()
+
+    if is_gzip_file(filepath):
+        with gzip.open(filepath, "r") as f:
+            mmcif_string = f.read()
+    else:
+        with open(filepath, "r") as f:
+            mmcif_string = f.read()
 
     parsing_result = parse(
         file_id=file_id,
 
@@ -6,6 +6,7 @@
 import random
 import re
 import string
+import binascii
 
 import hashlib
 from cachetools import cached, LRUCache
@@ -273,3 +274,20 @@ def parse_a3m(a3m_string: str, msa_type: MSA_TYPE) -> Msa:
         descriptions=descriptions,
         msa_type=msa_type,
     )
+
+@typecheck
+def is_gzip_file(f: str) -> bool:
+    """Report whether the file at path `f` begins with the gzip magic number.
+
+    Only the first two bytes are read and compared against `1f 8b`; nothing is decompressed.
+    Adapted from PhiSpy: https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py
+
+    Args:
+        f (str): Path of the file to inspect (e.g. an a3m MSA file).
+
+    Returns:
+        bool: True if the leading two bytes are the gzip signature, otherwise False.
+    """
+    with open(f, "rb") as handle:
+        leading_bytes = handle.read(2)
+    return binascii.hexlify(leading_bytes) == b"1f8b"
@@ -4,6 +4,7 @@
 import glob
 import json
 import os
+import gzip
 import random
 import statistics
 import traceback
@@ -2203,9 +2204,9 @@ def __post_init__(self):
         if exists(self.mmcif_filepath):
             if not os.path.exists(self.mmcif_filepath):
                 raise FileNotFoundError(f"mmCIF file not found: {self.mmcif_filepath}.")
-            if not self.mmcif_filepath.endswith(".cif"):
+            if not (self.mmcif_filepath.endswith(".cif") or self.mmcif_filepath.endswith(".cif.gz")):
                 raise ValueError(
-                    f"mmCIF file `{self.mmcif_filepath}` must have a `.cif` file extension."
+                    f"mmCIF file `{self.mmcif_filepath}` must have a `.cif` or `.cif.gz` file extension."
                 )
         elif not exists(self.biomol):
             raise ValueError("Either an mmCIF file or a `Biomolecule` object must be provided.")
@@ -2825,9 +2826,9 @@ def load_msa_from_msa_dir(
             msa_fpath_pattern = ""
             if exists(msa_dir):
                 msa_fpath_pattern = (
-                    os.path.join(msa_dir, f"{pdb_id.split('-assembly1')[0]}_*", "a3m", "*.a3m")
+                    os.path.join(msa_dir, f"{pdb_id.split('-assembly1')[0]}_*", "a3m*")
                     if distillation
-                    else os.path.join(msa_dir, f"{file_id}{chain_id}_*.a3m")
+                    else os.path.join(msa_dir, f"{file_id}{chain_id}_*.a3m*")
                 )
                 msa_fpaths = glob.glob(msa_fpath_pattern)
 
@@ -2844,11 +2845,19 @@ def load_msa_from_msa_dir(
                 # into the MSAs as unknown amino acid residues.
                 chain_msas = []
                 for msa_fpath in msa_fpaths:
-                    with open(msa_fpath, "r") as f:
-                        msa = f.read()
-                        msa = msa_parsing.parse_a3m(msa, chain_msa_type)
-                        if len(chain_sequence) == len(msa.sequences[0]):
-                            chain_msas.append(msa)
+                    if msa_parsing.is_gzip_file(msa_fpath): 
+                        with gzip.open(msa_fpath, "r") as f:
+                            msa = f.read()
+                            msa = msa_parsing.parse_a3m(msa, chain_msa_type)
+                            if len(chain_sequence) == len(msa.sequences[0]):
+                                chain_msas.append(msa)
+                    else:
+                        with open(msa_fpath, "r") as f:
+                            msa = f.read()
+                            msa = msa_parsing.parse_a3m(msa, chain_msa_type)
+                            if len(chain_sequence) == len(msa.sequences[0]):
+                                chain_msas.append(msa)
+
 
                 if not chain_msas:
                     raise ValueError(
@@ -4304,13 +4313,13 @@ def __init__(
             sampler_pdb_ids = set(self.sampler.mappings.get_column("pdb_id").to_list())
             self.files = {
                 os.path.splitext(os.path.basename(filepath.name))[0]: filepath
-                for filepath in folder.glob(os.path.join("**", "*.cif"))
+                for filepath in folder.glob(os.path.join("**", "*.cif*"))
                 if os.path.splitext(os.path.basename(filepath.name))[0] in sampler_pdb_ids
             }
         else:
             self.files = {
                 os.path.splitext(os.path.basename(file.name))[0]: file
-                for file in folder.glob(os.path.join("**", "*.cif"))
+                for file in folder.glob(os.path.join("**", "*.cif*"))
             }
 
         if exists(filter_out_pdb_ids):
@@ -4484,7 +4493,7 @@ def __init__(
 
         self.files = {
             os.path.splitext(os.path.basename(file.name))[0]: file
-            for file in folder.glob(os.path.join("**", "*.cif"))
+            for file in folder.glob(os.path.join("**", "*.cif*"))
             if os.path.splitext(os.path.basename(file.name))[0].split("-")[1]
             in self.uniprot_to_pdb_id_mapping
         }