add an lru cache onto parsing species id from msa description

lucidrains · lucidrains · commit ace4968cde44 · 2024-09-24T17:28:10.000-07:00
diff --git a/alphafold3_pytorch/data/msa_parsing.py b/alphafold3_pytorch/data/msa_parsing.py
@@ -7,6 +7,9 @@
 import re
 import string
 
+import hashlib
+from cachetools import cached, LRUCache
+
 from beartype.typing import Literal, Optional, Sequence, Tuple
 
 from alphafold3_pytorch.tensor_typing import typecheck
@@ -117,7 +120,11 @@ def _extract_sequence_identifier(description: str) -> Optional[str]:
     else:
         return None
 
+def _get_identifiers_make_key(description, tab_separated_alignment_headers = False):  # cache key for get_identifiers; the default mirrors get_identifiers so calls that omit the flag can still be keyed
+    md5_digest = hashlib.md5(description.encode()).hexdigest()  # fixed-length digest, so the key size does not grow with the description
+    return f'{md5_digest}:{tab_separated_alignment_headers}'  # the flag is part of the key, so each flag value is cached separately
 
+@cached(cache = LRUCache(maxsize = 512), key = _get_identifiers_make_key)
 @typecheck
 def get_identifiers(
     description: str, tab_separated_alignment_headers: bool = False
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "awscliv2>=2.3.1",
     "beartype",
     "biopython>=1.83",
+    "cachetools",
     "click>=8.1",
     "CoLT5-attention>=0.11.0",
     "einops>=0.8.0",