Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3931597
custom cif parsing
Ninjani Mar 27, 2025
e76ff65
fix: max val and feat: save custom systems
Ninjani Apr 8, 2025
1082b87
skip nones
Ninjani Apr 8, 2025
7d0f039
fix: allow custom scoring
Ninjani Apr 8, 2025
ae51823
chore: lint
Ninjani Apr 8, 2025
0746d00
chore: type
Ninjani Apr 8, 2025
70efa83
chore: type
OleinikovasV Apr 8, 2025
b01598b
rm unused moad,papyrus + bump bindingDB release
OleinikovasV Apr 9, 2025
d45289c
chore: update crystal mates detection (#108)
OleinikovasV Feb 19, 2026
e57ceed
chore: update license to GPL 2 for plip (#114)
tjduigna Dec 6, 2025
59a8d97
upgrade to numpy2 and python3.12 (#115)
OleinikovasV Feb 19, 2026
f607c6a
chore: ping docker build
OleinikovasV Feb 19, 2026
3aa9d84
chore: posebusters>=0.6.4
OleinikovasV Feb 20, 2026
57c31ff
chore: patch macOS clustering segfault w multi OMP
OleinikovasV Feb 20, 2026
cb73f61
chore: better support for macOS
OleinikovasV Feb 20, 2026
2bb5b7c
Merge branch 'main' into custom_cif
OleinikovasV Feb 20, 2026
1e31a44
chore: make requirements_data.txt work on macOS
OleinikovasV Feb 20, 2026
c21053c
bugfixes to end_to_end
OleinikovasV Feb 20, 2026
f253577
chore: lint
OleinikovasV Feb 20, 2026
5eec32a
chore: style lint
OleinikovasV Feb 20, 2026
c6bf7a2
chore: remove sdf v2000-v3000 patch
OleinikovasV Feb 20, 2026
a4b5265
refactor from_cif_file; add from_custom_cif_file docs
OleinikovasV Feb 20, 2026
8f18b5d
chore: reminder about entry_release_date patch
OleinikovasV Feb 20, 2026
aa53012
chore: reminder about entry_release_date patch-2
OleinikovasV Feb 20, 2026
49cdcdb
chore: cleanup test_annotations.py
OleinikovasV Feb 21, 2026
35fa365
chore: lint test_annotations
OleinikovasV Feb 21, 2026
67a5611
chore: bump python 3.10 -> 3.12
OleinikovasV Mar 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
create-args: python=3.10
create-args: python=3.12
init-shell: bash
cache-downloads: true
cache-environment: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.12"
- name: Configure docker
run: echo ${{ secrets.GITHUB_TOKEN }} | docker login ghcr.io -u ${{ github.repository_owner }} --password-stdin
- name: Install build and tag requirements
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ jobs:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.12"
- name: Install tox
run: pip install tox
- name: Run quality checks
run: tox -e py310-lint,py310-type
run: tox -e py312-lint,py312-type
- name: Directory Cache
uses: actions/cache@v4
with:
path: .tox
key: tox-${{ runner.os }}-3.10-${{ hashFiles('tox.ini') }}
key: tox-${{ runner.os }}-3.12-${{ hashFiles('tox.ini') }}
restore-keys: |
tox-${{ runner.os }}-3.10-
tox-${{ runner.os }}-3.12-

test:
name: Build and test docker image
Expand Down Expand Up @@ -101,7 +101,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
create-args: python=3.10
create-args: python=3.12
init-shell: bash
cache-downloads: true
cache-environment: true
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ channels:
- defaults
- bioconda
dependencies:
- python=3.10.*
- python=3.12.*
- reduce
- openstructure
- mmseqs2
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"omegaconf",
"mmcif",
"eval_type_backport",
"posebusters",
"posebusters>=0.6.4",
"duckdb",
"cloudpathlib",
"mols2grid",
Expand Down
3 changes: 2 additions & 1 deletion requirements_data.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
tabulate
pdb-validation @ git+https://git.scicore.unibas.ch/schwede/ligand-validation.git
mmpdb @ git+https://github.com/rdkit/mmpdb.git
https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840
torch @ https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840 ; sys_platform == "linux"
torch >= 2.5 ; sys_platform == "darwin"
2 changes: 1 addition & 1 deletion src/plinder/core/scores/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def query_index(
assert query is not None
df = sql(query).to_df()
# START patch-2
# TODO-2: remove this patch after entry_release_date is fixed
# TODO-2: rm this only once source data is regenerated!!
if "entry_release_date" in df.columns:
from importlib import resources

Expand Down
5 changes: 4 additions & 1 deletion src/plinder/core/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,11 @@ def download_pdb_chain_cif_file(pdb_id: str, chain_id: str, filename: Path) -> P
),
model=1,
use_author_fields=False,
include_bonds=True,
)
write_file = CIFFile()
set_structure(write_file, structure[structure.chain_id == chain_id])
set_structure(
write_file, structure[structure.chain_id == chain_id], include_bonds=True
)
write_file.write(filename.as_posix())
return filename
10 changes: 10 additions & 0 deletions src/plinder/data/clusters.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
# Copyright (c) 2024, Plinder Development Team
# Distributed under the terms of the Apache License 2.0
import os
import sys
from pathlib import Path
from time import time
from typing import Callable, TypeVar

if sys.platform == "darwin":
# For macOS only: allow multiple OpenMP runtimes to coexist
# (needed on macOS with conda)
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

import networkit as nk
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -66,6 +73,9 @@ def make_nk_communities(
tuple[list[tuple[int, str]], int]
"""
assert not directed
if sys.platform == "darwin":
# For macOS only: limit to 1 thread to avoid segfault in PLM with multiple OMP runtimes
nk.setNumberOfThreads(1)
communities = nk.community.detectCommunities(graph, nk.community.PLM(graph))
community_list = [
communities.getMembers(i) for i in range(communities.numberOfSubsets())
Expand Down
12 changes: 2 additions & 10 deletions src/plinder/data/pipeline/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def download_cofactors(
def download_affinity_data(
*,
data_dir: Path,
bindingdb_url: str = "https://www.bindingdb.org/bind/downloads/BindingDB_All_202401_tsv.zip",
bindingdb_url: str = "https://www.bindingdb.org/bind/downloads/BindingDB_All_202504_tsv.zip",
force_update: bool = False,
) -> Any:
"""
Expand All @@ -102,18 +102,10 @@ def download_affinity_data(
from zipfile import ZipFile

affinity_path = data_dir / "dbs" / "affinity" / "affinity.json"
papyrus_raw_affinity_path = (
data_dir / "dbs" / "affinity" / "papyrus_affinity_raw.tar.gz"
)
bindingdb_raw_affinity_path = (
data_dir / "dbs" / "affinity" / "BindingDB_All_202401.tsv"
)
moad_raw_affinity_path = data_dir / "dbs" / "affinity" / "moad_affinity.csv"
bindingdb_raw_affinity_path = data_dir / "dbs" / "affinity" / "BindingDB_All.tsv"

# Make sub directories
papyrus_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
bindingdb_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
moad_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
if not affinity_path.is_file() or force_update:
# Download BindingDB
if (
Expand Down
155 changes: 0 additions & 155 deletions src/plinder/data/pipeline/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,161 +170,6 @@ def calc_pchembl(affinity: float) -> Any:
return df.groupby("pdbid_ligid").median().reset_index()


def transform_papyrus_affinity_data(*, raw_affinity_path: Path) -> pd.DataFrame:
    """
    Load the zipped Papyrus affinity TSV and reduce it to one median
    pchembl value per PDB-id/ligand-id pair.

    Parameters
    ----------
    raw_affinity_path : Path
        location of the zipped Papyrus affinity TSV

    Returns
    -------
    transformed : pd.DataFrame
        columns ``pdbid_ligid`` and ``pchembl`` (median per pair)
    """
    columns_of_interest = [
        "accession",
        "Quality",
        "source",
        "pchembl_value_Median",
        "PDBID_ligand",
        "PDBID_protein",
    ]
    raw = pd.read_csv(raw_affinity_path, sep="\t", compression="zip")
    subset = raw[columns_of_interest].copy()
    subset = subset.rename(columns={"pchembl_value_Median": "pchembl"})
    # "PDBID_protein" packs several PDB ids separated by ";" -> one row each
    subset["PDBID_protein"] = subset["PDBID_protein"].map(lambda ids: ids.split(";"))
    subset = subset.explode("PDBID_protein")
    subset = subset[subset["pchembl"].notna()]
    subset["pdbid_ligid"] = (
        subset["PDBID_protein"].str.upper() + "_" + subset["PDBID_ligand"]
    )
    medians = subset[["pdbid_ligid", "pchembl"]].groupby("pdbid_ligid").median()
    return medians.reset_index()


def transform_moad_affinity_data(*, raw_affinity_path: Path) -> pd.DataFrame:
    """
    Parse the raw MOAD affinity CSV and reduce it to one median
    pchembl value per PDB-id/ligand-id pair.

    The raw file interleaves header-style rows with per-entry rows:
    enzyme class, PDB id and the family-representative flag carry over
    to subsequent rows until replaced by a new header row.

    Parameters
    ----------
    raw_affinity_path : Path
        location of the raw MOAD affinity CSV

    Returns
    -------
    transformed : pd.DataFrame
        columns ``pdbid_ligid`` and ``pchembl`` (median per pair)
    """
    # Multiplicative factor converting each supported unit to molar.
    unit_to_molar = {
        "fM": 10**-15,
        "pM": 10**-12,
        "nM": 10**-9,
        "uM": 10**-6,
        "mM": 10**-3,
    }

    def calc_pchembl(affinity: float, unit: str) -> Any:
        # NOTE(review): molar ("M") entries are returned unchanged rather
        # than converted with -log10; preserved from the original — confirm.
        if unit == "M":
            return affinity
        factor = unit_to_molar.get(unit)
        if factor is None:
            # Unknown unit: the original implicitly returned None here.
            return None
        molar = affinity * factor
        if molar > 0:
            return -1.0 * np.log10(molar)
        return np.nan

    combined_list = []
    with open(raw_affinity_path) as f:
        for line in f:
            line_split = line.split(",")
            tmp_enzyme_class = line_split[0]
            tmp_pdbid = line_split[2]
            # Enzyme-class rows (x.x.x.x) set the class for the rows below.
            if len(tmp_enzyme_class.split(".")) == 4:
                new_enzyme_class = tmp_enzyme_class
            # PDB-id rows also flag whether this entry represents its family.
            if len(tmp_pdbid) > 0:
                family_representative = "Family" in line_split[1]
                new_pdbid = tmp_pdbid
            # Keep rows with a binder; skip association constants (Ka).
            if (line_split[3] != "") & (line_split[5] != "Ka"):
                combined_list.append(
                    [
                        new_enzyme_class,
                        family_representative,
                        new_pdbid,
                        line_split[3],
                        line_split[4],
                        line_split[7],
                        line_split[8],
                        line_split[9],
                    ]
                )
    moad_df = pd.DataFrame(
        combined_list,
        columns=[
            "ec_no.",
            "ec_family_rep",
            "pdbid",
            "binder_and_chain",
            "valid_ligand",
            "affinity",
            "unit",
            "smiles",
        ],
    )
    moad_df["pdbid"] = moad_df["pdbid"].str.lower()

    # "binder_and_chain" looks like "LIG:A"; keep the ligand code(s) only,
    # one exploded row per code.
    moad_df["binder_id"] = moad_df["binder_and_chain"].apply(lambda x: x.split(":")[0])
    moad_df["binder_id"] = moad_df["binder_id"].apply(lambda x: x.split())
    moad_df = moad_df.explode("binder_id")
    moad_df["pdbid_ligid"] = moad_df["pdbid"].str.upper() + "_" + moad_df["binder_id"]
    # Label-based access (was positional x[0]/x[1], deprecated in pandas >= 2.1);
    # entries with no affinity value become NaN.
    moad_df["pchembl"] = moad_df[["affinity", "unit"]].apply(
        lambda row: calc_pchembl(float(row["affinity"]), row["unit"])
        if row["affinity"] != ""
        else np.nan,
        axis=1,
    )
    return (
        moad_df[["pdbid_ligid", "pchembl"]]
        .groupby("pdbid_ligid")
        .median()
        .reset_index()
    )


def transform_components_data(*, raw_components_path: Path) -> pd.DataFrame:
import gemmi

Expand Down
4 changes: 4 additions & 0 deletions src/plinder/data/pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ def load_entries_from_zips(
two_char_codes: Optional[list[str]] = None,
pdb_ids: Optional[list[str]] = None,
load_for_scoring: bool = False,
max_protein_chains: int = 5,
max_ligand_chains: int = 5,
) -> Dict[str, "Entry"]:
"""
Load entries from the qc zips into a dict
Expand Down Expand Up @@ -151,6 +153,8 @@ def load_entries_from_zips(
pdb_id = name.replace(".json", "")
reduced[pdb_id] = Entry.model_validate_json(obj.read()).prune(
load_for_scoring=load_for_scoring,
max_protein_chains=max_protein_chains,
max_ligand_chains=max_ligand_chains,
)
except Exception as e:
LOG.error(f"failed to read name={name} failed with {repr(e)}")
Expand Down
Loading
Loading