Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3931597
custom cif parsing
Ninjani Mar 27, 2025
e76ff65
fix: max val and feat: save custom systems
Ninjani Apr 8, 2025
1082b87
skip nones
Ninjani Apr 8, 2025
7d0f039
fix: allow custom scoring
Ninjani Apr 8, 2025
ae51823
chore: lint
Ninjani Apr 8, 2025
0746d00
chore: type
Ninjani Apr 8, 2025
70efa83
chore: type
OleinikovasV Apr 8, 2025
b01598b
rm unused moad,papyrus + bump bindingDB release
OleinikovasV Apr 9, 2025
d45289c
chore: update crystal mates detection (#108)
OleinikovasV Feb 19, 2026
e57ceed
chore: update license to GPL 2 for plip (#114)
tjduigna Dec 6, 2025
59a8d97
upgrade to numpy2 and python3.12 (#115)
OleinikovasV Feb 19, 2026
f607c6a
chore: ping docker build
OleinikovasV Feb 19, 2026
3aa9d84
chore: posebusters>=0.6.4
OleinikovasV Feb 20, 2026
57c31ff
chore: patch macOS clustering segfault w multi OMP
OleinikovasV Feb 20, 2026
cb73f61
chore: better support for macOS
OleinikovasV Feb 20, 2026
2bb5b7c
Merge branch 'main' into custom_cif
OleinikovasV Feb 20, 2026
1e31a44
chore: make requirements_data.txt work on macOS
OleinikovasV Feb 20, 2026
c21053c
bugfixes to end_to_end
OleinikovasV Feb 20, 2026
f253577
chore: lint
OleinikovasV Feb 20, 2026
5eec32a
chore: style lint
OleinikovasV Feb 20, 2026
c6bf7a2
chore: remove sdf v2000-v3000 patch
OleinikovasV Feb 20, 2026
a4b5265
refactor from_cif_file; add from_custom_cif_file docs
OleinikovasV Feb 20, 2026
8f18b5d
chore: reminder about entry_release_date patch
OleinikovasV Feb 20, 2026
aa53012
chore: reminder about entry_release_date patch-2
OleinikovasV Feb 20, 2026
49cdcdb
chore: cleanup test_annotations.py
OleinikovasV Feb 21, 2026
35fa365
chore: lint test_annotations
OleinikovasV Feb 21, 2026
67a5611
chore: bump python 3.10 -> 3.12
OleinikovasV Mar 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
create-args: python=3.10
create-args: python=3.12
init-shell: bash
cache-downloads: true
cache-environment: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.12"
- name: Configure docker
run: echo ${{ secrets.GITHUB_TOKEN }} | docker login ghcr.io -u ${{ github.repository_owner }} --password-stdin
- name: Install build and tag requirements
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ jobs:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.12"
- name: Install tox
run: pip install tox
- name: Run quality checks
run: tox -e py310-lint,py310-type
run: tox -e py312-lint,py312-type
- name: Directory Cache
uses: actions/cache@v4
with:
path: .tox
key: tox-${{ runner.os }}-3.10-${{ hashFiles('tox.ini') }}
key: tox-${{ runner.os }}-3.12-${{ hashFiles('tox.ini') }}
restore-keys: |
tox-${{ runner.os }}-3.10-
tox-${{ runner.os }}-3.12-

test:
name: Build and test docker image
Expand Down Expand Up @@ -101,7 +101,7 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
create-args: python=3.10
create-args: python=3.12
init-shell: bash
cache-downloads: true
cache-environment: true
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ channels:
- defaults
- bioconda
dependencies:
- python=3.10.*
- python=3.12.*
- reduce
- openstructure
- mmseqs2
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"omegaconf",
"mmcif",
"eval_type_backport",
"posebusters",
"posebusters>=0.6.4",
"duckdb",
"cloudpathlib",
"mols2grid",
Expand Down
3 changes: 2 additions & 1 deletion requirements_data.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
tabulate
pdb-validation @ git+https://git.scicore.unibas.ch/schwede/ligand-validation.git
mmpdb @ git+https://github.com/rdkit/mmpdb.git
https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840
torch @ https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840 ; sys_platform == "linux"
torch >= 2.5 ; sys_platform == "darwin"
2 changes: 1 addition & 1 deletion src/plinder/core/scores/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def query_index(
assert query is not None
df = sql(query).to_df()
# START patch-2
# TODO-2: remove this patch after entry_release_date is fixed
# TODO-2: rm this only once source data is regenerated!!
if "entry_release_date" in df.columns:
from importlib import resources

Expand Down
5 changes: 4 additions & 1 deletion src/plinder/core/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,11 @@ def download_pdb_chain_cif_file(pdb_id: str, chain_id: str, filename: Path) -> P
),
model=1,
use_author_fields=False,
include_bonds=True,
)
write_file = CIFFile()
set_structure(write_file, structure[structure.chain_id == chain_id])
set_structure(
write_file, structure[structure.chain_id == chain_id], include_bonds=True
)
write_file.write(filename.as_posix())
return filename
10 changes: 10 additions & 0 deletions src/plinder/data/clusters.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
# Copyright (c) 2024, Plinder Development Team
# Distributed under the terms of the Apache License 2.0
import os
import sys
from pathlib import Path
from time import time
from typing import Callable, TypeVar

if sys.platform == "darwin":
# For macOS only: allow multiple OpenMP runtimes to coexist
# (needed on macOS with conda)
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

import networkit as nk
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -66,6 +73,9 @@ def make_nk_communities(
tuple[list[tuple[int, str]], int]
"""
assert not directed
if sys.platform == "darwin":
# For macOS only: limit to 1 thread to avoid segfault in PLM with multiple OMP runtimes
nk.setNumberOfThreads(1)
communities = nk.community.detectCommunities(graph, nk.community.PLM(graph))
community_list = [
communities.getMembers(i) for i in range(communities.numberOfSubsets())
Expand Down
12 changes: 2 additions & 10 deletions src/plinder/data/pipeline/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def download_cofactors(
def download_affinity_data(
*,
data_dir: Path,
bindingdb_url: str = "https://www.bindingdb.org/bind/downloads/BindingDB_All_202401_tsv.zip",
bindingdb_url: str = "https://www.bindingdb.org/bind/downloads/BindingDB_All_202504_tsv.zip",
force_update: bool = False,
) -> Any:
"""
Expand All @@ -102,18 +102,10 @@ def download_affinity_data(
from zipfile import ZipFile

affinity_path = data_dir / "dbs" / "affinity" / "affinity.json"
papyrus_raw_affinity_path = (
data_dir / "dbs" / "affinity" / "papyrus_affinity_raw.tar.gz"
)
bindingdb_raw_affinity_path = (
data_dir / "dbs" / "affinity" / "BindingDB_All_202401.tsv"
)
moad_raw_affinity_path = data_dir / "dbs" / "affinity" / "moad_affinity.csv"
bindingdb_raw_affinity_path = data_dir / "dbs" / "affinity" / "BindingDB_All.tsv"

# Make sub directories
papyrus_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
bindingdb_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
moad_raw_affinity_path.parent.mkdir(parents=True, exist_ok=True)
if not affinity_path.is_file() or force_update:
# Download BindingDB
if (
Expand Down
155 changes: 0 additions & 155 deletions src/plinder/data/pipeline/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,161 +170,6 @@ def calc_pchembl(affinity: float) -> Any:
return df.groupby("pdbid_ligid").median().reset_index()


def transform_papyrus_affinity_data(*, raw_affinity_path: Path) -> pd.DataFrame:
    """
    Load the zipped Papyrus affinity TSV and reduce it to one median
    pchembl value per PDB-id/ligand-id pair.

    Parameters
    ----------
    raw_affinity_path : Path
        location of the zipped Papyrus affinity TSV

    Returns
    -------
    transformed : pd.DataFrame
        columns ``pdbid_ligid`` and ``pchembl`` (median per pair)
    """
    columns_of_interest = [
        "accession",
        "Quality",
        "source",
        "pchembl_value_Median",
        "PDBID_ligand",
        "PDBID_protein",
    ]
    raw = pd.read_csv(raw_affinity_path, sep="\t", compression="zip")
    subset = raw[columns_of_interest].copy()
    subset = subset.rename(columns={"pchembl_value_Median": "pchembl"})
    # "PDBID_protein" packs several PDB ids separated by ";" -> one row each
    subset["PDBID_protein"] = subset["PDBID_protein"].map(lambda ids: ids.split(";"))
    subset = subset.explode("PDBID_protein")
    subset = subset[subset["pchembl"].notna()]
    subset["pdbid_ligid"] = (
        subset["PDBID_protein"].str.upper() + "_" + subset["PDBID_ligand"]
    )
    medians = subset[["pdbid_ligid", "pchembl"]].groupby("pdbid_ligid").median()
    return medians.reset_index()


def transform_moad_affinity_data(*, raw_affinity_path: Path) -> pd.DataFrame:
    """
    Parse the raw MOAD affinity CSV and reduce it to one median
    pchembl value per PDB-id/ligand-id pair.

    The raw file interleaves header-style rows with per-entry rows:
    enzyme class, PDB id and the family-representative flag carry over
    to subsequent rows until replaced by a new header row.

    Parameters
    ----------
    raw_affinity_path : Path
        location of the raw MOAD affinity CSV

    Returns
    -------
    transformed : pd.DataFrame
        columns ``pdbid_ligid`` and ``pchembl`` (median per pair)
    """
    # Multiplicative factor converting each supported unit to molar.
    unit_to_molar = {
        "fM": 10**-15,
        "pM": 10**-12,
        "nM": 10**-9,
        "uM": 10**-6,
        "mM": 10**-3,
    }

    def calc_pchembl(affinity: float, unit: str) -> Any:
        # NOTE(review): molar ("M") entries are returned unchanged rather
        # than converted with -log10; preserved from the original — confirm.
        if unit == "M":
            return affinity
        factor = unit_to_molar.get(unit)
        if factor is None:
            # Unknown unit: the original implicitly returned None here.
            return None
        molar = affinity * factor
        if molar > 0:
            return -1.0 * np.log10(molar)
        return np.nan

    combined_list = []
    with open(raw_affinity_path) as f:
        for line in f:
            line_split = line.split(",")
            tmp_enzyme_class = line_split[0]
            tmp_pdbid = line_split[2]
            # Enzyme-class rows (x.x.x.x) set the class for the rows below.
            if len(tmp_enzyme_class.split(".")) == 4:
                new_enzyme_class = tmp_enzyme_class
            # PDB-id rows also flag whether this entry represents its family.
            if len(tmp_pdbid) > 0:
                family_representative = "Family" in line_split[1]
                new_pdbid = tmp_pdbid
            # Keep rows with a binder; skip association constants (Ka).
            if (line_split[3] != "") & (line_split[5] != "Ka"):
                combined_list.append(
                    [
                        new_enzyme_class,
                        family_representative,
                        new_pdbid,
                        line_split[3],
                        line_split[4],
                        line_split[7],
                        line_split[8],
                        line_split[9],
                    ]
                )
    moad_df = pd.DataFrame(
        combined_list,
        columns=[
            "ec_no.",
            "ec_family_rep",
            "pdbid",
            "binder_and_chain",
            "valid_ligand",
            "affinity",
            "unit",
            "smiles",
        ],
    )
    moad_df["pdbid"] = moad_df["pdbid"].str.lower()

    # "binder_and_chain" looks like "LIG:A"; keep the ligand code(s) only,
    # one exploded row per code.
    moad_df["binder_id"] = moad_df["binder_and_chain"].apply(lambda x: x.split(":")[0])
    moad_df["binder_id"] = moad_df["binder_id"].apply(lambda x: x.split())
    moad_df = moad_df.explode("binder_id")
    moad_df["pdbid_ligid"] = moad_df["pdbid"].str.upper() + "_" + moad_df["binder_id"]
    # Label-based access (was positional x[0]/x[1], deprecated in pandas >= 2.1);
    # entries with no affinity value become NaN.
    moad_df["pchembl"] = moad_df[["affinity", "unit"]].apply(
        lambda row: calc_pchembl(float(row["affinity"]), row["unit"])
        if row["affinity"] != ""
        else np.nan,
        axis=1,
    )
    return (
        moad_df[["pdbid_ligid", "pchembl"]]
        .groupby("pdbid_ligid")
        .median()
        .reset_index()
    )


def transform_components_data(*, raw_components_path: Path) -> pd.DataFrame:
import gemmi

Expand Down
4 changes: 4 additions & 0 deletions src/plinder/data/pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ def load_entries_from_zips(
two_char_codes: Optional[list[str]] = None,
pdb_ids: Optional[list[str]] = None,
load_for_scoring: bool = False,
max_protein_chains: int = 5,
max_ligand_chains: int = 5,
) -> Dict[str, "Entry"]:
"""
Load entries from the qc zips into a dict
Expand Down Expand Up @@ -151,6 +153,8 @@ def load_entries_from_zips(
pdb_id = name.replace(".json", "")
reduced[pdb_id] = Entry.model_validate_json(obj.read()).prune(
load_for_scoring=load_for_scoring,
max_protein_chains=max_protein_chains,
max_ligand_chains=max_ligand_chains,
)
except Exception as e:
LOG.error(f"failed to read name={name} failed with {repr(e)}")
Expand Down
Loading
Loading