Skip to content

Commit 13b8795

Browse files
committed
scope: fix multiple chain filtering
1 parent aad16d9 commit 13b8795

File tree

1 file changed

+10
-14
lines changed
  • chebai/preprocessing/datasets/scope

1 file changed

+10
-14
lines changed

chebai/preprocessing/datasets/scope/scope.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,6 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
347347
348348
Raises:
349349
RuntimeError: If no sunids are selected.
350-
AssertionError: If the input data is insufficient for encoding or validation fails.
351350
"""
352351
print(f"Process graph")
353352

@@ -404,30 +403,27 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
404403
df_encoded["pdb_id"] = df_encoded["sid"].str[1:5]
405404
df_encoded["chain_id"] = df_encoded["sid"].str[5]
406405

406+
# "_" (underscore) means it has no chain
407+
df_encoded = df_encoded[df_encoded["chain_id"] != "_"]
408+
407409
pdb_chain_df = self._parse_pdb_sequence_file()
408410

409-
# Handle `chain_id == "_"` Case**
410-
# Split df_encoded into two: One for specific chains, one for "all chains" ("_")
411-
df_specific_chains = df_encoded[df_encoded["chain_id"] != "_"]
412-
df_all_chains = df_encoded[df_encoded["chain_id"] == "_"].drop(
411+
# Handle chain_id == "." - Multiple chain case
412+
# Split df_encoded into two: One for specific chains, one for "multiple chains" (".")
413+
df_specific_chains = df_encoded[df_encoded["chain_id"] != "."]
414+
df_multiple_chains = df_encoded[df_encoded["chain_id"] == "."].drop(
413415
columns=["chain_id"]
414416
)
415417

416-
common_pdb_ids = set(df_specific_chains["pdb_id"]) & set(
417-
df_all_chains["pdb_id"]
418-
)
419-
if common_pdb_ids:
420-
raise RuntimeError(
421-
f"{len(common_pdb_ids)} PDB chain IDs found in specific-chains df and all-chains df"
422-
)
423-
424418
# Merge specific chains normally
425419
merged_specific = df_specific_chains.merge(
426420
pdb_chain_df, on=["pdb_id", "chain_id"], how="left"
427421
)
428422

429423
# Merge multiple-chains case (chain_id == ".") -> join by pdb_id (not chain_id)
430-
merged_all_chains = df_all_chains.merge(pdb_chain_df, on="pdb_id", how="left")
424+
merged_all_chains = df_multiple_chains.merge(
425+
pdb_chain_df, on="pdb_id", how="left"
426+
)
431427

432428
# Combine both cases
433429
sequence_hierarchy_df = pd.concat(

0 commit comments

Comments
 (0)