@@ -347,7 +347,6 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
347347
348348 Raises:
349349 RuntimeError: If no sunids are selected.
350- AssertionError: If the input data is insufficient for encoding or validation fails.
351350 """
352351 print (f"Process graph" )
353352
@@ -404,30 +403,27 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
404403 df_encoded ["pdb_id" ] = df_encoded ["sid" ].str [1 :5 ]
405404 df_encoded ["chain_id" ] = df_encoded ["sid" ].str [5 ]
406405
406+ # "_" (underscore) means it has no chain
407+ df_encoded = df_encoded [df_encoded ["chain_id" ] != "_" ]
408+
407409 pdb_chain_df = self ._parse_pdb_sequence_file ()
408410
409- # Handle ` chain_id == "_"` Case**
410- # Split df_encoded into two: One for specific chains, one for "all chains" ("_ ")
411- df_specific_chains = df_encoded [df_encoded ["chain_id" ] != "_ " ]
412- df_all_chains = df_encoded [df_encoded ["chain_id" ] == "_ " ].drop (
411+ # Handle chain_id == "." - Multiple chain case
412+ # Split df_encoded into two: One for specific chains, one for "multiple chains" (". ")
413+ df_specific_chains = df_encoded [df_encoded ["chain_id" ] != ". " ]
414+ df_multiple_chains = df_encoded [df_encoded ["chain_id" ] == ". " ].drop (
413415 columns = ["chain_id" ]
414416 )
415417
416- common_pdb_ids = set (df_specific_chains ["pdb_id" ]) & set (
417- df_all_chains ["pdb_id" ]
418- )
419- if common_pdb_ids :
420- raise RuntimeError (
421- f"{ len (common_pdb_ids )} PDB chain IDs found in specific-chains df and all-chains df"
422- )
423-
424418 # Merge specific chains normally
425419 merged_specific = df_specific_chains .merge (
426420 pdb_chain_df , on = ["pdb_id" , "chain_id" ], how = "left"
427421 )
428422
429423 # Merge all chains case -> Join by pdb_id (not chain_id)
430- merged_all_chains = df_all_chains .merge (pdb_chain_df , on = "pdb_id" , how = "left" )
424+ merged_all_chains = df_multiple_chains .merge (
425+ pdb_chain_df , on = "pdb_id" , how = "left"
426+ )
431427
432428 # Combine both cases
433429 sequence_hierarchy_df = pd .concat (
0 commit comments