Skip to content

Commit 26347ca

Browse files
committed
Added utility to merge multiple chain data into single target
1 parent d162b88 commit 26347ca

File tree

1 file changed

+233
-0
lines changed

1 file changed

+233
-0
lines changed
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Add groups from one file to another (e.g., from metadata to bioassembly metadata).
4+
Combines all groups for chains in the bioassembly.
5+
6+
For example, if a bioassembly contains chains A, B, C with group_ids 1, 2, 3,
7+
the combined group_id would be "1;3;3" combined as "1;2;3" — values are joined with ";".
8+
9+
This script handles:
10+
- group_id
11+
- seq_group_id
12+
- mmseqs_* columns (any column matching the pattern)
13+
"""
14+
15+
import argparse
16+
import pandas as pd
17+
import re
18+
from pathlib import Path
19+
20+
21+
def merge_groupings(
22+
right_file: str,
23+
left_file: str,
24+
output_file: str,
25+
right_on: str = "pdb_id",
26+
left_on: str = "pdb_id",
27+
group_columns: list[str] = None,
28+
group_pattern: str = r"^(group_id|seq_group_id|mmseqs_.+)$",
29+
) -> None:
30+
"""
31+
Merge grouping columns from right file to left file.
32+
33+
For each entry in the left file, finds all matching chains in the right file
34+
and combines their group values.
35+
36+
Parameters:
37+
-----------
38+
right_file : str
39+
Path to the source CSV file with grouping information
40+
left_file : str
41+
Path to the target CSV file to add groupings to (e.g., bioassembly metadata)
42+
output_file : str
43+
Path to save the merged CSV file
44+
right_on : str
45+
Column name in right DataFrame to match on (default: 'pdb_id')
46+
left_on : str
47+
Column name in left DataFrame to match on (default: 'pdb_id')
48+
group_columns : list[str], optional
49+
Explicit list of group columns to merge. If None, uses group_pattern.
50+
group_pattern : str
51+
Regex pattern to match group columns (default matches group_id, seq_group_id, mmseqs_*)
52+
"""
53+
# Check if files exist
54+
if not Path(right_file).exists():
55+
raise FileNotFoundError(f"Right file not found: {right_file}")
56+
if not Path(left_file).exists():
57+
raise FileNotFoundError(f"Left file not found: {left_file}")
58+
59+
# Read CSV files
60+
print(f"Reading {right_file}...")
61+
df_right = pd.read_csv(right_file, keep_default_na=False)
62+
print(f" Shape: {df_right.shape}")
63+
print(f" Columns: {list(df_right.columns)}")
64+
65+
print(f"\nReading {left_file}...")
66+
df_left = pd.read_csv(left_file, keep_default_na=False)
67+
print(f" Shape: {df_left.shape}")
68+
print(f" Columns: {list(df_left.columns)}")
69+
70+
# Identify group columns to merge
71+
if group_columns is None:
72+
pattern = re.compile(group_pattern)
73+
group_columns = [col for col in df_right.columns if pattern.match(col)]
74+
75+
if not group_columns:
76+
raise ValueError(f"No group columns found matching pattern: {group_pattern}")
77+
78+
print(f"\nGroup columns to merge: {group_columns}")
79+
80+
# Check that required columns exist
81+
if right_on not in df_right.columns:
82+
raise ValueError(f"Column '{right_on}' not found in right file")
83+
if left_on not in df_left.columns:
84+
raise ValueError(f"Column '{left_on}' not found in left file")
85+
86+
# Verify all group columns exist in right file
87+
missing_cols = [col for col in group_columns if col not in df_right.columns]
88+
if missing_cols:
89+
raise ValueError(f"Group columns not found in right file: {missing_cols}")
90+
91+
# Preprocessing: Explode concatenated strings to lists
92+
df_right_processed = df_right.copy()
93+
94+
# Convert all group columns to lists
95+
for col in group_columns:
96+
df_right_processed[col] = df_right_processed[col].str.split(";")
97+
98+
# Prepare data: Select only needed columns from right file
99+
right_cols = [right_on] + group_columns
100+
df_right_subset = df_right_processed[right_cols].copy()
101+
102+
# Merge and aggregate
103+
print("\nMerging left to right...")
104+
105+
# Step 1: Merge left to right
106+
merged_df = pd.merge(
107+
df_left,
108+
df_right_subset,
109+
left_on=left_on,
110+
right_on=right_on,
111+
how="left",
112+
)
113+
114+
print(f"Merged DataFrame shape: {merged_df.shape}")
115+
116+
# Step 2: Group by left_on
117+
print(f"Grouping by '{left_on}'...")
118+
119+
# Step 3: Aggregate the groups by extending lists
120+
def aggregate_lists(series):
121+
"""Aggregate list values by extending them into a single list."""
122+
all_values = []
123+
for val in series:
124+
if isinstance(val, list):
125+
all_values.extend(val)
126+
elif pd.notna(val) and val != "":
127+
all_values.append(str(val))
128+
129+
# Remove duplicates while preserving some order (sorted)
130+
unique_values = sorted(set(v for v in all_values if v and v != "nan"))
131+
return ";".join(unique_values) if unique_values else ""
132+
133+
# Get all columns from left file
134+
left_columns = list(df_left.columns)
135+
136+
# Create aggregation dictionary
137+
agg_dict = {}
138+
for col in merged_df.columns:
139+
if col == left_on:
140+
continue # Skip the grouping column
141+
elif col in group_columns:
142+
# Aggregate group columns by extending lists
143+
agg_dict[col] = aggregate_lists
144+
elif col in left_columns:
145+
# For original left columns, take first value
146+
agg_dict[col] = "first"
147+
148+
# Perform aggregation
149+
df_result = merged_df.groupby(left_on, as_index=False).agg(agg_dict)
150+
151+
# Reorder columns to match original left file order, then add new group columns
152+
result_columns = [col for col in left_columns if col in df_result.columns]
153+
new_group_cols = [col for col in group_columns if col not in left_columns]
154+
final_column_order = result_columns + new_group_cols
155+
df_result = df_result[final_column_order]
156+
157+
print(f"\nResult DataFrame shape: {df_result.shape}")
158+
print(f"Columns: {list(df_result.columns)}")
159+
160+
# Save merged file
161+
print(f"\nSaving to {output_file}...")
162+
df_result.to_csv(output_file, index=False)
163+
print("Done!")
164+
165+
# Print summary statistics
166+
print("\n--- Merge Summary ---")
167+
print(f"Right file rows: {len(df_right)}")
168+
print(f"Left file rows: {len(df_left)}")
169+
print(f"Output rows: {len(df_result)}")
170+
print(f"Group columns merged: {len(group_columns)}")
171+
172+
173+
def main():
    """Parse command-line arguments and invoke merge_groupings."""
    cli = argparse.ArgumentParser(
        description="Merge grouping columns from one metadata file to another"
    )

    # Positional arguments: target file, source file, destination path.
    for arg_name, arg_help in (
        ("left_file", "Path to target CSV file to add groupings to"),
        ("right_file", "Path to source CSV file with grouping information"),
        ("output_file", "Path to save the merged CSV file"),
    ):
        cli.add_argument(arg_name, type=str, help=arg_help)

    # Optional join-key overrides.
    cli.add_argument(
        "--right-on",
        type=str,
        default="pdb_id",
        help="Column name in right DataFrame to match on (default: pdb_id)",
    )
    cli.add_argument(
        "--left-on",
        type=str,
        default="pdb_id",
        help="Column name in left DataFrame to match on (default: pdb_id)",
    )

    # Optional control over which columns get merged.
    cli.add_argument(
        "--group-columns",
        nargs="+",
        type=str,
        default=None,
        help="Explicit list of group columns to merge (default: auto-detect)",
    )
    cli.add_argument(
        "--group-pattern",
        type=str,
        default=r"^(group_id|seq_group_id|mmseqs_.+)$",
        help="Regex pattern to match group columns (default: matches group_id, seq_group_id, mmseqs_*)",
    )

    opts = cli.parse_args()

    merge_groupings(
        right_file=opts.right_file,
        left_file=opts.left_file,
        output_file=opts.output_file,
        right_on=opts.right_on,
        left_on=opts.left_on,
        group_columns=opts.group_columns,
        group_pattern=opts.group_pattern,
    )
230+
231+
232+
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)