Skip to content

Commit 3907aa8

Browse files
committed
Added kaggle workflow
1 parent a0ba55e commit 3907aa8

File tree

3 files changed

+730
-1
lines changed

3 files changed

+730
-1
lines changed

tools/fasta/chain_parser.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import re
2+
from typing import Dict, List, Tuple
3+
4+
5+
def _extract_auth_chain_ids(chains_part: str) -> List[str]:
    """Return the auth chain IDs listed in the chains field of a header.

    Accepted forms are "Chain A[auth B]", "Chains A[auth A], B[auth B]",
    "Chain A" and "Chains A, B". For each comma-separated entry the ID inside
    "[auth ...]" is used when present; otherwise the bare label is used.
    Entries that are empty after stripping are skipped.
    """
    entries = re.sub(r"^Chains? ", "", chains_part).split(",")
    auth_chain_ids: List[str] = []
    for entry in entries:
        auth_match = re.search(r"\[auth ([^\]]+)\]", entry)
        if auth_match:
            auth_chain_ids.append(auth_match.group(1).strip())
            continue
        bare_label = entry.strip()
        if bare_label:
            auth_chain_ids.append(bare_label)
    return auth_chain_ids


def _auth_chain_ids_from_header(header: str) -> List[str]:
    """Parse one stripped ">" header line into its list of auth chain IDs.

    The chains field is the text between the first and second "|". A warning
    is printed when the header has no "|" at all, and another when no chain
    ID could be extracted; in both cases an empty list is returned.
    """
    parts = header.split("|")
    if len(parts) < 2:
        print("Warning: Malformed FASTA header:", header)
        chains_part = ""
        auth_chain_ids: List[str] = []
    else:
        chains_part = parts[1].strip()
        auth_chain_ids = _extract_auth_chain_ids(chains_part)

    if not auth_chain_ids:
        print("Warning: Empty chains part:", chains_part)
    return auth_chain_ids


def parse_fasta(fasta_content: str) -> Dict[str, Tuple[str, List[str]]]:
    """
    Parse FASTA content into a dictionary keyed by auth chain ID.

    Args:
        fasta_content: Multi-line FASTA string whose headers look like:
            >1A1T_1|Chain A[auth B]|SL3 STEM-LOOP RNA|
            or
            >104D_1|Chains A[auth A], B[auth B]|DNA/RNA (...)|

    Returns:
        Dictionary mapping every auth chain ID of a record to
        (sequence, list_of_auth_chain_ids), where the list holds all auth
        chain IDs that share this sequence.
        Example: {"A": ("ACGT", ["A", "B"]), "B": ("ACGT", ["A", "B"]),
                  "C": ("UGCA", ["C"])}
        Records whose header yields no chain ID contribute no entries (a
        warning is printed). If the same chain ID appears in several
        records, the last record wins.
    """
    # Each record is (auth_chain_ids, sequence_fragments). Lines that appear
    # before the first header belong to no record and are ignored.
    records: List[Tuple[List[str], List[str]]] = []
    for raw_line in fasta_content.splitlines():
        # Strip before testing for ">" so that header detection is the same
        # everywhere; an indented header must start a new record rather than
        # being glued onto the previous sequence.
        line = raw_line.strip()
        if line.startswith(">"):
            records.append((_auth_chain_ids_from_header(line), []))
        elif records:
            records[-1][1].append(line)

    result: Dict[str, Tuple[str, List[str]]] = {}
    for auth_chain_ids, fragments in records:
        sequence = "".join(fragments)
        for chain_id in auth_chain_ids:
            result[chain_id] = (sequence, auth_chain_ids)
    return result

validation/validate_bioassembly_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
except ImportError:
2525
has_biotite = False
2626
# Import parse_fasta from the tools/fasta directory
27-
sys.path.insert(0, str(Path(__file__).parent / "tools/fasta"))
27+
sys.path.insert(0, str(Path(__file__).parents[1] / "tools/fasta"))
2828
from chain_parser import parse_fasta
2929

3030
csv.field_size_limit(sys.maxsize)

0 commit comments

Comments
 (0)