|
| 1 | +import re |
| 2 | +from typing import Dict, List, Tuple |
| 3 | + |
| 4 | + |
def _extract_auth_chain_ids(header: str) -> List[str]:
    """Return the auth chain IDs named in the second ``|`` field of *header*.

    Each comma-separated entry contributes the text inside ``[auth ...]`` when
    that suffix is present, otherwise the entry itself (e.g. ``"Chain A"``
    yields ``"A"``). A warning is printed when the header has no second field
    or when no chain ID can be extracted; the returned list is then empty.
    """
    fields = header.split("|")
    auth_chain_ids: List[str] = []

    if len(fields) < 2:
        print("Warning: Malformed FASTA header:", header)
        chains_part = ""
    else:
        chains_part = fields[1].strip()
        # Drop the leading "Chain " / "Chains " label before splitting entries.
        for entry in re.sub(r"^Chains? ", "", chains_part).split(","):
            auth_match = re.search(r"\[auth ([^\]]+)\]", entry)
            chain_id = auth_match.group(1).strip() if auth_match else entry.strip()
            if chain_id:
                auth_chain_ids.append(chain_id)

    if not auth_chain_ids:
        print("Warning: Empty chains part:", chains_part)
    return auth_chain_ids


def parse_fasta(fasta_content: str) -> Dict[str, Tuple[str, List[str]]]:
    """
    Parse FASTA content into a dictionary keyed by auth chain ID.

    Args:
        fasta_content: Multi-line FASTA string whose headers look like:
            >1A1T_1|Chain A[auth B]|SL3 STEM-LOOP RNA|
            or
            >104D_1|Chains A[auth A], B[auth B]|DNA/RNA (...)|

    Returns:
        Dictionary mapping every auth chain ID of a record to
        (sequence, list_of_auth_chain_ids) for that record.
        Example: {"A": ("ACGT", ["A", "B"]), "B": ("ACGT", ["A", "B"]),
                  "C": ("UGCA", ["C"])}
        All keys of one record share the same list object. If a chain ID
        appears in several records, the last record wins. Records whose
        header yields no chain ID are skipped (a warning is printed).
    """
    result: Dict[str, Tuple[str, List[str]]] = {}
    # Strip every line once so header detection is identical everywhere;
    # this also removes a trailing "\r" from Windows line endings.
    lines = [raw.strip() for raw in fasta_content.strip().split("\n")]
    total = len(lines)

    index = 0
    while index < total:
        line = lines[index]
        index += 1
        if not line.startswith(">"):
            # Text before the first header belongs to no record.
            continue

        auth_chain_ids = _extract_auth_chain_ids(line)

        # Collect sequence lines up to the next header or the end of input.
        chunks: List[str] = []
        while index < total and not lines[index].startswith(">"):
            chunks.append(lines[index])
            index += 1
        sequence = "".join(chunks)

        for chain_id in auth_chain_ids:
            result[chain_id] = (sequence, auth_chain_ids)

    return result
0 commit comments