Skip to content

Commit 89b8c1c

Browse files
committed
First cut
1 parent a3954f1 commit 89b8c1c

File tree

10 files changed

+932
-0
lines changed

10 files changed

+932
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Custom
2+
.idea
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/bin/bash
2+
#
3+
# Class II allele sequences
4+
#
5+
# Requires: clustalo, wget
6+
#
7+
set -e
8+
set -x
9+
10+
DOWNLOAD_NAME=allele_sequences
11+
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
12+
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
13+
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
14+
export PYTHONUNBUFFERED=1
15+
16+
mkdir -p "$SCRATCH_DIR"
17+
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
18+
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
19+
20+
# Send stdout and stderr to a logfile included with the archive.
21+
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
22+
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
23+
24+
# Log some environment info
25+
date
26+
pip freeze
27+
git status
28+
which clustalo
29+
clustalo --version
30+
31+
cd $SCRATCH_DIR/$DOWNLOAD_NAME
32+
cp $SCRIPT_DIR/make_allele_sequences.py .
33+
cp $SCRIPT_DIR/filter_sequences.py .
34+
cp $SCRIPT_ABSOLUTE_PATH .
35+
36+
# Human
37+
38+
# Alpha chain
39+
mkdir alpha
40+
cd alpha
41+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DPA1_prot.fasta
42+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DPA2_prot.fasta
43+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DQA1_prot.fasta
44+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DQA2_prot.fasta
45+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DRA_prot.fasta
46+
cd ..
47+
48+
# Beta chain
49+
mkdir beta
50+
cd beta
51+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DPB1_prot.fasta
52+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DPB2_prot.fasta
53+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DQB1_prot.fasta
54+
wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/DRB_prot.fasta
55+
cd ..
56+
57+
python filter_sequences.py alpha/*.fasta --kind alpha --out alpha.fasta
58+
python filter_sequences.py beta/*.fasta --kind beta --out beta.fasta
59+
60+
time clustalo -i alpha.fasta -o alpha.aligned.fasta
61+
time clustalo -i beta.fasta -o beta.aligned.fasta
62+
63+
time python make_allele_sequences.py \
64+
alpha.aligned.fasta \
65+
--reference-allele HLA-DRA1*01:01 \
66+
--out-csv alpha.csv
67+
68+
time python make_allele_sequences.py \
69+
beta.aligned.fasta \
70+
--reference-allele HLA-DRB1*01:01 \
71+
--out-csv beta.csv
72+
73+
# Cleanup
74+
gzip -f alpha.fasta
75+
gzip -f alpha.aligned.fasta
76+
gzip -f beta.fasta
77+
gzip -f beta.aligned.fasta
78+
79+
for i in $(ls "*/*.fasta")
80+
do
81+
gzip -f $i
82+
done
83+
84+
cp $SCRIPT_ABSOLUTE_PATH .
85+
bzip2 LOG.txt
86+
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
87+
tar -cjf "$RESULT" *
88+
echo "Created archive: $RESULT"
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""
2+
Filter and combine class II sequence fastas.
3+
"""
4+
from __future__ import print_function
5+
6+
import sys
7+
import argparse
8+
9+
import mhcnames
10+
11+
import Bio.SeqIO # pylint: disable=import-error
12+
13+
14+
def normalize(s, disallowed=["MIC", "HFE"]):
15+
if any(item in s for item in disallowed):
16+
return None
17+
try:
18+
return mhcnames.normalize_allele_name(s, infer_class2_pair=False)
19+
except:
20+
while s:
21+
s = ":".join(s.split(":")[:-1])
22+
try:
23+
return mhcnames.normalize_allele_name(s, infer_class2_pair=False)
24+
except:
25+
pass
26+
27+
print("Couldn't parse", s)
28+
return None
29+
30+
31+
parser = argparse.ArgumentParser(usage=__doc__)
32+
33+
parser.add_argument(
34+
"fastas",
35+
nargs="+",
36+
help="Unaligned fastas")
37+
38+
parser.add_argument(
39+
"--kind",
40+
required=True,
41+
choices=("alpha", "beta"),
42+
help="Chain")
43+
44+
parser.add_argument(
45+
"--out",
46+
required=True,
47+
help="Fasta output")
48+
49+
min_lengths = {
50+
"alpha": 200,
51+
"beta": 200,
52+
}
53+
54+
55+
def run():
56+
args = parser.parse_args(sys.argv[1:])
57+
print(args)
58+
59+
min_length = min_lengths[args.kind]
60+
61+
output_records = []
62+
seen = set()
63+
sequences = set()
64+
65+
input_records = []
66+
for fasta in args.fastas:
67+
reader = Bio.SeqIO.parse(fasta, "fasta")
68+
input_records.extend(reader)
69+
70+
# Iterate longest records first so that when multiple records have the
71+
# same two digit normalized allele, we use the longest one.
72+
for record in sorted(input_records, key=lambda r: len(r.seq), reverse=True):
73+
name = record.description.split()[1]
74+
name = normalize(name)
75+
if name in seen:
76+
continue
77+
if len(record.seq) < min_length:
78+
print("Skipping due to short length", name, record.description)
79+
continue
80+
seen.add(name)
81+
sequences.add(record.seq)
82+
record.description = name + " " + record.description
83+
output_records.append(record)
84+
85+
with open(args.out, "w") as fd:
86+
Bio.SeqIO.write(output_records, fd, "fasta")
87+
88+
print("Wrote %d / %d [%d unique] sequences: %s" % (
89+
len(output_records), len(input_records), len(sequences), args.out))
90+
91+
92+
if __name__ == '__main__':
93+
run()
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
Generate allele sequences for pan-class II models.
3+
4+
Additional dependency: biopython
5+
"""
6+
from __future__ import print_function
7+
8+
import sys
9+
import argparse
10+
11+
import pandas
12+
13+
import Bio.SeqIO # pylint: disable=import-error
14+
15+
parser = argparse.ArgumentParser(usage=__doc__)
16+
17+
parser.add_argument(
18+
"aligned_fasta",
19+
help="Aligned sequences")
20+
21+
parser.add_argument(
22+
"--reference-allele",
23+
required=True,
24+
help="Allele to use for position numbering")
25+
26+
parser.add_argument(
27+
"--out-csv",
28+
help="Result file")
29+
30+
31+
def run():
32+
args = parser.parse_args(sys.argv[1:])
33+
print(args)
34+
35+
allele_to_sequence = {}
36+
reader = Bio.SeqIO.parse(args.aligned_fasta, "fasta")
37+
for record in reader:
38+
name = record.description.split()[1]
39+
print(record.name, record.description)
40+
allele_to_sequence[name] = str(record.seq)
41+
42+
allele_to_sequence = pandas.Series(allele_to_sequence).sort_index()
43+
print("Read %d aligned sequences" % len(allele_to_sequence))
44+
45+
reference = allele_to_sequence[args.reference_allele]
46+
print("Using reference", args.reference_allele, reference)
47+
48+
df = pandas.DataFrame(index=allele_to_sequence.index)
49+
50+
current_number = 1
51+
for (i, reference_char) in enumerate(reference):
52+
if current_number not in df.columns:
53+
df[current_number] = ""
54+
55+
df[current_number] += allele_to_sequence.str.get(i)
56+
if reference_char != '-':
57+
current_number += 1
58+
59+
df = df.applymap(lambda s: s.replace("-", "X"))
60+
print(df)
61+
62+
df.to_csv(args.out_csv, index=True)
63+
print("Wrote [%d alleles]: %s" % (len(df), args.out_csv))
64+
65+
66+
if __name__ == '__main__':
67+
run()

0 commit comments

Comments
 (0)