openvax
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎downloads-generation/allele_sequences/GENERATE.sh‎
Lines changed: 88 additions & 0 deletions b/‎downloads-generation/allele_sequences/GENERATE.sh‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎downloads-generation/allele_sequences/filter_sequences.py‎
Lines changed: 93 additions & 0 deletions b/‎downloads-generation/allele_sequences/filter_sequences.py‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎downloads-generation/allele_sequences/make_allele_sequences.py‎
Lines changed: 67 additions & 0 deletions b/‎downloads-generation/allele_sequences/make_allele_sequences.py‎
Lines changed: 67 additions & 0 deletions
@@ -1,3 +1,6 @@
+# Custom
+.idea
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 
@@ -0,0 +1,88 @@
+#!/bin/bash
+#
+# Class II allele sequences
+#
+# Downloads class II alpha and beta chain protein fastas, filters and aligns
+# them, and packages the per-position sequence tables into an archive.
+#
+# Requires: clustalo, wget
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=allele_sequences
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+export PYTHONUNBUFFERED=1
+
+# Start from an empty working directory on every run.
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+# Log some environment info
+date
+pip freeze
+git status
+which clustalo
+clustalo --version
+
+# Paths are quoted so that a TMPDIR or checkout location containing
+# whitespace does not get split into several arguments.
+cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
+cp "$SCRIPT_DIR/make_allele_sequences.py" .
+cp "$SCRIPT_DIR/filter_sequences.py" .
+cp "$SCRIPT_ABSOLUTE_PATH" .
+
+# Human
+
+# All inputs are protein fastas from the same IMGT/HLA directory at EBI.
+IMGT_HLA_FASTA_URL=ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta
+
+# Alpha chain
+mkdir alpha
+cd alpha
+for gene in DPA1 DPA2 DQA1 DQA2 DRA
+do
+    wget -q "$IMGT_HLA_FASTA_URL/${gene}_prot.fasta"
+done
+cd ..
+
+# Beta chain
+mkdir beta
+cd beta
+for gene in DPB1 DPB2 DQB1 DRB
+do
+    wget -q "$IMGT_HLA_FASTA_URL/${gene}_prot.fasta"
+done
+cd ..
+
+# Combine the per-gene fastas into one filtered fasta per chain.
+python filter_sequences.py alpha/*.fasta --kind alpha --out alpha.fasta
+python filter_sequences.py beta/*.fasta --kind beta --out beta.fasta
+
+time clustalo -i alpha.fasta -o alpha.aligned.fasta
+time clustalo -i beta.fasta -o beta.aligned.fasta
+
+# The allele names contain "*", so they are quoted to keep the shell from
+# treating them as filename patterns.
+time python make_allele_sequences.py \
+    alpha.aligned.fasta \
+    --reference-allele "HLA-DRA1*01:01" \
+    --out-csv alpha.csv
+
+time python make_allele_sequences.py \
+    beta.aligned.fasta \
+    --reference-allele "HLA-DRB1*01:01" \
+    --out-csv beta.csv
+
+# Cleanup
+gzip -f alpha.fasta
+gzip -f alpha.aligned.fasta
+gzip -f beta.fasta
+gzip -f beta.aligned.fasta
+
+# Compress the downloaded per-gene fastas. The pattern must be left unquoted
+# so the shell expands it; a quoted pattern is passed on literally and
+# matches nothing.
+for i in */*.fasta
+do
+    # With no matches the pattern itself is returned; skip it.
+    [ -e "$i" ] || continue
+    gzip -f "$i"
+done
+
+cp "$SCRIPT_ABSOLUTE_PATH" .
+# Compress the log last; output produced after this point is not part of
+# the archived copy.
+bzip2 LOG.txt
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
@@ -0,0 +1,93 @@
+"""
+Filter and combine class II sequence fastas.
+"""
+from __future__ import print_function
+
+import sys
+import argparse
+
+import mhcnames
+
+import Bio.SeqIO  # pylint: disable=import-error
+
+
+def normalize(s, disallowed=("MIC", "HFE")):
+    """
+    Normalize an allele name using mhcnames.
+
+    If the full name cannot be parsed, the last ":"-separated field is
+    dropped and parsing is retried, repeating until a prefix parses or
+    nothing is left.
+
+    Parameters
+    ----------
+    s : str
+        Allele name as it appears in the fasta header.
+    disallowed : sequence of str
+        Names containing any of these substrings are rejected outright.
+
+    Returns
+    -------
+    The normalized name, or None if the name is disallowed or no prefix of
+    it could be parsed (a message is printed in the latter case).
+    """
+    if any(item in s for item in disallowed):
+        return None
+
+    # Keep the untouched name: s is shortened below and is empty by the time
+    # we give up, so it is useless for the failure message.
+    original = s
+    while s:
+        try:
+            return mhcnames.normalize_allele_name(s, infer_class2_pair=False)
+        except Exception:
+            # Drop the last ":"-separated field (yields "" when there is no
+            # ":" left) and retry with the shorter name.
+            s = s.rpartition(":")[0]
+
+    print("Couldn't parse", original)
+    return None
+
+
+# Command line interface: one or more input fastas, the chain they belong
+# to, and where to write the combined fasta.
+parser = argparse.ArgumentParser(usage=__doc__)
+parser.add_argument("fastas", nargs="+", help="Unaligned fastas")
+parser.add_argument(
+    "--kind", choices=("alpha", "beta"), required=True, help="Chain")
+parser.add_argument("--out", required=True, help="Fasta output")
+
+# Minimum sequence length accepted for each chain kind; run() skips records
+# shorter than this.
+min_lengths = dict(alpha=200, beta=200)
+
+
+def run():
+    """
+    Combine the input fastas into a single fasta with one record per
+    normalized allele name.
+
+    Records whose name is disallowed or cannot be normalized are skipped, as
+    are records shorter than the minimum length for the selected chain kind.
+    Each written record has its normalized name prepended to its
+    description.
+    """
+    args = parser.parse_args(sys.argv[1:])
+    print(args)
+
+    min_length = min_lengths[args.kind]
+
+    output_records = []
+    seen = set()
+    sequences = set()
+
+    input_records = []
+    for fasta in args.fastas:
+        input_records.extend(Bio.SeqIO.parse(fasta, "fasta"))
+
+    # Iterate longest records first so that when multiple records have the
+    # same two digit normalized allele, we use the longest one.
+    for record in sorted(input_records, key=lambda r: len(r.seq), reverse=True):
+        name = normalize(record.description.split()[1])
+        if name is None:
+            # normalize() returns None for disallowed or unparseable names.
+            # Such records have no usable allele name, so drop them instead
+            # of failing on the string concatenation below.
+            print("Skipping due to unusable name", record.description)
+            continue
+        if name in seen:
+            continue
+        if len(record.seq) < min_length:
+            print("Skipping due to short length", name, record.description)
+            continue
+        seen.add(name)
+        sequences.add(record.seq)
+        record.description = name + " " + record.description
+        output_records.append(record)
+
+    with open(args.out, "w") as fd:
+        Bio.SeqIO.write(output_records, fd, "fasta")
+
+    print("Wrote %d / %d [%d unique] sequences: %s" % (
+        len(output_records), len(input_records), len(sequences), args.out))
+
+
+if __name__ == '__main__':
+    run()
@@ -0,0 +1,67 @@
+"""
+Generate allele sequences for pan-class II models.
+
+Additional dependency: biopython
+"""
+from __future__ import print_function
+
+import sys
+import argparse
+
+import pandas
+
+import Bio.SeqIO  # pylint: disable=import-error
+
+# Command line interface: the aligned fasta to read, the allele whose
+# residues define the position numbering, and the CSV to write.
+parser = argparse.ArgumentParser(usage=__doc__)
+parser.add_argument("aligned_fasta", help="Aligned sequences")
+parser.add_argument(
+    "--reference-allele",
+    help="Allele to use for position numbering",
+    required=True)
+parser.add_argument("--out-csv", help="Result file")
+
+
+def run():
+    """
+    Convert an aligned fasta into a table of per-position residues.
+
+    Rows are alleles (the second whitespace-separated token of each fasta
+    description). Columns are numbered from 1 following the residues of the
+    reference allele. Alignment columns where the reference has a gap do not
+    get their own number; their characters are prepended to the column of
+    the next reference residue. Gap characters are written as "X".
+    """
+    args = parser.parse_args(sys.argv[1:])
+    print(args)
+
+    allele_to_sequence = {}
+    for record in Bio.SeqIO.parse(args.aligned_fasta, "fasta"):
+        name = record.description.split()[1]
+        print(record.name, record.description)
+        allele_to_sequence[name] = str(record.seq)
+
+    allele_to_sequence = pandas.Series(allele_to_sequence).sort_index()
+    print("Read %d aligned sequences" % len(allele_to_sequence))
+
+    if args.reference_allele not in allele_to_sequence.index:
+        # Exit with a usage error rather than an unexplained KeyError.
+        parser.error(
+            "Reference allele %s not found in %s" % (
+                args.reference_allele, args.aligned_fasta))
+    reference = allele_to_sequence[args.reference_allele]
+    print("Using reference", args.reference_allele, reference)
+
+    # Gaps are reported as "X". Substituting once per sequence here gives
+    # the same cells as substituting in every cell of the finished table.
+    # The reference keeps its "-" characters, which drive the numbering.
+    sequences = allele_to_sequence.map(lambda s: s.replace("-", "X"))
+
+    df = pandas.DataFrame(index=allele_to_sequence.index)
+
+    current_number = 1
+    for (i, reference_char) in enumerate(reference):
+        if current_number not in df.columns:
+            df[current_number] = ""
+
+        df[current_number] += sequences.str.get(i)
+        # Only reference residues advance the position number.
+        if reference_char != '-':
+            current_number += 1
+
+    print(df)
+
+    if args.out_csv:
+        df.to_csv(args.out_csv, index=True)
+        print("Wrote [%d alleles]: %s" % (len(df), args.out_csv))
+    else:
+        print("No --out-csv given; result not written")
+
+
+if __name__ == '__main__':
+    run()