kundajelab
diff --git a/‎enformer/Untitled.ipynb‎
Lines changed: 29 additions & 2 deletions b/‎enformer/Untitled.ipynb‎
Lines changed: 29 additions & 2 deletions
diff --git a/‎enformer/chrombpnet_grad.ipynb‎
Lines changed: 337 additions & 0 deletions b/‎enformer/chrombpnet_grad.ipynb‎
Lines changed: 337 additions & 0 deletions
diff --git a/‎enformer/chrombpnet_ism-ATAC.ipynb‎
Lines changed: 713 additions & 0 deletions b/‎enformer/chrombpnet_ism-ATAC.ipynb‎
Lines changed: 713 additions & 0 deletions
diff --git a/‎enformer/chrombpnet_ism.ipynb‎
Lines changed: 611 additions & 0 deletions b/‎enformer/chrombpnet_ism.ipynb‎
Lines changed: 611 additions & 0 deletions
diff --git a/‎enformer/enformer-tensorflow.ipynb‎
Lines changed: 315 additions & 0 deletions b/‎enformer/enformer-tensorflow.ipynb‎
Lines changed: 315 additions & 0 deletions
diff --git a/‎enformer/enformer_ism.ipynb‎
Lines changed: 721 additions & 0 deletions b/‎enformer/enformer_ism.ipynb‎
Lines changed: 721 additions & 0 deletions
diff --git a/‎enformer/make_plot-Copy1.ipynb‎
Lines changed: 487 additions & 0 deletions b/‎enformer/make_plot-Copy1.ipynb‎
Lines changed: 487 additions & 0 deletions
diff --git a/‎enformer/make_plot.ipynb‎
Lines changed: 496 additions & 0 deletions b/‎enformer/make_plot.ipynb‎
Lines changed: 496 additions & 0 deletions
diff --git a/‎enformer/one_hot.py‎
Lines changed: 61 additions & 0 deletions b/‎enformer/one_hot.py‎
Lines changed: 61 additions & 0 deletions
@@ -1,6 +1,33 @@
 {
- "cells": [],
- "metadata": {},
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e300b9b-2f3e-4cec-945d-af8a1de13963",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
  "nbformat": 4,
  "nbformat_minor": 5
 }
@@ -0,0 +1,61 @@
+"""
+Written by Alex Tseng
+
+https://gist.github.com/amtseng/010dd522daaabc92b014f075a34a0a0b
+"""
+
+import numpy as np
+
+def dna_to_one_hot(seqs):
+    """
+    Converts a list of DNA ("ACGT") sequences to one-hot encodings, where the
+    position of 1s is ordered alphabetically by "ACGT". `seqs` must be a list
+    of N strings, where every string is the same length L. Returns an N x L x 4
+    NumPy array of one-hot encodings, in the same order as the input sequences.
+    All bases will be converted to upper-case prior to performing the encoding.
+    Any bases that are not "ACGT" will be given an encoding of all 0s.
+    """
+    seq_len = len(seqs[0])
+    assert np.all(np.array([len(s) for s in seqs]) == seq_len)
+
+    # Join all sequences together into one long string, all uppercase
+    seq_concat = "".join(seqs).upper() + "ACGT"
+    # Add one example of each base, so np.unique doesn't miss indices later
+
+    one_hot_map = np.identity(5)[:, :-1].astype(np.int8)
+
+    # Convert string into array of ASCII character codes;
+    base_vals = np.frombuffer(bytearray(seq_concat, "utf8"), dtype=np.int8)
+
+    # Anything that's not an A, C, G, or T gets assigned a higher code
+    base_vals[~np.isin(base_vals, np.array([65, 67, 71, 84]))] = 85
+
+    # Convert the codes into indices in [0, 4], in ascending order by code
+    _, base_inds = np.unique(base_vals, return_inverse=True)
+
+    # Get the one-hot encoding for those indices, and reshape back to separate
+    return one_hot_map[base_inds[:-4]].reshape((len(seqs), seq_len, 4))
+
+
+def one_hot_to_dna(one_hot):
+    """
+    Converts a one-hot encoding into a list of DNA ("ACGT") sequences, where the
+    position of 1s is ordered alphabetically by "ACGT". `one_hot` must be an
+    N x L x 4 array of one-hot encodings. Returns a lits of N "ACGT" strings,
+    each of length L, in the same order as the input array. The returned
+    sequences will only consist of letters "A", "C", "G", "T", or "N" (all
+    upper-case). Any encodings that are all 0s will be translated to "N".
+    """
+    bases = np.array(["A", "C", "G", "T", "N"])
+    # Create N x L array of all 5s
+    one_hot_inds = np.tile(one_hot.shape[2], one_hot.shape[:2])
+
+    # Get indices of where the 1s are
+    batch_inds, seq_inds, base_inds = np.where(one_hot)
+
+    # In each of the locations in the N x L array, fill in the location of the 1
+    one_hot_inds[batch_inds, seq_inds] = base_inds
+
+    # Fetch the corresponding base for each position using indexing
+    seq_array = bases[one_hot_inds]
+    return ["".join(seq) for seq in seq_array]