Skip to content

Commit 9fb3067

Browse files
committed
reviews-enformer-interpret
1 parent 31feb9f commit 9fb3067

File tree

9 files changed

+3770
-2
lines changed

9 files changed

+3770
-2
lines changed

enformer/Untitled.ipynb

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,33 @@
11
{
2-
"cells": [],
3-
"metadata": {},
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "8e300b9b-2f3e-4cec-945d-af8a1de13963",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": []
10+
}
11+
],
12+
"metadata": {
13+
"kernelspec": {
14+
"display_name": "Python 3 (ipykernel)",
15+
"language": "python",
16+
"name": "python3"
17+
},
18+
"language_info": {
19+
"codemirror_mode": {
20+
"name": "ipython",
21+
"version": 3
22+
},
23+
"file_extension": ".py",
24+
"mimetype": "text/x-python",
25+
"name": "python",
26+
"nbconvert_exporter": "python",
27+
"pygments_lexer": "ipython3",
28+
"version": "3.8.20"
29+
}
30+
},
431
"nbformat": 4,
532
"nbformat_minor": 5
633
}

enformer/chrombpnet_grad.ipynb

Lines changed: 337 additions & 0 deletions
Large diffs are not rendered by default.

enformer/chrombpnet_ism-ATAC.ipynb

Lines changed: 713 additions & 0 deletions
Large diffs are not rendered by default.

enformer/chrombpnet_ism.ipynb

Lines changed: 611 additions & 0 deletions
Large diffs are not rendered by default.

enformer/enformer-tensorflow.ipynb

Lines changed: 315 additions & 0 deletions
Large diffs are not rendered by default.

enformer/enformer_ism.ipynb

Lines changed: 721 additions & 0 deletions
Large diffs are not rendered by default.

enformer/make_plot-Copy1.ipynb

Lines changed: 487 additions & 0 deletions
Large diffs are not rendered by default.

enformer/make_plot.ipynb

Lines changed: 496 additions & 0 deletions
Large diffs are not rendered by default.

enformer/one_hot.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Written by Alex Tseng
3+
4+
https://gist.github.com/amtseng/010dd522daaabc92b014f075a34a0a0b
5+
"""
6+
7+
import numpy as np
8+
9+
def dna_to_one_hot(seqs):
    """
    Converts a list of DNA ("ACGT") sequences to one-hot encodings, where the
    position of 1s is ordered alphabetically by "ACGT". `seqs` must be a
    sequence of N strings, where every string is the same length L. Returns an
    N x L x 4 NumPy array (dtype int8) of one-hot encodings, in the same order
    as the input sequences. All bases are converted to upper-case prior to
    performing the encoding. Any character that is not "ACGT" (including any
    non-ASCII character) is given an encoding of all 0s.

    An empty `seqs` returns an empty array of shape (0, 0, 4). Raises
    AssertionError if the sequences do not all have the same length.
    """
    num_seqs = len(seqs)
    if num_seqs == 0:
        # Nothing to encode; L is undefined, so report it as 0
        return np.zeros((0, 0, 4), dtype=np.int8)

    seq_len = len(seqs[0])
    if any(len(s) != seq_len for s in seqs):
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers
        raise AssertionError("All sequences must be the same length")

    # Join all sequences into one long byte string with exactly one byte per
    # input character: non-ASCII characters become "?" instead of expanding
    # into several UTF-8 bytes, which would break the reshape below.
    # Upper-casing is done on the bytes so it can never change the length.
    seq_bytes = "".join(seqs).encode("ascii", errors="replace").upper()
    base_vals = np.frombuffer(seq_bytes, dtype=np.uint8)

    # Map each byte value to an index in [0, 4]: A, C, G, T -> 0..3, and
    # everything else -> 4
    code_to_ind = np.full(256, 4, dtype=np.intp)
    code_to_ind[np.frombuffer(b"ACGT", dtype=np.uint8)] = np.arange(4)

    # Rows 0-3 are the one-hot vectors for A, C, G, T; row 4 is all 0s
    one_hot_map = np.identity(5, dtype=np.int8)[:, :-1]

    # Get the one-hot encoding for those indices, and reshape back to separate
    # sequences
    base_inds = code_to_ind[base_vals]
    return one_hot_map[base_inds].reshape((num_seqs, seq_len, 4))
38+
39+
40+
def one_hot_to_dna(one_hot):
    """
    Converts a one-hot encoding into a list of DNA ("ACGT") sequences, where the
    position of 1s is ordered alphabetically by "ACGT". `one_hot` must be an
    N x L x 4 array (or array-like) of one-hot encodings. Returns a list of N
    "ACGT" strings, each of length L, in the same order as the input array. The
    returned sequences will only consist of letters "A", "C", "G", "T", or "N"
    (all upper-case). Any encodings that are all 0s will be translated to "N".

    Raises ValueError if `one_hot` is not 3-dimensional with a last axis of
    size 4.
    """
    one_hot = np.asarray(one_hot)
    if one_hot.ndim != 3 or one_hot.shape[2] != 4:
        # A last axis of any other size would make the "N" sentinel index
        # below point at the wrong entry of `bases` (or past its end)
        raise ValueError(
            "one_hot must have shape N x L x 4, got %s" % (one_hot.shape,)
        )

    bases = np.array(["A", "C", "G", "T", "N"])
    n_index = len(bases) - 1

    # Create N x L array of all 4s, i.e. every position starts out as "N"
    one_hot_inds = np.full(one_hot.shape[:2], n_index, dtype=np.intp)

    # Get indices of where the non-zero entries (the 1s) are
    batch_inds, seq_inds, base_inds = np.where(one_hot)

    # In each of the locations in the N x L array, fill in the location of the 1
    one_hot_inds[batch_inds, seq_inds] = base_inds

    # Fetch the corresponding base for each position using indexing
    seq_array = bases[one_hot_inds]
    return ["".join(seq) for seq in seq_array]

0 commit comments

Comments
 (0)