Skip to content

Commit b5c9b5f

Browse files
authored
Merge pull request #99 from kipoi/add-one-hot-encode
Add one hot encode from enformer
2 parents e06360c + b99c1e9 commit b5c9b5f

File tree

9 files changed: 24 additions and 32 deletions.

.circleci/config.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@ variables:
2828
name: Install pip dependencies
2929
command: |
3030
source activate kipoi-dev
31-
pip install genomelake --no-deps
3231
pip install pyfaidx
3332
install_conda_deps: &install_conda_deps
3433
run:
@@ -38,7 +37,7 @@ variables:
3837
apt-get update -y
3938
apt-get install build-essential -y
4039
conda install -y cython
41-
conda install -y -c bioconda cyvcf2 pybedtools genomelake pyfaidx biopython
40+
conda install -y -c bioconda cyvcf2 pybedtools pyfaidx biopython
4241
install_kipoi: &install_kipoi
4342
run:
4443
name: Install Kipoi

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,4 +68,4 @@ More info:
6868
- Read the pytorch [Data Loading and Processing Tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) to become more familiar with transforms and dataloaders
6969
- Read the code for `SeqIntervalDl` in [kipoiseq/dataloaders/sequence.py](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/dataloaders/sequence.py)
7070
- you can skip the `@kipoi_dataloader` and the long yaml doc-string. These are only required if you want to use dataloaders in Kipoi's model.yaml files.
71-
- Explore the available transforms ([functional](http://kipoi.org/kipoiseq/transforms/functional/), [class-based](http://kipoi.org/kipoiseq/transforms/transforms/)) or extractors ([kipoiseq](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/extractors.py), [genomelake](https://github.com/kundajelab/genomelake/blob/master/genomelake/extractors.py))
71+
- Explore the available transforms ([functional](http://kipoi.org/kipoiseq/transforms/functional/), [class-based](http://kipoi.org/kipoiseq/transforms/transforms/)) or extractors ([kipoiseq](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/extractors.py))

kipoiseq/dataloaders/sequence.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,6 @@
1212
from kipoiseq.utils import to_scalar, parse_dtype
1313
from kipoiseq.dataclasses import Interval
1414

15-
# general dependencies
16-
# bioconda::genomelake', TODO - add genomelake again once it gets released with pyfaidx to bioconda
1715
deps = Dependencies(conda=['bioconda::pybedtools', 'bioconda::pyfaidx', 'bioconda::pyranges', 'numpy', 'pandas'],
1816
pip=['kipoiseq'])
1917
package_authors = [Author(name='Ziga Avsec', github='avsecz'),

kipoiseq/dataloaders/splicing.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
from pyfaidx import Fasta
88
import pickle
99

10-
# general dependencies
11-
# bioconda::genomelake', TODO - add genomelake again once it gets released with pyfaidx to bioconda
1210
deps = Dependencies(conda=['bioconda::pyfaidx', 'numpy', 'pandas'],
1311
pip=['kipoiseq', 'kipoi'])
1412
package_authors = [Author(name='Jun Cheng', github='s6juncheng')]

kipoiseq/extractors/fasta.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,6 @@ def close(self):
6969
# """
7070

7171
# def __init__(self, bigwig_file):
72-
# from genomelake.extractors import BigwigExtractor
7372

7473
# self.bigwig_file = bigwig_file
7574
# self.batch_extractor = BigwigExtractor(self.bigwig_file)

kipoiseq/transforms/functional.py

Lines changed: 17 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -2,17 +2,12 @@
22
from __future__ import absolute_import
33
from __future__ import print_function
44

5+
from typing import Any
56
from kipoiseq.utils import DNA
67
from copy import deepcopy
78
import numpy as np
89
from six import string_types
910

10-
try:
11-
# use the fast genomelake's one-hot-encode if it's installed
12-
from genomelake.util import one_hot_encode_sequence
13-
except ImportError:
14-
one_hot_encode_sequence = None
15-
1611

1712
# sequence -> array
1813

@@ -119,21 +114,23 @@ def one_hot(seq, alphabet=DNA, neutral_alphabet=['N'], neutral_value=.25, dtype=
119114
raise ValueError("seq needs to be a string")
120115
return token2one_hot(tokenize(seq, alphabet, neutral_alphabet), len(alphabet), neutral_value, dtype=dtype)
121116

122-
123-
def one_hot_dna(seq, dtype=None):
124-
"""One-hot encode DNA sequence
125-
"""
117+
# Reference: https://github.com/deepmind/deepmind-research/blob/fa8c9be4bb0cfd0b8492203eb2a9f31ef995633c/enformer/enformer.py#L306-L318
118+
def one_hot_dna(seq: str,
119+
alphabet: list = DNA,
120+
neutral_alphabet: str = 'N',
121+
neutral_value: Any = 0.25,
122+
dtype=np.float32) -> np.ndarray:
123+
"""One-hot encode sequence."""
126124
if not isinstance(seq, str):
127-
raise ValueError("seq needs to be a string")
128-
129-
if one_hot_encode_sequence is not None:
130-
# genomelake's one_hot_encode_sequence could be imported
131-
out = np.zeros((len(seq), 4), dtype=np.float32)
132-
one_hot_encode_sequence(seq, out)
133-
return out.astype(dtype)
134-
else:
135-
return one_hot(seq, alphabet=DNA, neutral_alphabet=['N'], neutral_value=.25, dtype=dtype)
136-
125+
raise ValueError("sequence needs to be a string")
126+
def to_uint8(string):
127+
return np.frombuffer(string.encode('ascii'), dtype=np.uint8)
128+
129+
hash_table = np.zeros((np.iinfo(np.uint8).max, len(alphabet)), dtype=dtype)
130+
hash_table[to_uint8(''.join(alphabet))] = np.eye(len(alphabet), dtype=dtype)
131+
hash_table[to_uint8(''.join(neutral_alphabet))] = neutral_value
132+
hash_table = hash_table.astype(dtype)
133+
return hash_table[to_uint8(seq)]
137134

138135
# sequence trimming
139136

kipoiseq/transforms/transforms.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,11 @@ def __init__(self, alphabet=DNA, neutral_alphabet='N', neutral_value=0.25, dtype
111111

112112
def __call__(self, seq):
113113
if self.alphabet == DNA and self.neutral_alphabet == ['N'] and self.neutral_value == 0.25:
114-
return F.one_hot_dna(seq, self.dtype)
114+
return F.one_hot_dna(seq,
115+
alphabet=self.alphabet,
116+
neutral_alphabet=self.neutral_alphabet,
117+
neutral_value=self.neutral_value,
118+
dtype=self.dtype)
115119
else:
116120
return F.one_hot(seq,
117121
alphabet=self.alphabet,

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
cyvcf2==0.20.0
22
deprecation==2.1.0
3-
genomelake==0.1.4
43
gffutils==0.10.1
54
kipoi==0.6.25
65
kipoi_conda==0.2.2

setup.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55

66
requirements = [
77
"kipoi>=0.5.5",
8-
# "genomelake",
98
"pyfaidx",
109
"numpy",
1110
"pandas",
@@ -35,7 +34,6 @@
3534
"cython",
3635
"cyvcf2",
3736
"pyranges>=0.0.71",
38-
# "genomelake",
3937
"keras",
4038
"tensorflow",
4139
"pybedtools",

0 commit comments

Comments
 (0)