Merge pull request #99 from kipoi/add-one-hot-encode

haimasree · web-flow · commit b5c9b5f043d3 · 2021-08-03T17:03:34.000+02:00
Add one-hot encoding from Enformer
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -28,7 +28,6 @@ variables:
       name: Install pip dependencies
       command: |
         source activate kipoi-dev
-        pip install genomelake --no-deps
         pip install pyfaidx
   install_conda_deps: &install_conda_deps
     run:
@@ -38,7 +37,7 @@ variables:
         apt-get update -y
         apt-get install build-essential -y
         conda install -y cython
-        conda install -y -c bioconda cyvcf2 pybedtools genomelake pyfaidx biopython
+        conda install -y -c bioconda cyvcf2 pybedtools pyfaidx biopython
   install_kipoi: &install_kipoi
     run:
       name: Install Kipoi
diff --git a/README.md b/README.md
@@ -68,4 +68,4 @@ More info:
 - Read the pytorch [Data Loading and Processing Tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) to become more familiar with transforms and dataloaders
 - Read the code for `SeqIntervalDl` in [kipoiseq/dataloaders/sequence.py](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/dataloaders/sequence.py)
   - you can skip the `@kipoi_dataloader` and the long yaml doc-string. These are only required if you want to use dataloaders in Kipoi's model.yaml files.
-- Explore the available transforms ([functional](http://kipoi.org/kipoiseq/transforms/functional/), [class-based](http://kipoi.org/kipoiseq/transforms/transforms/)) or extractors ([kipoiseq](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/extractors.py), [genomelake](https://github.com/kundajelab/genomelake/blob/master/genomelake/extractors.py))
+- Explore the available transforms ([functional](http://kipoi.org/kipoiseq/transforms/functional/), [class-based](http://kipoi.org/kipoiseq/transforms/transforms/)) or extractors ([kipoiseq](https://github.com/kipoi/kipoiseq/blob/master/kipoiseq/extractors.py))
diff --git a/kipoiseq/dataloaders/sequence.py b/kipoiseq/dataloaders/sequence.py
@@ -12,8 +12,6 @@
 from kipoiseq.utils import to_scalar, parse_dtype
 from kipoiseq.dataclasses import Interval
 
-# general dependencies
-# bioconda::genomelake', TODO - add genomelake again once it gets released with pyfaidx to bioconda
 deps = Dependencies(conda=['bioconda::pybedtools', 'bioconda::pyfaidx', 'bioconda::pyranges', 'numpy', 'pandas'],
                     pip=['kipoiseq'])
 package_authors = [Author(name='Ziga Avsec', github='avsecz'),
diff --git a/kipoiseq/dataloaders/splicing.py b/kipoiseq/dataloaders/splicing.py
@@ -7,8 +7,6 @@
 from pyfaidx import Fasta
 import pickle
 
-# general dependencies
-# bioconda::genomelake', TODO - add genomelake again once it gets released with pyfaidx to bioconda
 deps = Dependencies(conda=['bioconda::pyfaidx', 'numpy', 'pandas'],
                     pip=['kipoiseq', 'kipoi'])
 package_authors = [Author(name='Jun Cheng', github='s6juncheng')]
diff --git a/kipoiseq/extractors/fasta.py b/kipoiseq/extractors/fasta.py
@@ -69,7 +69,6 @@ def close(self):
 #     """
 
 #     def __init__(self, bigwig_file):
-#         from genomelake.extractors import BigwigExtractor
 
 #         self.bigwig_file = bigwig_file
 #         self.batch_extractor = BigwigExtractor(self.bigwig_file)
diff --git a/kipoiseq/transforms/functional.py b/kipoiseq/transforms/functional.py
@@ -2,17 +2,12 @@
 from __future__ import absolute_import
 from __future__ import print_function
 
+from typing import Any
 from kipoiseq.utils import DNA
 from copy import deepcopy
 import numpy as np
 from six import string_types
 
-try:
-    # use the fast genomelake's one-hot-encode if it's installed
-    from genomelake.util import one_hot_encode_sequence
-except ImportError:
-    one_hot_encode_sequence = None
-
 
 # sequence -> array
 
@@ -119,21 +114,23 @@ def one_hot(seq, alphabet=DNA, neutral_alphabet=['N'], neutral_value=.25, dtype=
         raise ValueError("seq needs to be a string")
     return token2one_hot(tokenize(seq, alphabet, neutral_alphabet), len(alphabet), neutral_value, dtype=dtype)
 
-
-def one_hot_dna(seq, dtype=None):
-    """One-hot encode DNA sequence
-    """
+# Reference: https://github.com/deepmind/deepmind-research/blob/fa8c9be4bb0cfd0b8492203eb2a9f31ef995633c/enformer/enformer.py#L306-L318
+def one_hot_dna(seq: str,
+                alphabet: list = DNA,
+                neutral_alphabet: str = 'N',
+                neutral_value: Any = 0.25,
+                dtype=np.float32) -> np.ndarray:
+    """One-hot encode sequence."""
     if not isinstance(seq, str):
-        raise ValueError("seq needs to be a string")
-
-    if one_hot_encode_sequence is not None:
-        # genomelake's one_hot_encode_sequence could be imported
-        out = np.zeros((len(seq), 4), dtype=np.float32)
-        one_hot_encode_sequence(seq, out)
-        return out.astype(dtype)
-    else:
-        return one_hot(seq, alphabet=DNA, neutral_alphabet=['N'], neutral_value=.25, dtype=dtype)
-
+        raise ValueError("sequence needs to be a string")
+    def to_uint8(string):
+        return np.frombuffer(string.encode('ascii'), dtype=np.uint8)
+    
+    hash_table = np.zeros((np.iinfo(np.uint8).max, len(alphabet)), dtype=dtype)
+    hash_table[to_uint8(''.join(alphabet))] = np.eye(len(alphabet), dtype=dtype)
+    hash_table[to_uint8(''.join(neutral_alphabet))] = neutral_value
+    hash_table = hash_table.astype(dtype)
+    return hash_table[to_uint8(seq)]
 
 # sequence trimming
 
diff --git a/kipoiseq/transforms/transforms.py b/kipoiseq/transforms/transforms.py
@@ -111,7 +111,11 @@ def __init__(self, alphabet=DNA, neutral_alphabet='N', neutral_value=0.25, dtype
 
     def __call__(self, seq):
         if self.alphabet == DNA and self.neutral_alphabet == ['N'] and self.neutral_value == 0.25:
-            return F.one_hot_dna(seq, self.dtype)
+            return F.one_hot_dna(seq, 
+                                alphabet=self.alphabet, 
+                                neutral_alphabet=self.neutral_alphabet, 
+                                neutral_value=self.neutral_value, 
+                                dtype=self.dtype)
         else:
             return F.one_hot(seq,
                              alphabet=self.alphabet,
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 cyvcf2==0.20.0
 deprecation==2.1.0
-genomelake==0.1.4
 gffutils==0.10.1
 kipoi==0.6.25
 kipoi_conda==0.2.2
diff --git a/setup.py b/setup.py
@@ -5,7 +5,6 @@
 
 requirements = [
     "kipoi>=0.5.5",
-    # "genomelake",
     "pyfaidx",
     "numpy",
     "pandas",
@@ -35,7 +34,6 @@
     "cython",
     "cyvcf2",
     "pyranges>=0.0.71",
-    # "genomelake",
     "keras",
     "tensorflow",
     "pybedtools",