Merge branch 'master' into fix_remove_use_strand

Avsecz · web-flow · commit 51aaecb301eb · 2018-10-27T12:40:27.000+02:00
diff --git a/.pep8speaks.yml b/.pep8speaks.yml
@@ -0,0 +1,5 @@
+pycodestyle:
+    max-line-length: 140  # Default is 79 in PEP8
+    ignore:  # Errors and warnings to ignore
+        - E111
+        - E731
diff --git a/kipoiseq/datasets/sequence.py b/kipoiseq/datasets/sequence.py
@@ -8,6 +8,7 @@
 from kipoi.plugin import is_installed
 from kipoi.data import Dataset, kipoi_dataloader
 from kipoi.specs import Author, Dependencies
+from kipoi.utils import default_kwargs
 from six import string_types
 
 
@@ -72,6 +73,7 @@ class BedDataset(object):
       incl_chromosomes: exclusive list of chromosome names to include in the final dataset.
         if not None, only these will be present in the dataset
       excl_chromosomes: list of chromosome names to omit from the dataset.
+      ignore_targets: if True, target variables are ignored
     """
 
     # bed types accorging to
@@ -95,14 +97,16 @@ def __init__(self, tsv_file,
                  num_chr=False,
                  ambiguous_mask=None,
                  incl_chromosomes=None,
-                 excl_chromosomes=None):
+                 excl_chromosomes=None,
+                 ignore_targets=False):
         self.tsv_file = tsv_file
         self.bed_columns = bed_columns
         self.num_chr = num_chr
         self.label_dtype = label_dtype
         self.ambiguous_mask = ambiguous_mask
         self.incl_chromosomes = incl_chromosomes
         self.excl_chromosomes = excl_chromosomes
+        self.ignore_targets = ignore_targets
 
         df_peek = pd.read_table(self.tsv_file,
                                 header=None,
@@ -141,7 +145,7 @@ def __getitem__(self, idx):
         row = self.df.iloc[idx]
         interval = pybedtools.create_interval_from_list([to_scalar(x) for x in row.iloc[:self.bed_columns]])
 
-        if self.n_tasks == 0:
+        if self.ignore_targets or self.n_tasks == 0:
             labels = {}
         else:
             labels = row.iloc[self.bed_columns:].values.astype(self.label_dtype)
@@ -185,6 +189,8 @@ class SeqStringDataset(Dataset):
         #     doc: reverse-complement fasta sequence if bed file defines negative strand
         force_upper:
             doc: Force uppercase output of sequences
+        ignore_targets:
+            doc: if True, don't return any target variables
     output_schema:
         inputs:
             name: seq
@@ -213,7 +219,8 @@ def __init__(self,
                  auto_resize_len=None,
                  # max_seq_len=None,
                  # use_strand=False,
-                 force_upper=True):
+                 force_upper=True,
+                 ignore_targets=False):
 
         self.num_chr_fasta = num_chr_fasta
         self.intervals_file = intervals_file
@@ -232,7 +239,8 @@ def __init__(self,
         self.bed = BedDataset(self.intervals_file,
                               num_chr=self.num_chr_fasta,
                               bed_columns=3,
-                              label_dtype=parse_dtype(label_dtype))
+                              label_dtype=parse_dtype(label_dtype),
+                              ignore_targets=ignore_targets)
         self.fasta_extractors = None
 
     def __len__(self):
@@ -265,15 +273,12 @@ def __getitem__(self, idx):
         }
 
     @classmethod
-    def default_shape(cls):
-        # correct the output schema - TODO - required?
-        # self.output_schema_params = deepcopy(self.output_schema_params)
-        # self.output_schema_params['inputs_shape'] = (1,)
-        # if self.bed.n_tasks != 0:
-        #     self.output_schema_params['targets_shape'] = (self.bed.n_tasks,)
-
-        # self.output_schema = get_seq_dataset_output_schema(**self.output_schema_params)
-        pass
+    def get_output_schema(cls):
+        """Return `cls.output_schema`, clearing `targets` when `ignore_targets` defaults to True."""
+        if default_kwargs(cls)['ignore_targets']:
+            # NOTE: this assigns on the schema object stored on the class itself
+            cls.output_schema.targets = None
+        return cls.output_schema
 
 
 # TODO - check lzamparo's dataloader:
@@ -320,7 +325,9 @@ class SeqDataset(Dataset):
                 alphabet to use for the one-hot encoding. This defines the order of the one-hot encoding.
                 Can either be a list or a string: 'DNA', 'RNA', 'AMINO_ACIDS'.
         dtype:
-            doc: defines the numpy dtype of the returned array. 
+            doc: defines the numpy dtype of the returned array.
+        ignore_targets:
+            doc: if True, don't return any target variables
 
     output_schema:
         inputs:
@@ -353,7 +360,9 @@ def __init__(self,
                  alphabet_axis=1,
                  dummy_axis=None,
                  alphabet="ACGT",
+                 ignore_targets=False,
                  dtype=None):
+        # TODO - add disable target loading to manage the Basenji case
 
         # make sure the alphabet axis and the dummy axis are valid:
         assert alphabet_axis >= 0 and (alphabet_axis < 2 or (alphabet_axis <= 2 and dummy_axis is not None))
@@ -369,13 +378,18 @@ def __init__(self,
         self.seq_string_dataset = SeqStringDataset(intervals_file, fasta_file, num_chr_fasta=num_chr_fasta,
                                                    label_dtype=label_dtype, auto_resize_len=auto_resize_len,
                                                    # use_strand=use_strand,
-                                                   force_upper=True)
+                                                   ignore_targets=ignore_targets)
+
+        if dummy_axis is not None and alphabet_axis == dummy_axis:
+            raise ValueError("alphabet_axis can't be the same as dummy_axis")
 
         # set the transform parameters correctly
-        existing_alphabet_axis = 1
         if dummy_axis is not None and dummy_axis < 2:
             # dummy axis is added somewhere in the middle, so the alphabet axis is at the end now
             existing_alphabet_axis = 2
+        else:
+            # alphabet axis stayed the same
+            existing_alphabet_axis = 1
 
         # check if no swapping needed
         if existing_alphabet_axis == self.alphabet_axis:
@@ -396,18 +410,47 @@ def __getitem__(self, idx):
         ret['inputs'] = self.input_tranform(str(ret["inputs"]))
         return ret
 
-    # TODO - compute the output shape based on the default value of parameters
-    #         - executed in kipoi_dataloader
-    # TODO - how to specify the shape properly when using differnet default parameters?
-    #         - example: Basset dataloader
     @classmethod
-    def default_shape(cls):
-        # setup output schema
-        # self.output_schema_params = deepcopy(self.output_schema_params)
-
-        # self.output_schema_params['inputs_shape'] = get_onehot_shape(self.alphabet_axis, self.dummy_axis,
-        #                                                              self.auto_resize_len, self.alphabet)
-        # if self.bed.n_tasks != 0:
-        #     self.output_schema_params['targets_shape'] = (self.bed.n_tasks,)
-        # self.output_schema = get_seq_dataset_output_schema(**self.output_schema_params)
-        pass
+    def get_output_schema(cls):
+        """Return `cls.output_schema` adjusted to the default keyword arguments.
+
+        Sets `inputs.shape` and, if `ignore_targets` is True, `targets = None` on the class schema.
+        """
+        kwargs = default_kwargs(cls)
+        n_channels = len(kwargs['alphabet'])
+        seqlen = kwargs['auto_resize_len']
+        dummy_axis = kwargs['dummy_axis']
+        alphabet_axis = kwargs['alphabet_axis']
+        ignore_targets = kwargs['ignore_targets']
+
+        if ignore_targets:
+            cls.output_schema.targets = None
+
+        if dummy_axis is not None and alphabet_axis == dummy_axis:
+            raise ValueError("alphabet_axis can't be the same as dummy_axis")
+
+        # default layout: (sequence length, alphabet size)
+        input_shape = (seqlen, n_channels)
+
+        if dummy_axis is not None and dummy_axis < 2:
+            # dummy axis is added somewhere in the middle, so the alphabet axis is at the end now
+            existing_alphabet_axis = 2
+        else:
+            existing_alphabet_axis = 1
+
+        if existing_alphabet_axis == alphabet_axis:
+            alphabet_axis = None
+
+        # inject the dummy axis
+        if dummy_axis is not None:
+            input_shape = input_shape[:dummy_axis] + (1,) + input_shape[dummy_axis:]
+
+        # swap axes
+        if alphabet_axis is not None:
+            sh = list(input_shape)
+            sh[alphabet_axis], sh[existing_alphabet_axis] = sh[existing_alphabet_axis], sh[alphabet_axis]
+            input_shape = tuple(sh)
+
+        # now, modify the input schema
+        cls.output_schema.inputs.shape = input_shape
+        return cls.output_schema
diff --git a/kipoiseq/utils.py b/kipoiseq/utils.py
@@ -1,5 +1,4 @@
 import numpy as np
-from pybedtools import Interval
 
 
 # alphabets:
diff --git a/tests/datasets/test_sequence.py b/tests/datasets/test_sequence.py
@@ -1,7 +1,9 @@
 import os
 import numpy as np
 import pytest
+from copy import deepcopy
 from pybedtools import Interval
+from kipoi.utils import override_default_kwargs
 from kipoiseq.transforms.functional import one_hot_dna
 from kipoiseq.datasets.sequence import SeqStringDataset, SeqDataset, parse_dtype, BedDataset
 
@@ -86,3 +88,35 @@ def test_examples_exist(cls):
         dl_entries += 1
     assert dl_entries == len(ex)
     assert len(ex) == bed_entries
+
+
+def test_output_shape():
+    Dl = deepcopy(SeqDataset)
+    assert Dl.get_output_schema().inputs.shape == (None, 4)
+    override_default_kwargs(Dl, {"auto_resize_len": 100})
+    assert Dl.get_output_schema().inputs.shape == (100, 4)
+
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "dummy_axis": 1, "alphabet_axis": 2})
+    assert Dl.get_output_schema().inputs.shape == (100, 1, 4)
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "dummy_axis": None, "alphabet_axis": 1})  # reset
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "dummy_axis": 2})
+    assert Dl.get_output_schema().inputs.shape == (100, 4, 1)
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "dummy_axis": None, "alphabet_axis": 1})  # reset
+
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "alphabet": "ACGTD"})
+    assert Dl.get_output_schema().inputs.shape == (100, 5)
+    override_default_kwargs(Dl, {"auto_resize_len": 100, "alphabet": "ACGT"})  # reset
+
+    override_default_kwargs(Dl, {"auto_resize_len": 160, "dummy_axis": 2, "alphabet_axis": 0})
+    assert Dl.get_output_schema().inputs.shape == (4, 160, 1)
+
+    override_default_kwargs(Dl, {"auto_resize_len": 160, "dummy_axis": 2, "alphabet_axis": 1})
+    assert Dl.get_output_schema().inputs.shape == (160, 4, 1)
+    targets = Dl.get_output_schema().targets
+    assert targets.shape == (None,)
+
+    override_default_kwargs(Dl, {"ignore_targets": True})
+    assert Dl.get_output_schema().targets is None
+    # reset every overridden default: deepcopy() returns classes unchanged, so `Dl` is SeqDataset itself
+    override_default_kwargs(Dl, {"ignore_targets": False, "auto_resize_len": None, "dummy_axis": None, "alphabet_axis": 1})
+    Dl.output_schema.targets = targets

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`import numpy as np`
`2`		`-from pybedtools import Interval`
`3`	`2`
`4`	`3`
`5`	`4`	`# alphabets:`