@@ -47,6 +47,7 @@ def parse_alphabet(alphabet):
4747 else :
4848 return alphabet
4949
50+
5051def parse_type (dtype ):
5152 if isinstance (dtype , string_types ):
5253 if dtype in dir (np ):
@@ -107,8 +108,12 @@ def __init__(self, tsv_file,
107108 header = None ,
108109 nrows = 1 ,
109110 sep = '\t ' )
110- self .n_tasks = df_peek .shape [1 ] - self .bed_columns
111- assert self .n_tasks >= 0
111+ found_columns = df_peek .shape [1 ]
112+ self .n_tasks = found_columns - self .bed_columns
113+ if self .n_tasks < 0 :
114+ raise ValueError ("BedDataset requires at least {} bed columns. Found only {} columns" .
115+ format (self .bed_columns , found_columns ))
116+
112117 self .df = pd .read_table (self .tsv_file ,
113118 header = None ,
114119 dtype = {i : d
@@ -176,8 +181,8 @@ class SeqStringDataset(Dataset):
176181 doc: None, required sequence length.
177182 # max_seq_len:
178183 # doc: maximum allowed sequence length
179- use_strand:
180- doc: reverse-complement fasta sequence if bed file defines negative strand
184+ # use_strand:
185+ # doc: reverse-complement fasta sequence if bed file defines negative strand
181186 force_upper:
182187 doc: Force uppercase output of sequences
183188 output_schema:
@@ -207,19 +212,26 @@ def __init__(self,
207212 label_dtype = None ,
208213 auto_resize_len = None ,
209214 # max_seq_len=None,
210- use_strand = False ,
215+ # use_strand=False,
211216 force_upper = True ):
212217
213218 self .num_chr_fasta = num_chr_fasta
214219 self .intervals_file = intervals_file
215220 self .fasta_file = fasta_file
216221 self .auto_resize_len = auto_resize_len
217- self .use_strand = use_strand
222+ # self.use_strand = use_strand
218223 self .force_upper = force_upper
219224 # self.max_seq_len = max_seq_len
220225
226+ # if use_strand:
227+ # # require a 6-column bed-file if strand is used
228+ # bed_columns = 6
229+ # else:
230+ # bed_columns = 3
231+
221232 self .bed = BedDataset (self .intervals_file ,
222233 num_chr = self .num_chr_fasta ,
234+ bed_columns = 3 ,
223235 label_dtype = parse_dtype (label_dtype ))
224236 self .fasta_extractors = None
225237
@@ -228,7 +240,7 @@ def __len__(self):
228240
229241 def __getitem__ (self , idx ):
230242 if self .fasta_extractors is None :
231- self .fasta_extractors = FastaStringExtractor (self .fasta_file , use_strand = self .use_strand ,
243+ self .fasta_extractors = FastaStringExtractor (self .fasta_file , use_strand = False , # self.use_strand,
232244 force_upper = self .force_upper )
233245
234246 interval , labels = self .bed [idx ]
@@ -297,8 +309,8 @@ class SeqDataset(Dataset):
297309 doc: None, datatype of the task labels taken from the intervals_file. Allowed - 'string', 'int', 'float', 'bool'
298310 auto_resize_len:
299311 doc: None, required sequence length.
300- use_strand:
301- doc: reverse-complement fasta sequence if bed file defines negative strand
312+ # use_strand:
313+ # doc: reverse-complement fasta sequence if bed file defines negative strand
302314 alphabet_axis:
303315 doc: axis along which the alphabet runs (e.g. A,C,G,T for DNA)
304316 dummy_axis:
@@ -309,7 +321,7 @@ class SeqDataset(Dataset):
309321 Can either be a list or a string: 'DNA', 'RNA', 'AMINO_ACIDS'.
310322 dtype:
311323 doc: defines the numpy dtype of the returned array.
312-
324+
313325 output_schema:
314326 inputs:
315327 name: seq
@@ -337,7 +349,7 @@ def __init__(self,
337349 label_dtype = None ,
338350 auto_resize_len = None ,
339351 # max_seq_len=None,
340- use_strand = False ,
352+ # use_strand=False,
341353 alphabet_axis = 1 ,
342354 dummy_axis = None ,
343355 alphabet = "ACGT" ,
@@ -356,7 +368,8 @@ def __init__(self,
356368 # core dataset
357369 self .seq_string_dataset = SeqStringDataset (intervals_file , fasta_file , num_chr_fasta = num_chr_fasta ,
358370 label_dtype = label_dtype , auto_resize_len = auto_resize_len ,
359- use_strand = use_strand , force_upper = True )
371+ # use_strand=use_strand,
372+ force_upper = True )
360373
361374 # set the transform parameters correctly
362375 existing_alphabet_axis = 1
0 commit comments