@@ -112,8 +112,12 @@ def __init__(self, tsv_file,
112112 header = None ,
113113 nrows = 1 ,
114114 sep = '\t ' )
115- self .n_tasks = df_peek .shape [1 ] - self .bed_columns
116- assert self .n_tasks >= 0
115+ found_columns = df_peek .shape [1 ]
116+ self .n_tasks = found_columns - self .bed_columns
117+ if self .n_tasks < 0 :
118+ raise ValueError ("BedDataset requires at least {} bed columns. Found only {} columns" .
119+ format (self .bed_columns , found_columns ))
120+
117121 self .df = pd .read_table (self .tsv_file ,
118122 header = None ,
119123 dtype = {i : d
@@ -181,8 +185,8 @@ class SeqStringDataset(Dataset):
181185 doc: None, required sequence length.
182186 # max_seq_len:
183187 # doc: maximum allowed sequence length
184- use_strand:
185- doc: reverse-complement fasta sequence if bed file defines negative strand
188+ # use_strand:
189+ # doc: reverse-complement fasta sequence if bed file defines negative strand
186190 force_upper:
187191 doc: Force uppercase output of sequences
188192 ignore_targets:
@@ -214,20 +218,27 @@ def __init__(self,
214218 label_dtype = None ,
215219 auto_resize_len = None ,
216220 # max_seq_len=None,
217- use_strand = False ,
221+ # use_strand=False,
218222 force_upper = True ,
219223 ignore_targets = False ):
220224
221225 self .num_chr_fasta = num_chr_fasta
222226 self .intervals_file = intervals_file
223227 self .fasta_file = fasta_file
224228 self .auto_resize_len = auto_resize_len
225- self .use_strand = use_strand
229+ # self.use_strand = use_strand
226230 self .force_upper = force_upper
227231 # self.max_seq_len = max_seq_len
228232
233+ # if use_strand:
234+ # # require a 6-column bed-file if strand is used
235+ # bed_columns = 6
236+ # else:
237+ # bed_columns = 3
238+
229239 self .bed = BedDataset (self .intervals_file ,
230240 num_chr = self .num_chr_fasta ,
241+ bed_columns = 3 ,
231242 label_dtype = parse_dtype (label_dtype ),
232243 ignore_targets = ignore_targets )
233244 self .fasta_extractors = None
@@ -237,7 +248,7 @@ def __len__(self):
237248
238249 def __getitem__ (self , idx ):
239250 if self .fasta_extractors is None :
240- self .fasta_extractors = FastaStringExtractor (self .fasta_file , use_strand = self .use_strand ,
251+ self .fasta_extractors = FastaStringExtractor (self .fasta_file , use_strand = False , # self.use_strand,
241252 force_upper = self .force_upper )
242253
243254 interval , labels = self .bed [idx ]
@@ -303,8 +314,8 @@ class SeqDataset(Dataset):
303314 doc: None, datatype of the task labels taken from the intervals_file. Allowed - 'string', 'int', 'float', 'bool'
304315 auto_resize_len:
305316 doc: None, required sequence length.
306- use_strand:
307- doc: reverse-complement fasta sequence if bed file defines negative strand
317+ # use_strand:
318+ # doc: reverse-complement fasta sequence if bed file defines negative strand
308319 alphabet_axis:
309320 doc: axis along which the alphabet runs (e.g. A,C,G,T for DNA)
310321 dummy_axis:
@@ -345,7 +356,7 @@ def __init__(self,
345356 label_dtype = None ,
346357 auto_resize_len = None ,
347358 # max_seq_len=None,
348- use_strand = False ,
359+ # use_strand=False,
349360 alphabet_axis = 1 ,
350361 dummy_axis = None ,
351362 alphabet = "ACGT" ,
@@ -366,7 +377,7 @@ def __init__(self,
366377 # core dataset
367378 self .seq_string_dataset = SeqStringDataset (intervals_file , fasta_file , num_chr_fasta = num_chr_fasta ,
368379 label_dtype = label_dtype , auto_resize_len = auto_resize_len ,
369- use_strand = use_strand , force_upper = True ,
380+ # use_strand=use_strand,
370381 ignore_targets = ignore_targets )
371382
372383 if dummy_axis is not None and alphabet_axis == dummy_axis :
0 commit comments