Skip to content

Commit 96efedc

Browse files
committed
don't allow use_strand
1 parent cc08b97 commit 96efedc

File tree

2 files changed

+25 additions
-13 deletions

kipoiseq/datasets/sequence.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def parse_alphabet(alphabet):
4747
else:
4848
return alphabet
4949

50+
5051
def parse_type(dtype):
5152
if isinstance(dtype, string_types):
5253
if dtype in dir(np):
@@ -107,8 +108,12 @@ def __init__(self, tsv_file,
107108
header=None,
108109
nrows=1,
109110
sep='\t')
110-
self.n_tasks = df_peek.shape[1] - self.bed_columns
111-
assert self.n_tasks >= 0
111+
found_columns = df_peek.shape[1]
112+
self.n_tasks = found_columns - self.bed_columns
113+
if self.n_tasks < 0:
114+
raise ValueError("BedDataset requires at least {} bed columns. Found only {} columns".
115+
format(self.bed_columns, found_columns))
116+
112117
self.df = pd.read_table(self.tsv_file,
113118
header=None,
114119
dtype={i: d
@@ -176,8 +181,8 @@ class SeqStringDataset(Dataset):
176181
doc: None, required sequence length.
177182
# max_seq_len:
178183
# doc: maximum allowed sequence length
179-
use_strand:
180-
doc: reverse-complement fasta sequence if bed file defines negative strand
184+
# use_strand:
185+
# doc: reverse-complement fasta sequence if bed file defines negative strand
181186
force_upper:
182187
doc: Force uppercase output of sequences
183188
output_schema:
@@ -207,19 +212,26 @@ def __init__(self,
207212
label_dtype=None,
208213
auto_resize_len=None,
209214
# max_seq_len=None,
210-
use_strand=False,
215+
# use_strand=False,
211216
force_upper=True):
212217

213218
self.num_chr_fasta = num_chr_fasta
214219
self.intervals_file = intervals_file
215220
self.fasta_file = fasta_file
216221
self.auto_resize_len = auto_resize_len
217-
self.use_strand = use_strand
222+
# self.use_strand = use_strand
218223
self.force_upper = force_upper
219224
# self.max_seq_len = max_seq_len
220225

226+
# if use_strand:
227+
# # require a 6-column bed-file if strand is used
228+
# bed_columns = 6
229+
# else:
230+
# bed_columns = 3
231+
221232
self.bed = BedDataset(self.intervals_file,
222233
num_chr=self.num_chr_fasta,
234+
bed_columns=3,
223235
label_dtype=parse_dtype(label_dtype))
224236
self.fasta_extractors = None
225237

@@ -228,7 +240,7 @@ def __len__(self):
228240

229241
def __getitem__(self, idx):
230242
if self.fasta_extractors is None:
231-
self.fasta_extractors = FastaStringExtractor(self.fasta_file, use_strand=self.use_strand,
243+
self.fasta_extractors = FastaStringExtractor(self.fasta_file, use_strand=False, # self.use_strand,
232244
force_upper=self.force_upper)
233245

234246
interval, labels = self.bed[idx]
@@ -297,8 +309,8 @@ class SeqDataset(Dataset):
297309
doc: None, datatype of the task labels taken from the intervals_file. Allowed - string', 'int', 'float', 'bool'
298310
auto_resize_len:
299311
doc: None, required sequence length.
300-
use_strand:
301-
doc: reverse-complement fasta sequence if bed file defines negative strand
312+
# use_strand:
313+
# doc: reverse-complement fasta sequence if bed file defines negative strand
302314
alphabet_axis:
303315
doc: axis along which the alphabet runs (e.g. A,C,G,T for DNA)
304316
dummy_axis:
@@ -309,7 +321,7 @@ class SeqDataset(Dataset):
309321
Can either be a list or a string: 'DNA', 'RNA', 'AMINO_ACIDS'.
310322
dtype:
311323
doc: defines the numpy dtype of the returned array.
312-
324+
313325
output_schema:
314326
inputs:
315327
name: seq
@@ -337,7 +349,7 @@ def __init__(self,
337349
label_dtype=None,
338350
auto_resize_len=None,
339351
# max_seq_len=None,
340-
use_strand=False,
352+
# use_strand=False,
341353
alphabet_axis=1,
342354
dummy_axis=None,
343355
alphabet="ACGT",
@@ -356,7 +368,8 @@ def __init__(self,
356368
# core dataset
357369
self.seq_string_dataset = SeqStringDataset(intervals_file, fasta_file, num_chr_fasta=num_chr_fasta,
358370
label_dtype=label_dtype, auto_resize_len=auto_resize_len,
359-
use_strand=use_strand, force_upper=True)
371+
# use_strand=use_strand,
372+
force_upper=True)
360373

361374
# set the transform parameters correctly
362375
existing_alphabet_axis = 1

tests/test_2_datasets.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,3 @@ def test_seq_dataset_reshape(alphabet_axis, dummy_axis, example_kwargs):
3737
assert reshaped.shape[i] == alphabet_len
3838
else:
3939
assert reshaped.shape[i] == seq_len
40-

0 commit comments

Comments
 (0)