Skip to content

Commit 47a4d4e

Browse files
authored
Merge pull request #16 from kipoi/fix_remove_use_strand
don't allow use_strand
2 parents 9648ea8 + 51aaecb commit 47a4d4e

File tree

2 files changed

+22
-12
lines changed

2 files changed

+22
-12
lines changed

kipoiseq/datasets/sequence.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,12 @@ def __init__(self, tsv_file,
112112
header=None,
113113
nrows=1,
114114
sep='\t')
115-
self.n_tasks = df_peek.shape[1] - self.bed_columns
116-
assert self.n_tasks >= 0
115+
found_columns = df_peek.shape[1]
116+
self.n_tasks = found_columns - self.bed_columns
117+
if self.n_tasks < 0:
118+
raise ValueError("BedDataset requires at least {} bed columns. Found only {} columns".
119+
format(self.bed_columns, found_columns))
120+
117121
self.df = pd.read_table(self.tsv_file,
118122
header=None,
119123
dtype={i: d
@@ -181,8 +185,8 @@ class SeqStringDataset(Dataset):
181185
doc: None, required sequence length.
182186
# max_seq_len:
183187
# doc: maximum allowed sequence length
184-
use_strand:
185-
doc: reverse-complement fasta sequence if bed file defines negative strand
188+
# use_strand:
189+
# doc: reverse-complement fasta sequence if bed file defines negative strand
186190
force_upper:
187191
doc: Force uppercase output of sequences
188192
ignore_targets:
@@ -214,20 +218,27 @@ def __init__(self,
214218
label_dtype=None,
215219
auto_resize_len=None,
216220
# max_seq_len=None,
217-
use_strand=False,
221+
# use_strand=False,
218222
force_upper=True,
219223
ignore_targets=False):
220224

221225
self.num_chr_fasta = num_chr_fasta
222226
self.intervals_file = intervals_file
223227
self.fasta_file = fasta_file
224228
self.auto_resize_len = auto_resize_len
225-
self.use_strand = use_strand
229+
# self.use_strand = use_strand
226230
self.force_upper = force_upper
227231
# self.max_seq_len = max_seq_len
228232

233+
# if use_strand:
234+
# # require a 6-column bed-file if strand is used
235+
# bed_columns = 6
236+
# else:
237+
# bed_columns = 3
238+
229239
self.bed = BedDataset(self.intervals_file,
230240
num_chr=self.num_chr_fasta,
241+
bed_columns=3,
231242
label_dtype=parse_dtype(label_dtype),
232243
ignore_targets=ignore_targets)
233244
self.fasta_extractors = None
@@ -237,7 +248,7 @@ def __len__(self):
237248

238249
def __getitem__(self, idx):
239250
if self.fasta_extractors is None:
240-
self.fasta_extractors = FastaStringExtractor(self.fasta_file, use_strand=self.use_strand,
251+
self.fasta_extractors = FastaStringExtractor(self.fasta_file, use_strand=False, # self.use_strand,
241252
force_upper=self.force_upper)
242253

243254
interval, labels = self.bed[idx]
@@ -303,8 +314,8 @@ class SeqDataset(Dataset):
303314
doc: None, datatype of the task labels taken from the intervals_file. Allowed - 'string', 'int', 'float', 'bool'
304315
auto_resize_len:
305316
doc: None, required sequence length.
306-
use_strand:
307-
doc: reverse-complement fasta sequence if bed file defines negative strand
317+
# use_strand:
318+
# doc: reverse-complement fasta sequence if bed file defines negative strand
308319
alphabet_axis:
309320
doc: axis along which the alphabet runs (e.g. A,C,G,T for DNA)
310321
dummy_axis:
@@ -345,7 +356,7 @@ def __init__(self,
345356
label_dtype=None,
346357
auto_resize_len=None,
347358
# max_seq_len=None,
348-
use_strand=False,
359+
# use_strand=False,
349360
alphabet_axis=1,
350361
dummy_axis=None,
351362
alphabet="ACGT",
@@ -366,7 +377,7 @@ def __init__(self,
366377
# core dataset
367378
self.seq_string_dataset = SeqStringDataset(intervals_file, fasta_file, num_chr_fasta=num_chr_fasta,
368379
label_dtype=label_dtype, auto_resize_len=auto_resize_len,
369-
use_strand=use_strand, force_upper=True,
380+
# use_strand=use_strand,
370381
ignore_targets=ignore_targets)
371382

372383
if dummy_axis is not None and alphabet_axis == dummy_axis:

tests/test_2_datasets.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,3 @@ def test_seq_dataset_reshape(alphabet_axis, dummy_axis, example_kwargs):
3737
assert reshaped.shape[i] == alphabet_len
3838
else:
3939
assert reshaped.shape[i] == seq_len
40-

0 commit comments

Comments
 (0)