Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/grelu/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from torch.utils.data import Dataset

from grelu.data.augment import Augmenter, _split_overall_idx
from grelu.data.preprocess import check_chrom_ends
from grelu.data.utils import _check_multiclass, _create_task_data
from grelu.sequence.format import (
INDEX_TO_BASE_HASH,
Expand Down Expand Up @@ -152,6 +153,7 @@ def _load_seqs(self, seqs: Union[str, Sequence, pd.DataFrame, np.ndarray]) -> No
seqs = resize(seqs, seq_len=self.padded_seq_len, end=self.end)

if get_input_type(seqs) == "intervals":
check_chrom_ends(seqs, genome=self.genome)
self.intervals = seqs
self.chroms = list(set(self.intervals.chrom))
else:
Expand Down Expand Up @@ -604,6 +606,7 @@ def _load_seqs(self, variants: pd.DataFrame) -> None:

self.padded_seq_len = self.seq_len + (2 * self.max_seq_shift)
self.intervals = variants_to_intervals(variants, seq_len=self.padded_seq_len)
check_chrom_ends(self.intervals, genome=self.genome)
self.seqs = convert_input_type(self.intervals, "indices", genome=self.genome)

def __len__(self) -> int:
Expand Down Expand Up @@ -711,6 +714,7 @@ def _load_seqs(self, variants: pd.DataFrame) -> None:

self.padded_seq_len = self.seq_len + (2 * self.max_seq_shift)
self.intervals = variants_to_intervals(variants, seq_len=self.padded_seq_len)
check_chrom_ends(self.intervals, genome=self.genome)
self.seqs = convert_input_type(self.intervals, "indices", genome=self.genome)
self.n_seqs = self.seqs.shape[0]

Expand Down
46 changes: 46 additions & 0 deletions src/grelu/data/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import os
import subprocess
import warnings
from typing import Callable, List, Optional, Union

import bioframe as bf
Expand Down Expand Up @@ -373,6 +374,51 @@ def filter_blacklist(
)


def check_chrom_ends(
    data: Union[pd.DataFrame, AnnData],
    genome: Optional[str] = None,
):
    """
    Check that intervals do not exceed the ends of the chromosome.

    Args:
        data: Either a pandas dataframe of genomic intervals or an Anndata
            object with intervals in .var
        genome: name of the genome corresponding to intervals. If None, a
            warning is issued and only negative start values are flagged.

    Raises:
        TypeError: if data is neither a pandas dataframe nor an AnnData object
        ValueError: if any interval exceeds the chromosome ends
    """
    # Get genomic intervals
    if isinstance(data, pd.DataFrame):
        intervals = data
    elif isinstance(data, AnnData):
        intervals = data.var
    else:
        raise TypeError(
            "data must be a pandas DataFrame or an AnnData object, "
            f"got {type(data).__name__}."
        )

    # Check start
    flagged = intervals["start"] < 0

    # Check end if the genome is provided
    if genome is None:
        warnings.warn(
            "No genome is provided; only intervals with negative start values will be flagged."
        )
    else:
        # Imported lazily so the genome machinery is only loaded when needed
        from grelu.io.genome import read_sizes

        sizes = read_sizes(genome)
        # The first two columns are read positionally as (chromosome, size)
        size_by_chrom = dict(zip(sizes.iloc[:, 0], sizes.iloc[:, 1]))
        # Chromosomes missing from the size table map to NaN; the comparison
        # below is then False, so such intervals are not flagged.
        chrom_sizes = intervals["chrom"].astype(object).map(size_by_chrom)
        flagged = flagged | (intervals["end"] > chrom_sizes)

    # Report each offending index label once, in sorted order
    fail = np.unique(intervals.index[flagged.to_numpy()])
    if len(fail) > 0:
        raise ValueError(
            f"Indices of intervals that extend beyond the chromosome ends: {','.join(fail.astype(str))}."
        )


def filter_chrom_ends(
data: Union[pd.DataFrame, AnnData],
genome: Optional[str] = None,
Expand Down
37 changes: 26 additions & 11 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from grelu.data.preprocess import (
check_chrom_ends,
filter_blacklist,
filter_cells,
filter_chrom_ends,
Expand Down Expand Up @@ -117,22 +118,36 @@ def test_filter_blacklist():
assert filter_blacklist(intervals, genome="hg38").equals(intervals.iloc[-2:, :])


# Shared fixture for the chromosome-end tests: five 100-bp intervals on chr1.
# The first one starts at a negative coordinate; the last two sit at very
# large coordinates.
chrom_end_intervals = pd.DataFrame(
    dict(
        chrom=["chr1"] * 5,
        start=[-10, 10, 1000, 248956300, 248956350],
        end=[90, 110, 1100, 248956400, 248956450],
    )
)


def test_filter_chrom_ends():
intervals = pd.DataFrame(
{
"chrom": ["chr1", "chr1", "chr1", "chr1", "chr1"],
"start": [-10, 10, 1000, 248956300, 248956350],
"end": [90, 110, 1100, 248956400, 248956450],
}
)
assert filter_chrom_ends(intervals, genome="hg38").equals(
intervals.iloc[[1, 2, 3], :]

assert filter_chrom_ends(chrom_end_intervals, genome="hg38").equals(
chrom_end_intervals.iloc[[1, 2, 3], :]
)
assert filter_chrom_ends(intervals, genome="hg38", pad=100).equals(
intervals.iloc[[2], :]
assert filter_chrom_ends(chrom_end_intervals, genome="hg38", pad=100).equals(
chrom_end_intervals.iloc[[2], :]
)


def test_check_chrom_ends():
    """check_chrom_ends raises ValueError naming the out-of-bounds rows."""
    # The expected message names rows 0 and 4 of the shared fixture.
    # ValueError is the documented exception type, so pin it rather than
    # accepting any Exception.
    with pytest.raises(ValueError) as e_info:
        check_chrom_ends(chrom_end_intervals, genome="hg38")
    assert (
        str(e_info.value)
        == "Indices of intervals that extend beyond the chromosome ends: 0,4."
    )

    # A single in-bounds interval must pass without raising
    check_chrom_ends(chrom_end_intervals.iloc[1:2], genome="hg38")


def test_merge_intervals_by_column():
intervals = pd.DataFrame(
{
Expand Down