Skip to content

Commit f994085

Browse files
authored
Merge pull request #143 from MuhammedHasan/custom_fasta
custom fasta
2 parents 2c30964 + cc4f707 commit f994085

File tree

10 files changed

+95
-13
lines changed

10 files changed

+95
-13
lines changed

src/grelu/io/fasta.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def check_fasta(fasta_file: str) -> bool:
1818
Returns:
1919
True if the file path has a valid FASTA extension and exists, False otherwise.
2020
"""
21-
fasta_extensions = (".fa", ".fasta", ".fa.gz", ".fasta.gz")
21+
fasta_extensions = (".fa", ".fasta", ".fa.gz", ".fasta.gz", ".fa.bgz", ".fasta.bgz")
2222
return (
2323
isinstance(fasta_file, str)
2424
and fasta_file.endswith(fasta_extensions)
@@ -36,14 +36,18 @@ def read_fasta(fasta_file: str) -> List[str]:
3636
Returns:
3737
A list of DNA sequences as strings.
3838
"""
39-
from Bio import SeqIO
39+
from Bio import SeqIO, bgzf
4040

4141
assert check_fasta(fasta_file), "Input is not a valid FASTA file."
4242

4343
if fasta_file.endswith(".gz"):
4444
# Read sequences from a gzipped FASTA file
4545
with gzip.open(fasta_file, "rt") as handle:
4646
return [str(record.seq) for record in SeqIO.parse(handle, "fasta")]
47+
elif fasta_file.endswith(".bgz"):
48+
# Read sequences from a bgzipped FASTA file
49+
with bgzf.BgzfReader(fasta_file, "rt") as handle:
50+
return [str(record.seq) for record in SeqIO.parse(handle, "fasta")]
4751
else:
4852
# Read sequences from a FASTA file
4953
with open(fasta_file, "rt") as handle:

src/grelu/io/genome.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,45 @@
66
import os
77
from typing import List, Optional, Union
88

9-
import genomepy
109
import pandas as pd
10+
import pyfaidx
11+
import genomepy
12+
13+
14+
class CustomGenome:
15+
"""
16+
A custom genome object that can be used to load a genome from a file.
17+
18+
Args:
19+
genome: Path to the genome file.
20+
"""
21+
def __init__(self, genome: str):
22+
self.genome = genome
23+
self._genome = pyfaidx.Fasta(genome, rebuild=False)
24+
fai_file = genome + ".fai"
25+
if not os.path.isfile(fai_file):
26+
raise FileNotFoundError(
27+
f"Genome file {fai_file} not found. "
28+
"Please provide a genome name or a path to a chromosome sizes file. "
29+
f"Or generate one with: `samtools faidx {genome}`."
30+
)
31+
self._sizes_file = genome + ".sizes"
32+
33+
def get_seq(self, chrom: str, start: int, end: int, rc: bool = False) -> str:
34+
"""
35+
Get the sequence for a given chromosome and interval.
36+
"""
37+
return self._genome.get_seq(chrom, start, end, rc=rc)
38+
39+
@property
40+
def sizes_file(self) -> str:
41+
if not os.path.isfile(self._sizes_file):
42+
raise FileNotFoundError(
43+
f"Genome file {self._sizes_file} not found. "
44+
"Please provide a genome name or a path to a chromosome sizes file. "
45+
f"Or generate one with: `faidx -i chromsizes {self.genome} > {self._sizes_file}`."
46+
)
47+
return self._sizes_file
1148

1249

1350
def read_sizes(genome: str = "hg38") -> pd.DataFrame:
@@ -24,16 +61,13 @@ def read_sizes(genome: str = "hg38") -> pd.DataFrame:
2461
and "size" (chromosome size).
2562
"""
2663
# Get file path
27-
if not os.path.isfile(genome):
28-
genome = get_genome(genome).sizes_file
29-
30-
# Read file
64+
genome = get_genome(genome).sizes_file
3165
return pd.read_table(
3266
genome, header=None, names=["chrom", "size"], dtype={"chrom": str, "size": int}
3367
)
3468

3569

36-
def get_genome(genome: str, **kwargs) -> genomepy.Genome:
70+
def get_genome(genome: str, **kwargs) -> Union[CustomGenome, genomepy.Genome]:
3771
"""
3872
Install a genome from genomepy and load it as a Genome object
3973
@@ -44,11 +78,13 @@ def get_genome(genome: str, **kwargs) -> genomepy.Genome:
4478
Returns:
4579
Genome object
4680
"""
47-
if genome not in genomepy.list_installed_genomes():
48-
return genomepy.install_genome(genome, annotation=False, **kwargs)
81+
if os.path.isfile(genome):
82+
return CustomGenome(genome, **kwargs)
4983
else:
50-
return genomepy.Genome(genome)
51-
84+
if genome not in genomepy.list_installed_genomes():
85+
return genomepy.install_genome(genome, annotation=False, **kwargs)
86+
else:
87+
return genomepy.Genome(genome, **kwargs)
5288

5389
def read_gtf(
5490
genome: str, features: Optional[Union[str, List[str]]] = None

tests/files/test.fa

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
>seq1
2+
AAC
3+
4+
>seq2
5+
ATG

tests/files/test.fa.bgz

110 Bytes
Binary file not shown.

tests/files/test.fa.bgz.fai

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
seq1 3 6 3 4
2+
seq2 3 3145728 3 3

tests/files/test.fa.fai

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
seq1 3 6 3 4
2+
seq2 3 17 3 3

tests/files/test.fa.gz

0 Bytes
Binary file not shown.

tests/files/test.fa.sizes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
seq1 3
2+
seq2 3

tests/test_io.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
import os
22

3+
import pytest
34
import numpy as np
45
import pandas as pd
6+
import genomepy
57
from pandas.testing import assert_frame_equal
68

79
from grelu.io import read_tomtom
810
from grelu.io.bed import read_bed
911
from grelu.io.bigwig import read_bigwig
1012
from grelu.io.fasta import read_fasta
11-
from grelu.io.genome import read_sizes
13+
from grelu.io.genome import read_sizes, CustomGenome
1214
from grelu.io.motifs import read_meme_file, read_modisco_report
1315
from grelu.sequence.utils import resize
1416

@@ -20,6 +22,16 @@ def test_read_sizes():
2022
assert df.shape == (194, 2)
2123
assert df.iloc[0].to_dict() == {"chrom": "chr1", "size": 248956422}
2224

25+
with pytest.raises(FileNotFoundError):
26+
CustomGenome("tests/files/test.fa.bgz").sizes_file
27+
28+
chrom_sizes = CustomGenome("tests/files/test.fa").sizes_file
29+
assert chrom_sizes == "tests/files/test.fa.sizes"
30+
df = read_sizes("tests/files/test.fa")
31+
assert df.shape == (2, 2)
32+
assert df.iloc[0].to_dict() == {'chrom': 'seq1', 'size': 3}
33+
assert df.iloc[1].to_dict() == {'chrom': 'seq2', 'size': 3}
34+
2335

2436
def test_read_tomtom():
2537
# No q-value threshold
@@ -34,9 +46,15 @@ def test_read_tomtom():
3446

3547

3648
def test_read_fasta():
49+
fa_file = os.path.join(cwd, "files", "test.fa")
50+
assert np.all(read_fasta(fa_file) == ["AAC", "ATG"])
51+
3752
fa_file = os.path.join(cwd, "files", "test.fa.gz")
3853
assert np.all(read_fasta(fa_file) == ["AAC", "ATG"])
3954

55+
fa_file = os.path.join(cwd, "files", "test.fa.bgz")
56+
assert np.all(read_fasta(fa_file) == ["AAC", "ATG"])
57+
4058

4159
expected_intervals = pd.DataFrame(
4260
{"chrom": ["chr1", "chr1", "chr2"], "start": [1, 3, 3], "end": [4, 6, 6]}

tests/test_sequence.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,20 @@ def test_seq_formatting():
129129
convert_input_type(indices[0], "one_hot", add_batch_axis=True), batch[[0]]
130130
)
131131

132+
# Test custom genome
133+
intervals = pd.DataFrame(
134+
{"chrom": ["seq1", "seq2"], "start": [1, 1], "end": [3, 3]}
135+
)
136+
assert convert_input_type(intervals, "strings", genome="tests/files/test.fa") == ["AC", "TG"]
132137

138+
intervals = pd.DataFrame(
139+
{"chrom": ["seq1", "seq2"], "start": [1, 1], "end": [3, 3]}
140+
)
141+
assert convert_input_type(intervals, "strings", genome="tests/files/test.fa.bgz") == ["AC", "TG"]
142+
143+
intervals = pd.DataFrame(
144+
{"chrom": ["seq1", "seq2"], "start": [1, 1], "end": [3, 3]}
145+
)
133146
# Test Metrics functions
134147

135148

0 commit comments

Comments
 (0)