Skip to content

Commit 1c094c7

Browse files
authored
feat: Return chromsizes tileset info (#158)
* feat: Return chromsizes tileset info * Updated the CHANGELOG * Fix linting and add smart_open dependency * Add docstring to tileset_info function * Test using file-like object * Load file as binary * Removed TODO line
1 parent c686398 commit 1c094c7

File tree

6 files changed

+74
-11
lines changed

6 files changed

+74
-11
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
v0.20.3
2+
3+
- Add chromsizes tileset_info function
4+
15
v0.20.2
26

37
- Convert cooler chromsizes to int64 to prevent overflow error with recent versions of h5py and numpy

clodius/models/tileset_info.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from typing import List, Optional
2+
3+
from pydantic import BaseModel
4+
5+
6+
class TilesetInfo(BaseModel):
    """Validated shape of a higlass tileset info object.

    Attributes are documented inline below. ``chromsizes`` is the only
    optional field.
    """

    # Total width of the tileset's coordinate space.
    max_width: int
    # Lower bound of the coordinate space, one entry per dimension.
    min_pos: List[int]
    # Upper bound of the coordinate space, one entry per dimension.
    max_pos: List[int]
    # Optional list of [name, size] entries. The explicit ``None`` default
    # keeps the field optional under pydantic v2, where ``Optional[...]``
    # without a default is a *required* field.
    chromsizes: Optional[List] = None

clodius/tiles/chromsizes.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,61 @@
1-
import csv
21
import logging
2+
from smart_open import open
33

44
logger = logging.getLogger(__name__)
55

66

7-
def get_tsv_chromsizes(filename):
7+
def tileset_info(filename) -> dict:
    """Return a standard higlass tileset info object that contains
    chromsizes as an element.

    Parameters
    ----------
    filename: string or file-like object
        Passed straight through to ``get_tsv_chromsizes``, so it may be
        either a path or an already-open file-like object.

    Returns
    -------
    dict
        A dictionary with the keys ``max_width``, ``chromsizes``,
        ``min_pos`` and ``max_pos``. ``max_width`` is the sum of all
        chromosome sizes, ``min_pos`` is ``[0]`` and ``max_pos`` is
        ``[max_width]``.

        The chromsizes in the returned object will be a list of
        [name, size] pairs with integer sizes:

        [
            ['chr1', 1000],
            ['chr2', 2000]
        ]
    """
    rows = get_tsv_chromsizes(filename)

    # Convert each size exactly once and reuse the result for both the
    # returned list and the total width.
    chromsizes = [[row[0], int(row[1])] for row in rows]
    max_width = sum(size for _, size in chromsizes)

    return {
        "max_width": max_width,
        "chromsizes": chromsizes,
        "min_pos": [0],
        "max_pos": [max_width],
    }
28+
29+
30+
def get_tsv_chromsizes(file):
    """
    Get a list of chromosome sizes from this [presumably] tsv
    chromsizes file.

    Parameters:
    -----------
    file: string or file-like object
        Either a path that this module's ``open`` can open, or an
        already-open file-like object supporting ``seek`` and ``read``.
        File-like objects may yield bytes (decoded as UTF-8) or text.
        A file-like object passed in by the caller is left open.

    Returns
    -------
    chromsizes: [[name:string, size:string], ...]
        An ordered list of the tab-separated columns of every non-blank
        line. Values are returned as strings; callers convert sizes.

    Raises
    ------
    Exception
        If the contents cannot be read, decoded or split. The original
        error is attached as ``__cause__``.
    """
    if isinstance(file, str):
        # Open paths ourselves and close them again when done; errors from
        # opening the path propagate unchanged, as before.
        with open(file, "rb") as handle:
            return get_tsv_chromsizes(handle)

    try:
        # Rewind so a handle that was already read can be parsed again.
        file.seek(0)
        raw_data = file.read()

        # Handles opened in text mode already yield str; only bytes need
        # decoding.
        if isinstance(raw_data, (bytes, bytearray)):
            text_data = raw_data.decode("utf-8")
        else:
            text_data = raw_data

        lines = text_data.split("\n")
        return [line.strip().split("\t") for line in lines if line.strip()]
    except Exception as ex:
        logger.error(ex)

        err_msg = "WHAT?! Could not load file %s. (%s)" % (
            getattr(file, "name", file),
            ex,
        )

        raise Exception(err_msg) from ex

get_test_data.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/chromSizes.tsv
12
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/all.KL.bed.multires.mv5
23
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/Dixon2012-J1-NcoI-R1-filtered.100kb.multires.cool
34
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/hic-resolutions.cool

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies = [
2525
"slugid",
2626
"sortedcontainers",
2727
"tqdm",
28+
"smart_open"
2829
]
2930
license = { text = "MIT" }
3031
readme = "README.md"

test/tiles/chromsizes_test.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import os.path as op
2+
3+
import clodius.tiles.chromsizes as ctcs
4+
from clodius.models.tileset_info import TilesetInfo
5+
6+
7+
def test_get_tileset_info():
    """Check that tileset info loads from both a path and a file object."""
    path = op.join("data", "chromSizes.tsv")

    def check(source):
        # The dict must validate against the model and look non-trivial.
        info = TilesetInfo(**ctcs.tileset_info(source))
        assert info.max_width > 100
        assert len(info.chromsizes) > 2

    # Test loading tileset info using a filename
    check(path)

    # Test loading using a file-like object
    with open(path, "rb") as handle:
        check(handle)

0 commit comments

Comments
 (0)