Skip to content

Commit fe8e6c0

Browse files
committed
Migrating to ncls-cpp
1 parent 6249eaf commit fe8e6c0

File tree

3 files changed

+134
-43
lines changed

3 files changed

+134
-43
lines changed

src/genomicranges/GenomicRanges.py

Lines changed: 106 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2137,6 +2137,27 @@ def intersect_ncls(self, other: "GenomicRanges") -> "GenomicRanges":
21372137
######>> search operations <<######
21382138
###################################
21392139

2140+
def extract_groups_by_seqnames(self):
2141+
groups = []
2142+
for idx, _ in enumerate(self._seqinfo._seqnames):
2143+
idx = np.where(self._seqnames == idx)[0]
2144+
groups.append(idx)
2145+
return groups
2146+
2147+
def _get_query_common_groups(self, query: "GenomicRanges") -> Tuple[np.ndarray, np.ndarray]:
2148+
# smerged = merge_SeqInfo([self._seqinfo, query._seqinfo])
2149+
common_seqlevels = set(self._seqinfo._seqnames).intersection(query._seqinfo._seqnames)
2150+
q_group_idx = [self._seqinfo._seqnames.index(i) for i in common_seqlevels]
2151+
s_group_idx = [query._seqinfo._seqnames.index(i) for i in common_seqlevels]
2152+
2153+
s_grps = self.extract_groups_by_seqnames()
2154+
self_groups = [s_grps[i] for i in q_group_idx]
2155+
2156+
q_grps = query.extract_groups_by_seqnames()
2157+
query_groups = [q_grps[i] for i in s_group_idx]
2158+
2159+
return (self_groups, query_groups)
2160+
21402161
def find_overlaps(
21412162
self,
21422163
query: "GenomicRanges",
@@ -2145,6 +2166,7 @@ def find_overlaps(
21452166
max_gap: int = -1,
21462167
min_overlap: int = 0,
21472168
ignore_strand: bool = False,
2169+
num_threads: int = 1,
21482170
) -> BiocFrame:
21492171
"""Find overlaps between subject (self) and a ``query`` ``GenomicRanges`` object.
21502172
@@ -2176,6 +2198,10 @@ def find_overlaps(
21762198
ignore_strand:
21772199
Whether to ignore strands. Defaults to False.
21782200
2201+
num_threads:
2202+
Number of threads to use.
2203+
Defaults to 1.
2204+
21792205
Raises:
21802206
TypeError: If ``query`` is not of type `GenomicRanges`.
21812207
@@ -2192,53 +2218,71 @@ def find_overlaps(
21922218
if query_type not in OVERLAP_QUERY_TYPES:
21932219
raise ValueError(f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}.")
21942220

2195-
subject_chrm_grps = self._group_indices_by_chrm(ignore_strand=ignore_strand)
2196-
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)
2197-
2198-
all_qhits = np.array([], dtype=np.int32)
2199-
all_shits = np.array([], dtype=np.int32)
2221+
from iranges.lib_iranges import find_overlaps_groups
2222+
2223+
if len(self) >= len(query):
2224+
self_groups, query_groups = self._get_query_common_groups(query)
2225+
2226+
all_s_hits, all_q_hits = find_overlaps_groups(
2227+
self._ranges.get_start().astype(np.int32),
2228+
self._ranges.get_end().astype(np.int32) + 1,
2229+
[s.astype(np.int32) for s in self_groups],
2230+
query._ranges.get_start().astype(np.int32),
2231+
query._ranges.get_end().astype(np.int32) + 1,
2232+
[q.astype(np.int32) for q in query_groups],
2233+
query_type,
2234+
select,
2235+
max_gap,
2236+
min_overlap,
2237+
num_threads,
2238+
)
22002239

2201-
for group, indices in query_chrm_grps.items():
2202-
_subset = []
2203-
if group in subject_chrm_grps:
2204-
_subset.extend(subject_chrm_grps[group])
2240+
print(all_s_hits, all_q_hits)
2241+
if ignore_strand is False:
2242+
print(self._strand, query._strand)
2243+
s_strands = self._strand[all_s_hits]
2244+
q_strands = query._strand[all_q_hits]
22052245

2206-
_grp_split = group.split(_granges_delim)
2207-
if _grp_split[1] != "0":
2208-
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
2209-
if _any_grp_fwd in subject_chrm_grps:
2210-
_subset.extend(subject_chrm_grps[_any_grp_fwd])
2211-
else:
2212-
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
2213-
if _any_grp_fwd in subject_chrm_grps:
2214-
_subset.extend(subject_chrm_grps[_any_grp_fwd])
2246+
print(s_strands, q_strands)
22152247

2216-
_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
2217-
if _any_grp_rev in subject_chrm_grps:
2218-
_subset.extend(subject_chrm_grps[_any_grp_rev])
2248+
mask = s_strands == q_strands
2249+
# to allow '*' with any strand from query
2250+
mask[s_strands == 0] = True
2251+
mask[q_strands == 0] = True
2252+
all_q_hits = all_q_hits[mask]
2253+
all_s_hits = all_s_hits[mask]
22192254

2220-
if len(_subset) > 0:
2221-
_sub_subset = self[_subset]
2222-
_query_subset = query[indices]
2255+
order = np.argsort(all_q_hits, stable=True)
2256+
return BiocFrame({"query_hits": all_q_hits[order], "self_hits": all_s_hits[order]})
2257+
else:
2258+
query_groups, self_groups = query._get_query_common_groups(self)
2259+
2260+
all_q_hits, all_s_hits = find_overlaps_groups(
2261+
query._ranges.get_start().astype(np.int32),
2262+
query._ranges.get_end().astype(np.int32) + 1,
2263+
[q.astype(np.int32) for q in query_groups],
2264+
self._ranges.get_start().astype(np.int32),
2265+
self._ranges.get_end().astype(np.int32) + 1,
2266+
[s.astype(np.int32) for s in self_groups],
2267+
query_type,
2268+
select,
2269+
max_gap,
2270+
min_overlap,
2271+
num_threads,
2272+
)
22232273

2224-
res_idx = _sub_subset._ranges.find_overlaps(
2225-
query=_query_subset._ranges,
2226-
query_type=query_type,
2227-
select=select,
2228-
max_gap=max_gap,
2229-
min_overlap=min_overlap,
2230-
delete_index=False,
2231-
)
2274+
if ignore_strand is False:
2275+
q_strands = query._strand[all_q_hits]
2276+
s_strands = self._strand[all_s_hits]
22322277

2233-
all_qhits = np.concatenate(
2234-
(all_qhits, [indices[j] for j in res_idx.get_column("query_hits")]), dtype=np.int32
2235-
)
2236-
all_shits = np.concatenate(
2237-
(all_shits, [_subset[x] for x in res_idx.get_column("self_hits")]), dtype=np.int32
2238-
)
2278+
mask = s_strands == q_strands
2279+
mask[q_strands == 0] = True
2280+
mask[s_strands == 0] = True
2281+
all_q_hits = all_q_hits[mask]
2282+
all_s_hits = all_s_hits[mask]
22392283

2240-
order = np.argsort(all_qhits, stable=True)
2241-
return BiocFrame({"query_hits": all_qhits, "self_hits": all_shits})[order, :]
2284+
order = np.argsort(all_q_hits, stable=True)
2285+
return BiocFrame({"query_hits": all_q_hits[order], "self_hits": all_s_hits[order]})
22422286

22432287
def count_overlaps(
22442288
self,
@@ -2247,6 +2291,7 @@ def count_overlaps(
22472291
max_gap: int = -1,
22482292
min_overlap: int = 0,
22492293
ignore_strand: bool = False,
2294+
num_threads: int = 1,
22502295
) -> np.ndarray:
22512296
"""Count overlaps between subject (self) and a ``query`` ``GenomicRanges`` object.
22522297
@@ -2274,6 +2319,10 @@ def count_overlaps(
22742319
ignore_strand:
22752320
Whether to ignore strands. Defaults to False.
22762321
2322+
num_threads:
2323+
Number of threads to use.
2324+
Defaults to 1.
2325+
22772326
Raises:
22782327
TypeError: If ``query`` is not of type `GenomicRanges`.
22792328
@@ -2282,7 +2331,12 @@ def count_overlaps(
22822331
value represents the number of overlaps in `self` for each query.
22832332
"""
22842333
_overlaps = self.find_overlaps(
2285-
query, query_type=query_type, max_gap=max_gap, min_overlap=min_overlap, ignore_strand=ignore_strand
2334+
query,
2335+
query_type=query_type,
2336+
max_gap=max_gap,
2337+
min_overlap=min_overlap,
2338+
ignore_strand=ignore_strand,
2339+
num_threads=num_threads,
22862340
)
22872341
result = np.zeros(len(query))
22882342
_ucounts = np.unique_counts(_overlaps.get_column("query_hits"))
@@ -2297,6 +2351,7 @@ def subset_by_overlaps(
22972351
max_gap: int = -1,
22982352
min_overlap: int = 0,
22992353
ignore_strand: bool = False,
2354+
num_threads: int = 1,
23002355
) -> "GenomicRanges":
23012356
"""Subset ``subject`` (self) with overlaps in ``query`` `GenomicRanges` object.
23022357
@@ -2324,6 +2379,10 @@ def subset_by_overlaps(
23242379
ignore_strand:
23252380
Whether to ignore strands. Defaults to False.
23262381
2382+
num_threads:
2383+
Number of threads to use.
2384+
Defaults to 1.
2385+
23272386
Raises:
23282387
TypeError:
23292388
If ``query`` is not of type `GenomicRanges`.
@@ -2332,7 +2391,12 @@ def subset_by_overlaps(
23322391
A ``GenomicRanges`` object containing overlapping ranges.
23332392
"""
23342393
_overlaps = self.find_overlaps(
2335-
query, query_type=query_type, max_gap=max_gap, min_overlap=min_overlap, ignore_strand=ignore_strand
2394+
query,
2395+
query_type=query_type,
2396+
max_gap=max_gap,
2397+
min_overlap=min_overlap,
2398+
ignore_strand=ignore_strand,
2399+
num_threads=num_threads,
23362400
)
23372401
_all_indices = np.unique(_overlaps.get_column("self_hits"))
23382402
return self[_all_indices]

src/genomicranges/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,19 @@ def compute_up_down(starts, ends, strands, upstream, downstream, site: str = "TS
203203
new_ends[minus_mask] = starts[minus_mask] + upstream[minus_mask]
204204

205205
return new_starts, new_ends - new_starts + 1
206+
207+
def extract_groups_from_granges(x, ignore_strand=False):
208+
groups = []
209+
if ignore_strand:
210+
for idx, seq in enumerate(x._seqinfo._seqnames):
211+
matches = np.where(x._seqnames == idx)[0]
212+
groups.append((seq, matches))
213+
else:
214+
## TODO: needs some rethinking to speed this up
215+
combined = np.stack((x._seqnames, x._strand), axis=-1)
216+
unique_groups, inverse_indices = np.unique(combined, return_inverse=True, return_counts=False, axis=0)
217+
for idx, seq in enumerate(unique_groups):
218+
matches = np.where(inverse_indices == idx)[0]
219+
groups.append(((x._seqinfo._seqnames[seq[0]], seq[1]), matches))
220+
221+
return groups

tests/test_gr_overlaps.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def test_find_overlaps():
5656
assert res is not None
5757
assert isinstance(res, BiocFrame)
5858
assert np.all(res.get_column("query_hits") == [1, 1, 2, 2, 3, 3])
59-
assert np.all(res.get_column("self_hits") == [1, 0, 1, 0, 1, 0])
59+
assert np.all(res.get_column("self_hits") == [0, 1, 0, 1, 0, 1])
6060

6161

6262
def test_find_overlaps_query_type():
@@ -70,6 +70,17 @@ def test_find_overlaps_query_type():
7070
assert np.all(res.get_column("query_hits") == [0, 0, 0, 1, 1])
7171

7272

73+
def test_find_overlaps_threads():
74+
assert subject is not None
75+
assert query is not None
76+
77+
res = subject.find_overlaps(query, query_type="within", num_threads=3)
78+
79+
assert res is not None
80+
assert np.all(res.get_column("self_hits") == [1, 2, 3, 1, 2])
81+
assert np.all(res.get_column("query_hits") == [0, 0, 0, 1, 1])
82+
83+
7384
def test_find_overlaps_rtrip():
7485
x = GenomicRanges(["chr1", "chr1"], IRanges([2, 9], [7, 19]), strand=["+", "-"])
7586
y = GenomicRanges(["chr1"], IRanges([5], [10]), strand=["*"])

0 commit comments

Comments
 (0)