Skip to content

Commit abbf719

Browse files
authored
Sync with iranges's nearest and parallelization methods (#158)
1 parent 81c3641 commit abbf719

File tree

6 files changed

+49
-69
lines changed

6 files changed

+49
-69
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Changelog
22

3-
## Version 0.7.0 - 0.7.2
3+
## Version 0.7.0 - 0.7.3
44

55
- Changes to switch to LTLA/nclist-cpp in the iranges package for overlap and search operations.
6-
- Improve performance of search operations, bump version of iranges to 0.5.2.
6+
- Improve performance of search operations, bump version of iranges to 0.5.4.
77
- Optimize group by operations that `GenomicRanges` uses internally for the inter-range operations.
88
- Reset cached indexes when ranges were modified.
99

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,10 @@ Performance comparison between Python and R GenomicRanges implementations. The q
272272

273273
| Operation | Python/GenomicRanges | Python/GenomicRanges (5 threads) | R/GenomicRanges |
274274
|-----------|---------------------|-----------------------------------|-----------------|
275-
| Overlap | 3.02s | 2.13s | 4.40s |
276-
| Overlap (single chromosome) | 6.98s | 5.36s | 10.06s |
277-
| Nearest | 50.1s | 32.3s | 42.16s |
278-
| Nearest (single chromosome) | 15.5s | 11.4s | 11.01s |
275+
| Overlap | 2.80s | 2.06s | 4.40s |
276+
| Overlap (single chromosome) | 6.73s | 5.19s | 10.06s |
277+
| Nearest | 2.27s | 1.5s | 42.16s |
278+
| Nearest (single chromosome) | 4.7s | 4.67s | 11.01s |
279279

280280
> [!NOTE]
281281
> The single chromosome benchmark ignores chromosome/sequence information and performs overlap operations solely on intervals.

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ python_requires = >=3.9
5050
install_requires =
5151
importlib-metadata; python_version<"3.8"
5252
biocframe>=0.6.2
53-
iranges>=0.5.3
53+
iranges>=0.5.4
5454
biocutils>=0.2.1
5555
numpy
5656

src/genomicranges/GenomicRanges.py

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
group_by_indices,
1616
sanitize_strand_vector,
1717
wrapper_follow_precede,
18-
wrapper_nearest,
1918
)
2019

2120
__author__ = "jkanche"
@@ -2279,10 +2278,10 @@ def find_overlaps(
22792278

22802279
all_s_hits, all_q_hits = find_overlaps_groups(
22812280
self._ranges.get_start().astype(np.int32),
2282-
self._ranges.get_end().astype(np.int32) + 1,
2281+
self._ranges.get_end_exclusive().astype(np.int32),
22832282
[s.astype(np.int32) for s in self_groups],
22842283
query._ranges.get_start().astype(np.int32),
2285-
query._ranges.get_end().astype(np.int32) + 1,
2284+
query._ranges.get_end_exclusive().astype(np.int32),
22862285
[q.astype(np.int32) for q in query_groups],
22872286
query_type,
22882287
select,
@@ -2309,10 +2308,10 @@ def find_overlaps(
23092308

23102309
all_q_hits, all_s_hits = find_overlaps_groups(
23112310
query._ranges.get_start().astype(np.int32),
2312-
query._ranges.get_end().astype(np.int32) + 1,
2311+
query._ranges.get_end_exclusive().astype(np.int32),
23132312
[q.astype(np.int32) for q in query_groups],
23142313
self._ranges.get_start().astype(np.int32),
2315-
self._ranges.get_end().astype(np.int32) + 1,
2314+
self._ranges.get_end_exclusive().astype(np.int32),
23162315
[s.astype(np.int32) for s in self_groups],
23172316
query_type,
23182317
select,
@@ -2461,6 +2460,7 @@ def nearest(
24612460
select: Literal["all", "arbitrary"] = "arbitrary",
24622461
ignore_strand: bool = False,
24632462
num_threads: int = 1,
2463+
adjacent_equals_overlap: bool = True,
24642464
) -> Union[np.ndarray, BiocFrame]:
24652465
"""Search nearest positions both upstream and downstream that overlap with each range in ``query``.
24662466
@@ -2479,6 +2479,16 @@ def nearest(
24792479
Number of threads to use.
24802480
Defaults to 1.
24812481
2482+
adjacent_equals_overlap:
2483+
Whether to consider immediately-adjacent subject intervals to be
2484+
equally "nearest" to the query as an overlapping subject interval.
2485+
2486+
If true, both overlapping and immediately-adjacent subject intervals
2487+
(i.e., a gap of zero) will be reported in matches.
2488+
2489+
Otherwise, immediately-adjacent subjects will only be reported if
2490+
overlapping subjects are not present.
2491+
24822492
Returns:
24832493
If select="arbitrary":
24842494
A numpy array of integers with length matching query, containing indices
@@ -2493,36 +2503,37 @@ def nearest(
24932503
if not isinstance(query, GenomicRanges):
24942504
raise TypeError("'query' is not a `GenomicRanges` object.")
24952505

2496-
effective_threads = min(num_threads, cpu_count()) if num_threads > 0 else cpu_count()
2506+
from iranges.lib_iranges import nearest_groups
24972507

24982508
self_groups, query_groups = self._get_query_common_groups(query)
24992509

2500-
tasks = [
2501-
(
2502-
s_group,
2503-
q_group,
2504-
self._ranges,
2505-
query._ranges,
2506-
self._strand,
2507-
query._strand,
2508-
ignore_strand,
2509-
)
2510-
for s_group, q_group in zip(self_groups, query_groups)
2511-
]
2510+
all_q_hits, all_s_hits = nearest_groups(
2511+
self._ranges.get_start().astype(np.int32),
2512+
self._ranges.get_end_exclusive().astype(np.int32),
2513+
[s.astype(np.int32) for s in self_groups],
2514+
query._ranges.get_start().astype(np.int32),
2515+
query._ranges.get_end_exclusive().astype(np.int32),
2516+
[q.astype(np.int32) for q in query_groups],
2517+
select,
2518+
num_threads,
2519+
adjacent_equals_overlap,
2520+
)
25122521

2513-
if effective_threads == 1 or len(self_groups) <= 1:
2514-
results = [wrapper_nearest(task) for task in tasks]
2515-
else:
2516-
with Pool(processes=effective_threads) as pool:
2517-
results = pool.map(wrapper_nearest, tasks)
2522+
if ignore_strand is False:
2523+
s_strands = self._strand[all_s_hits]
2524+
q_strands = query._strand[all_q_hits]
25182525

2519-
if results:
2520-
all_qhits_list, all_shits_list = zip(*results)
2521-
else:
2522-
all_qhits_list, all_shits_list = [], []
2526+
mask = s_strands == q_strands
2527+
# an unstranded range ('*', encoded as 0) on either side is compatible with any strand
2528+
mask[s_strands == 0] = True
2529+
mask[q_strands == 0] = True
2530+
all_q_hits = all_q_hits[mask]
2531+
all_s_hits = all_s_hits[mask]
25232532

2524-
final_qhits = np.concatenate(all_qhits_list) if all_qhits_list else np.array([], dtype=np.int32)
2525-
final_shits = np.concatenate(all_shits_list) if all_shits_list else np.array([], dtype=np.int32)
2533+
order = np.argsort(all_q_hits, stable=True)
2534+
2535+
final_qhits = all_q_hits[order]
2536+
final_shits = all_s_hits[order]
25262537

25272538
if select == "arbitrary":
25282539
ret_result = np.full(len(query), None)

src/genomicranges/utils.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -228,37 +228,6 @@ def extract_groups_from_granges(x, ignore_strand=False):
228228
return groups
229229

230230

231-
def wrapper_nearest(args):
232-
"""Processes a single pair of self and query groups to find the nearest ranges.
233-
This function is designed to be called by a multiprocessing pool.
234-
"""
235-
(
236-
s_group,
237-
q_group,
238-
self_ranges,
239-
query_ranges,
240-
self_strand,
241-
query_strand,
242-
ignore_strand,
243-
) = args
244-
245-
res_idx = self_ranges[s_group].nearest(query=query_ranges[q_group], select="all")
246-
247-
_q_hits = np.asarray([q_group[j] for j in res_idx.get_column("query_hits")])
248-
_s_hits = np.asarray([s_group[x] for x in res_idx.get_column("self_hits")])
249-
250-
if not ignore_strand:
251-
s_strands = self_strand[s_group][res_idx.get_column("self_hits")]
252-
q_strands = query_strand[q_group][res_idx.get_column("query_hits")]
253-
254-
mask = (s_strands == q_strands) | (s_strands == 0) | (q_strands == 0)
255-
256-
_q_hits = _q_hits[mask]
257-
_s_hits = _s_hits[mask]
258-
259-
return _q_hits, _s_hits
260-
261-
262231
def wrapper_follow_precede(args):
263232
"""Processes a single group for precede/follow operations.
264233
This function is designed to be called by a multiprocessing pool.

tests/test_gr_search.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_nearest():
3535
query_hits = gr.nearest(q_gr)
3636

3737
assert query_hits is not None
38-
assert np.all(query_hits == [3, 3])
38+
assert np.all(query_hits == [1, 1]) # R returns [3, 3]; select is arbitrary, so it's OK
3939

4040
query_hits = q_gr.nearest(gr)
4141
assert np.all(
@@ -47,7 +47,7 @@ def test_nearest():
4747
assert np.all(query_hits.get_column("self_hits") == [1, 2, 3, 1, 2, 3])
4848

4949
query_hits = gr.nearest(q_gr, ignore_strand=True)
50-
assert np.all(query_hits == [3, 3])
50+
assert np.all(query_hits == [1, 1]) # R returns [3, 3]; select is arbitrary, so it's OK
5151

5252
query_hits = gr.nearest(q_gr, select="all", ignore_strand=True)
5353
assert np.all(query_hits.get_column("query_hits") == [0, 0, 0, 1, 1, 1])

0 commit comments

Comments
 (0)