@@ -2137,6 +2137,27 @@ def intersect_ncls(self, other: "GenomicRanges") -> "GenomicRanges":
21372137 ######>> search operations <<######
21382138 ###################################
21392139
2140+ def extract_groups_by_seqnames (self ):
2141+ groups = []
2142+ for idx , _ in enumerate (self ._seqinfo ._seqnames ):
2143+ idx = np .where (self ._seqnames == idx )[0 ]
2144+ groups .append (idx )
2145+ return groups
2146+
2147+ def _get_query_common_groups (self , query : "GenomicRanges" ) -> Tuple [np .ndarray , np .ndarray ]:
2148+ # smerged = merge_SeqInfo([self._seqinfo, query._seqinfo])
2149+ common_seqlevels = set (self ._seqinfo ._seqnames ).intersection (query ._seqinfo ._seqnames )
2150+ q_group_idx = [self ._seqinfo ._seqnames .index (i ) for i in common_seqlevels ]
2151+ s_group_idx = [query ._seqinfo ._seqnames .index (i ) for i in common_seqlevels ]
2152+
2153+ s_grps = self .extract_groups_by_seqnames ()
2154+ self_groups = [s_grps [i ] for i in q_group_idx ]
2155+
2156+ q_grps = query .extract_groups_by_seqnames ()
2157+ query_groups = [q_grps [i ] for i in s_group_idx ]
2158+
2159+ return (self_groups , query_groups )
2160+
21402161 def find_overlaps (
21412162 self ,
21422163 query : "GenomicRanges" ,
@@ -2145,6 +2166,7 @@ def find_overlaps(
21452166 max_gap : int = - 1 ,
21462167 min_overlap : int = 0 ,
21472168 ignore_strand : bool = False ,
2169+ num_threads : int = 1 ,
21482170 ) -> BiocFrame :
21492171 """Find overlaps between subject (self) and a ``query`` ``GenomicRanges`` object.
21502172
@@ -2176,6 +2198,10 @@ def find_overlaps(
21762198 ignore_strand:
21772199 Whether to ignore strands. Defaults to False.
21782200
2201+ num_threads:
2202+ Number of threads to use.
2203+ Defaults to 1.
2204+
21792205 Raises:
21802206 TypeError: If ``query`` is not of type `GenomicRanges`.
21812207
@@ -2192,53 +2218,71 @@ def find_overlaps(
21922218 if query_type not in OVERLAP_QUERY_TYPES :
21932219 raise ValueError (f"'{ query_type } ' must be one of { ', ' .join (OVERLAP_QUERY_TYPES )} ." )
21942220
2195- subject_chrm_grps = self ._group_indices_by_chrm (ignore_strand = ignore_strand )
2196- query_chrm_grps = query ._group_indices_by_chrm (ignore_strand = ignore_strand )
2197-
2198- all_qhits = np .array ([], dtype = np .int32 )
2199- all_shits = np .array ([], dtype = np .int32 )
2221+ from iranges .lib_iranges import find_overlaps_groups
2222+
2223+ if len (self ) >= len (query ):
2224+ self_groups , query_groups = self ._get_query_common_groups (query )
2225+
2226+ all_s_hits , all_q_hits = find_overlaps_groups (
2227+ self ._ranges .get_start ().astype (np .int32 ),
2228+ self ._ranges .get_end ().astype (np .int32 ) + 1 ,
2229+ [s .astype (np .int32 ) for s in self_groups ],
2230+ query ._ranges .get_start ().astype (np .int32 ),
2231+ query ._ranges .get_end ().astype (np .int32 ) + 1 ,
2232+ [q .astype (np .int32 ) for q in query_groups ],
2233+ query_type ,
2234+ select ,
2235+ max_gap ,
2236+ min_overlap ,
2237+ num_threads ,
2238+ )
22002239
2201- for group , indices in query_chrm_grps .items ():
2202- _subset = []
2203- if group in subject_chrm_grps :
2204- _subset .extend (subject_chrm_grps [group ])
2240+ print (all_s_hits , all_q_hits )
2241+ if ignore_strand is False :
2242+ print (self ._strand , query ._strand )
2243+ s_strands = self ._strand [all_s_hits ]
2244+ q_strands = query ._strand [all_q_hits ]
22052245
2206- _grp_split = group .split (_granges_delim )
2207- if _grp_split [1 ] != "0" :
2208- _any_grp_fwd = f"{ _grp_split [0 ]} { _granges_delim } 0"
2209- if _any_grp_fwd in subject_chrm_grps :
2210- _subset .extend (subject_chrm_grps [_any_grp_fwd ])
2211- else :
2212- _any_grp_fwd = f"{ _grp_split [0 ]} { _granges_delim } 1"
2213- if _any_grp_fwd in subject_chrm_grps :
2214- _subset .extend (subject_chrm_grps [_any_grp_fwd ])
2246+ print (s_strands , q_strands )
22152247
2216- _any_grp_rev = f"{ _grp_split [0 ]} { _granges_delim } -1"
2217- if _any_grp_rev in subject_chrm_grps :
2218- _subset .extend (subject_chrm_grps [_any_grp_rev ])
2248+ mask = s_strands == q_strands
2249+ # to allow '*' with any strand from query
2250+ mask [s_strands == 0 ] = True
2251+ mask [q_strands == 0 ] = True
2252+ all_q_hits = all_q_hits [mask ]
2253+ all_s_hits = all_s_hits [mask ]
22192254
2220- if len (_subset ) > 0 :
2221- _sub_subset = self [_subset ]
2222- _query_subset = query [indices ]
2255+ order = np .argsort (all_q_hits , stable = True )
2256+ return BiocFrame ({"query_hits" : all_q_hits [order ], "self_hits" : all_s_hits [order ]})
2257+ else :
2258+ query_groups , self_groups = query ._get_query_common_groups (self )
2259+
2260+ all_q_hits , all_s_hits = find_overlaps_groups (
2261+ query ._ranges .get_start ().astype (np .int32 ),
2262+ query ._ranges .get_end ().astype (np .int32 ) + 1 ,
2263+ [q .astype (np .int32 ) for q in query_groups ],
2264+ self ._ranges .get_start ().astype (np .int32 ),
2265+ self ._ranges .get_end ().astype (np .int32 ) + 1 ,
2266+ [s .astype (np .int32 ) for s in self_groups ],
2267+ query_type ,
2268+ select ,
2269+ max_gap ,
2270+ min_overlap ,
2271+ num_threads ,
2272+ )
22232273
2224- res_idx = _sub_subset ._ranges .find_overlaps (
2225- query = _query_subset ._ranges ,
2226- query_type = query_type ,
2227- select = select ,
2228- max_gap = max_gap ,
2229- min_overlap = min_overlap ,
2230- delete_index = False ,
2231- )
2274+ if ignore_strand is False :
2275+ q_strands = query ._strand [all_q_hits ]
2276+ s_strands = self ._strand [all_s_hits ]
22322277
2233- all_qhits = np .concatenate (
2234- (all_qhits , [indices [j ] for j in res_idx .get_column ("query_hits" )]), dtype = np .int32
2235- )
2236- all_shits = np .concatenate (
2237- (all_shits , [_subset [x ] for x in res_idx .get_column ("self_hits" )]), dtype = np .int32
2238- )
2278+ mask = s_strands == q_strands
2279+ mask [q_strands == 0 ] = True
2280+ mask [s_strands == 0 ] = True
2281+ all_q_hits = all_q_hits [mask ]
2282+ all_s_hits = all_s_hits [mask ]
22392283
2240- order = np .argsort (all_qhits , stable = True )
2241- return BiocFrame ({"query_hits" : all_qhits , "self_hits" : all_shits }) [order , :]
2284+ order = np .argsort (all_q_hits , stable = True )
2285+ return BiocFrame ({"query_hits" : all_q_hits [ order ] , "self_hits" : all_s_hits [order ]})
22422286
22432287 def count_overlaps (
22442288 self ,
@@ -2247,6 +2291,7 @@ def count_overlaps(
22472291 max_gap : int = - 1 ,
22482292 min_overlap : int = 0 ,
22492293 ignore_strand : bool = False ,
2294+ num_threads : int = 1 ,
22502295 ) -> np .ndarray :
22512296 """Count overlaps between subject (self) and a ``query`` ``GenomicRanges`` object.
22522297
@@ -2274,6 +2319,10 @@ def count_overlaps(
22742319 ignore_strand:
22752320 Whether to ignore strands. Defaults to False.
22762321
2322+ num_threads:
2323+ Number of threads to use.
2324+ Defaults to 1.
2325+
22772326 Raises:
22782327 TypeError: If ``query`` is not of type `GenomicRanges`.
22792328
@@ -2282,7 +2331,12 @@ def count_overlaps(
22822331 value represents the number of overlaps in `self` for each query.
22832332 """
22842333 _overlaps = self .find_overlaps (
2285- query , query_type = query_type , max_gap = max_gap , min_overlap = min_overlap , ignore_strand = ignore_strand
2334+ query ,
2335+ query_type = query_type ,
2336+ max_gap = max_gap ,
2337+ min_overlap = min_overlap ,
2338+ ignore_strand = ignore_strand ,
2339+ num_threads = num_threads ,
22862340 )
22872341 result = np .zeros (len (query ))
22882342 _ucounts = np .unique_counts (_overlaps .get_column ("query_hits" ))
@@ -2297,6 +2351,7 @@ def subset_by_overlaps(
22972351 max_gap : int = - 1 ,
22982352 min_overlap : int = 0 ,
22992353 ignore_strand : bool = False ,
2354+ num_threads : int = 1 ,
23002355 ) -> "GenomicRanges" :
23012356 """Subset ``subject`` (self) with overlaps in ``query`` `GenomicRanges` object.
23022357
@@ -2324,6 +2379,10 @@ def subset_by_overlaps(
23242379 ignore_strand:
23252380 Whether to ignore strands. Defaults to False.
23262381
2382+ num_threads:
2383+ Number of threads to use.
2384+ Defaults to 1.
2385+
23272386 Raises:
23282387 TypeError:
23292388 If ``query`` is not of type `GenomicRanges`.
@@ -2332,7 +2391,12 @@ def subset_by_overlaps(
23322391 A ``GenomicRanges`` object containing overlapping ranges.
23332392 """
23342393 _overlaps = self .find_overlaps (
2335- query , query_type = query_type , max_gap = max_gap , min_overlap = min_overlap , ignore_strand = ignore_strand
2394+ query ,
2395+ query_type = query_type ,
2396+ max_gap = max_gap ,
2397+ min_overlap = min_overlap ,
2398+ ignore_strand = ignore_strand ,
2399+ num_threads = num_threads ,
23362400 )
23372401 _all_indices = np .unique (_overlaps .get_column ("self_hits" ))
23382402 return self [_all_indices ]
0 commit comments