Skip to content

Commit 44e8105

Browse files
Add open syncmers; change SYNCMER generic to u8
1 parent dfea53c commit 44e8105

File tree

4 files changed

+258
-86
lines changed

4 files changed

+258
-86
lines changed

src/intrinsics/dedup.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ pub unsafe fn append_unique_vals_2(
181181
let recon = _mm256_blend_epi32(old, new, 0b01111111);
182182
let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle
183183
let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask);
184-
let mut mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new));
184+
let mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new));
185185

186186
append_filtered_vals_2(vals, vals2, mask, v, v2, write_idx);
187187
}

src/lib.rs

Lines changed: 84 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,11 @@ thread_local! {
213213
static CACHE: std::cell::RefCell<(Cache, Vec<S>, Vec<S>)> = std::cell::RefCell::new(Default::default());
214214
}
215215

216-
pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: bool> {
216+
/// `CANONICAL`: true for canonical minimizers.
217+
/// `H`: the kmer hasher to use.
218+
/// `SkPos`: type of super-k-mer position storage. Use `()` to disable super-k-mers.
219+
/// `SYNCMER`: 0 for minimizers, 1 for closed syncmers, 2 for open syncmers. Any other value panics when the builder is run.
220+
pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: u8> {
217221
k: usize,
218222
w: usize,
219223
hasher: Option<&'h H>,
@@ -228,7 +232,7 @@ pub struct Output<'o, const CANONICAL: bool, S> {
228232
}
229233

230234
#[must_use]
231-
pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), false> {
235+
pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), 0> {
232236
Builder {
233237
k,
234238
w,
@@ -241,7 +245,7 @@ pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher<
241245
pub const fn canonical_minimizers(
242246
k: usize,
243247
w: usize,
244-
) -> Builder<'static, true, NtHasher<true>, (), false> {
248+
) -> Builder<'static, true, NtHasher<true>, (), 0> {
245249
Builder {
246250
k,
247251
w,
@@ -250,14 +254,17 @@ pub const fn canonical_minimizers(
250254
}
251255
}
252256

253-
/// Return positions/values of syncmers of length `k+w-1`.
257+
/// Return positions/values of *closed* syncmers of length `k+w-1`.
254258
///
255259
/// These are windows with the minimizer at the start or end of the window.
256260
///
257261
/// `k` here corresponds to `s` in the original syncmer notation: the minimizer length.
258262
/// `k+w-1` corresponds to `k` in the original syncmer notation: the length of the extracted string.
259263
#[must_use]
260-
pub const fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), true> {
264+
pub const fn closed_syncmers(
265+
k: usize,
266+
w: usize,
267+
) -> Builder<'static, false, NtHasher<false>, (), 1> {
261268
Builder {
262269
k,
263270
w,
@@ -267,10 +274,10 @@ pub const fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher<fa
267274
}
268275

269276
#[must_use]
270-
pub const fn canonical_syncmers(
277+
pub const fn canonical_closed_syncmers(
271278
k: usize,
272279
w: usize,
273-
) -> Builder<'static, true, NtHasher<true>, (), true> {
280+
) -> Builder<'static, true, NtHasher<true>, (), 1> {
274281
Builder {
275282
k,
276283
w,
@@ -279,7 +286,36 @@ pub const fn canonical_syncmers(
279286
}
280287
}
281288

282-
impl<const CANONICAL: bool, const SYNCMERS: bool>
289+
/// Return positions/values of *open* syncmers of length `k+w-1`.
290+
///
291+
/// These are windows with the minimizer exactly in the middle of the window, i.e. at offset `w/2` from the window start. This requires `w` to be odd, so that the middle position is unique.
292+
///
293+
/// `k` here corresponds to `s` in the original syncmer notation: the minimizer length.
294+
/// `k+w-1` corresponds to `k` in the original syncmer notation: the length of the extracted string.
295+
#[must_use]
296+
pub const fn open_syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), 2> {
297+
Builder {
298+
k,
299+
w,
300+
hasher: None,
301+
sk_pos: (),
302+
}
303+
}
304+
305+
#[must_use]
306+
pub const fn canonical_open_syncmers(
307+
k: usize,
308+
w: usize,
309+
) -> Builder<'static, true, NtHasher<true>, (), 2> {
310+
Builder {
311+
k,
312+
w,
313+
hasher: None,
314+
sk_pos: (),
315+
}
316+
}
317+
318+
impl<const CANONICAL: bool, const SYNCMERS: u8>
283319
Builder<'static, CANONICAL, NtHasher<CANONICAL>, (), SYNCMERS>
284320
{
285321
#[must_use]
@@ -295,14 +331,12 @@ impl<const CANONICAL: bool, const SYNCMERS: bool>
295331
}
296332
}
297333
}
298-
impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool>
299-
Builder<'h, CANONICAL, H, (), SYNCMERS>
300-
{
334+
impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, (), 0> {
301335
#[must_use]
302336
pub const fn super_kmers<'o2>(
303337
&self,
304338
sk_pos: &'o2 mut Vec<u32>,
305-
) -> Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, SYNCMERS> {
339+
) -> Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, 0> {
306340
Builder {
307341
k: self.k,
308342
w: self.w,
@@ -313,7 +347,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool>
313347
}
314348

315349
/// Without-superkmer version
316-
impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool>
350+
impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: u8>
317351
Builder<'h, CANONICAL, H, (), SYNCMERS>
318352
{
319353
pub fn run_scalar_once<'s, SEQ: Seq<'s>>(&self, seq: SEQ) -> Vec<u32> {
@@ -355,35 +389,50 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool>
355389
.unwrap_or_else(|| default_hasher.as_ref().unwrap());
356390

357391
CACHE.with_borrow_mut(|cache| match (SIMD, CANONICAL, SYNCMERS) {
358-
(false, false, false) => collect_and_dedup_into_scalar(
392+
(false, false, 0) => collect_and_dedup_into_scalar(
359393
minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
360394
min_pos,
361395
),
362-
(false, false, true) => collect_syncmers_scalar(
396+
(false, false, 1) => collect_syncmers_scalar::<false>(
363397
self.w,
364398
minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
365399
min_pos,
366400
),
367-
(false, true, false) => collect_and_dedup_into_scalar(
401+
(false, false, 2) => collect_syncmers_scalar::<true>(
402+
self.w,
403+
minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
404+
min_pos,
405+
),
406+
(false, true, 0) => collect_and_dedup_into_scalar(
368407
canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
369408
min_pos,
370409
),
371-
(false, true, true) => collect_syncmers_scalar(
410+
(false, true, 1) => collect_syncmers_scalar::<false>(
372411
self.w,
373412
canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
374413
min_pos,
375414
),
376-
(true, false, false) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
415+
(false, true, 2) => collect_syncmers_scalar::<true>(
416+
self.w,
417+
canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
418+
min_pos,
419+
),
420+
(true, false, 0) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
377421
.collect_and_dedup_into::<false>(min_pos),
378-
(true, false, true) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
379-
.collect_syncmers_into(self.w, min_pos),
380-
(true, true, false) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
422+
(true, false, 1) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
423+
.collect_syncmers_into::<false>(self.w, min_pos),
424+
(true, false, 2) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
425+
.collect_syncmers_into::<true>(self.w, min_pos),
426+
(true, true, 0) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
381427
.collect_and_dedup_into::<false>(min_pos),
382-
(true, true, true) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
383-
.collect_syncmers_into(self.w, min_pos),
428+
(true, true, 1) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
429+
.collect_syncmers_into::<false>(self.w, min_pos),
430+
(true, true, 2) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
431+
.collect_syncmers_into::<true>(self.w, min_pos),
432+
_ => unreachable!("SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)."),
384433
});
385434
Output {
386-
len: if SYNCMERS {
435+
len: if SYNCMERS != 0 {
387436
self.k + self.w - 1
388437
} else {
389438
self.k
@@ -394,7 +443,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool>
394443
}
395444
}
396445

397-
impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> {
446+
impl<'h, H: KmerHasher, const SYNCMERS: u8> Builder<'h, true, H, (), SYNCMERS> {
398447
pub fn run_skip_ambiguous_windows_once<'s>(&self, nseq: PackedNSeq<'s>) -> Vec<u32> {
399448
let mut min_pos = vec![];
400449
self.run_skip_ambiguous_windows(nseq, &mut min_pos);
@@ -419,13 +468,18 @@ impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS>
419468
.hasher
420469
.unwrap_or_else(|| default_hasher.as_ref().unwrap());
421470
match SYNCMERS {
422-
false => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
471+
0 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
423472
.collect_and_dedup_into::<true>(min_pos),
424-
true => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
425-
.collect_syncmers_into(self.w, min_pos),
473+
1 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
474+
.collect_syncmers_into::<false>(self.w, min_pos),
475+
2 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
476+
.collect_syncmers_into::<true>(self.w, min_pos),
477+
_ => panic!(
478+
"SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)."
479+
),
426480
}
427481
Output {
428-
len: if SYNCMERS {
482+
len: if SYNCMERS != 0 {
429483
self.k + self.w - 1
430484
} else {
431485
self.k
@@ -440,7 +494,7 @@ impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS>
440494
///
441495
/// (does not work in combination with syncmers)
442496
impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher>
443-
Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, false>
497+
Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, 0>
444498
{
445499
pub fn run_scalar_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec<u32> {
446500
let mut min_pos = vec![];

src/syncmers.rs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,29 @@ use packed_seq::{ChunkIt, L, PaddedIt, intrinsics::transpose};
1212
use wide::u32x8;
1313

1414
/// Collect positions of all syncmers.
15-
pub fn collect_syncmers_scalar(w: usize, it: impl Iterator<Item = u32>, out_vec: &mut Vec<u32>) {
15+
/// `OPEN`:
16+
/// - `false`: closed syncmers
17+
/// - `true`: open syncmers
18+
pub fn collect_syncmers_scalar<const OPEN: bool>(
19+
w: usize,
20+
it: impl Iterator<Item = u32>,
21+
out_vec: &mut Vec<u32>,
22+
) {
23+
if OPEN {
24+
assert!(
25+
w % 2 == 1,
26+
"Open syncmers require odd window size, so that there is a unique middle element."
27+
);
28+
}
1629
unsafe { out_vec.set_len(out_vec.capacity()) };
1730
let mut idx = 0;
1831
it.enumerate().for_each(|(i, min_pos)| {
19-
if min_pos as usize == i || min_pos as usize == i + w - 1 {
32+
let is_syncmer = if OPEN {
33+
min_pos as usize == i + w / 2
34+
} else {
35+
min_pos as usize == i || min_pos as usize == i + w - 1
36+
};
37+
if is_syncmer {
2038
if idx == out_vec.len() {
2139
out_vec.reserve(1);
2240
unsafe { out_vec.set_len(out_vec.capacity()) };
@@ -32,16 +50,16 @@ pub trait CollectSyncmers: Sized {
3250
/// Collect all indices where syncmers start.
3351
///
3452
/// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so.
35-
fn collect_syncmers(self, w: usize) -> Vec<u32> {
53+
fn collect_syncmers<const OPEN: bool>(self, w: usize) -> Vec<u32> {
3654
let mut v = vec![];
37-
self.collect_syncmers_into(w, &mut v);
55+
self.collect_syncmers_into::<OPEN>(w, &mut v);
3856
v
3957
}
4058

4159
/// Collect all indices where syncmers start into `out_vec`.
4260
///
4361
/// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so.
44-
fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec<u32>);
62+
fn collect_syncmers_into<const OPEN: bool>(self, w: usize, out_vec: &mut Vec<u32>);
4563
}
4664

4765
thread_local! {
@@ -51,7 +69,7 @@ thread_local! {
5169
impl<I: ChunkIt<u32x8>> CollectSyncmers for PaddedIt<I> {
5270
// mostly copied from `Collect::collect_minimizers_into`
5371
#[inline(always)]
54-
fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec<u32>) {
72+
fn collect_syncmers_into<const OPEN: bool>(self, w: usize, out_vec: &mut Vec<u32>) {
5573
let Self { it, padding } = self;
5674
CACHE.with(
5775
#[inline(always)]
@@ -91,8 +109,11 @@ impl<I: ChunkIt<u32x8>> CollectSyncmers for PaddedIt<I> {
91109
let x = x | mask;
92110

93111
// Every non-syncmer minimizer pos is masked out.
94-
let is_syncmer = x.cmp_eq(lane_offsets)
95-
| x.cmp_eq(lane_offsets + S::splat(w as u32 - 1));
112+
let is_syncmer = if OPEN {
113+
x.cmp_eq(lane_offsets + S::splat((w / 2) as u32))
114+
} else {
115+
x.cmp_eq(lane_offsets) | x.cmp_eq(lane_offsets + S::splat(w as u32 - 1))
116+
};
96117
// current window position if syncmer, else u32::MAX
97118
let y = is_syncmer.blend(lane_offsets, u32x8::MAX);
98119

0 commit comments

Comments
 (0)