Skip to content

Commit c481afd

Browse files
committed
homopolymer filter
helps a tiny bit with recall/precision - genotyping concordance essentially the same
1 parent c1771e2 commit c481afd

File tree

10 files changed

+59
-16
lines changed

10 files changed

+59
-16
lines changed

experiments/bedtest.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ create() {
77
--input test_rs/test2.vcf.gz \
88
--bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \
99
--reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \
10-
--sizemin 20 \
10+
--sizemin 50 \
1111
--sizesim 0.95 --seqsim 0.90 --threads 4 \
12-
--maxpaths 10000 --mapq 20 --hapsim 0.98 \
13-
--chunksize 1000 --prune --try-exact \
12+
--maxpaths 10000 --mapq 5 --hapsim 0.98 \
13+
--chunksize 100 --maxhom 0 \
1414
-o test_rs/hc.vcf --bed $bed
1515
# --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \
1616
# --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \

src/kplib/bamparser.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,12 @@ impl BamParser {
124124
};
125125

126126
let n_hap = Haplotype::new(
127-
seq_to_kmer(&sequence, self.params.kmer, p.indel == Svtype::Del),
127+
seq_to_kmer(
128+
&sequence,
129+
self.params.kmer,
130+
p.indel == Svtype::Del,
131+
self.params.maxhom,
132+
),
128133
p.size,
129134
1,
130135
1,

src/kplib/cli.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ pub struct KDParams {
109109
/// Don't require alignments to span vargraph region
110110
#[arg(long, default_value_t = true)]
111111
pub spanoff: bool,
112+
113+
/// Maximum homopolymer length to kmerize (off=0)
114+
#[arg(long, default_value_t = 5)]
115+
pub maxhom: usize,
112116
}
113117

114118
impl ArgParser {

src/kplib/cluster.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ pub fn cluster_haplotypes(
4343
hap_b.push(hap);
4444
}
4545
}
46+
// Averaging Haplotypes could be done here.
47+
// However, 'low-quality' long-reads are now ~97% (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10070092/)
48+
// Which is reasonably within the thresholds of size/seqsim's search space
4649
hap_a.sort();
4750
hap_b.sort();
4851

src/kplib/haplotype.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ impl Haplotype {
2525

2626
// Create an empty haplotype
2727
pub fn blank(kmer: u8, coverage: u64) -> Haplotype {
28-
let mk = seq_to_kmer(&[], kmer, false);
28+
let mk = seq_to_kmer(&[], kmer, false, 0);
2929
Haplotype {
3030
size: 0,
3131
n: 0,

src/kplib/kmeans.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ pub fn kmeans(data: &[Point], k: usize) -> Vec<Cluster> {
6767
.map(|centroid| Cluster::new(centroid.clone()))
6868
.collect();
6969

70+
let mut ntries = 10;
7071
loop {
7172
// Assign each point to the nearest cluster
7273
for (idx, point) in data.iter().enumerate() {
@@ -93,7 +94,7 @@ pub fn kmeans(data: &[Point], k: usize) -> Vec<Cluster> {
9394

9495
// Check for convergence
9596
let new_centroids: Vec<Point> = clusters.iter().map(|c| c.centroid.clone()).collect();
96-
if new_centroids == old_centroids {
97+
if new_centroids == old_centroids || ntries == 0 {
9798
break;
9899
} else {
99100
centroids = new_centroids;
@@ -102,6 +103,7 @@ pub fn kmeans(data: &[Point], k: usize) -> Vec<Cluster> {
102103
.map(|centroid| Cluster::new(centroid.clone()))
103104
.collect();
104105
}
106+
ntries -= 1;
105107
}
106108

107109
clusters

src/kplib/kmer.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@ fn encode_nuc(nuc: u8) -> u64 {
1010
}
1111

1212
/// Count kmers in a sequence
13-
pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool) -> Vec<f32> {
13+
pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> Vec<f32> {
14+
let sequence = if maxhom != 0 {
15+
compress_homopolymer(sequence, maxhom)
16+
} else {
17+
sequence.to_vec()
18+
};
1419
let ukmer = kmer as usize;
1520
let mut kcounts = vec![0f32; 1 << (2 * ukmer)];
1621
let cnt = if negative { -1.0 } else { 1.0 };
@@ -47,3 +52,27 @@ pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool) -> Vec<f32> {
4752

4853
kcounts
4954
}
55+
56+
pub fn compress_homopolymer(vector: &[u8], maxspan: usize) -> Vec<u8> {
57+
let mut result = Vec::new();
58+
let mut count = 0;
59+
let mut prev_byte = None;
60+
61+
for byte in vector {
62+
match prev_byte {
63+
Some(prev) if prev == byte => {
64+
count += 1;
65+
if count < maxspan {
66+
result.push(*byte);
67+
}
68+
}
69+
_ => {
70+
count = 1;
71+
result.push(*byte);
72+
}
73+
}
74+
prev_byte = Some(byte);
75+
}
76+
77+
result
78+
}

src/kplib/vargraph.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ pub struct VarNode {
2121
}
2222

2323
impl VarNode {
24-
pub fn new(entry: vcf::Record, kmer: u8) -> Self {
24+
pub fn new(entry: vcf::Record, kmer: u8, maxhom: usize) -> Self {
2525
// Want to make a hash for these names for debugging, I think.
2626
let name = "".to_string();
2727
let (start, end) = entry.boundaries();
28-
let (kfeat, size) = entry.to_kfeat(kmer);
28+
let (kfeat, size) = entry.to_kfeat(kmer, maxhom);
2929
Self {
3030
name,
3131
start,
@@ -70,7 +70,7 @@ pub struct Variants {
7070
/// The graph has an upstream 'src' node that point to every variant node
7171
/// The graph has a dnstream 'snk' node that is pointed to by every variant node and 'src'
7272
impl Variants {
73-
pub fn new(mut variants: Vec<vcf::Record>, kmer: u8) -> Self {
73+
pub fn new(mut variants: Vec<vcf::Record>, kmer: u8, maxhom: usize) -> Self {
7474
if variants.is_empty() {
7575
panic!("Cannot create a graph from no variants");
7676
}
@@ -84,7 +84,7 @@ impl Variants {
8484
node_indices.append(
8585
&mut variants
8686
.drain(..) // drain lets us move the entry without a clone
87-
.map(|entry| graph.add_node(VarNode::new(entry, kmer)))
87+
.map(|entry| graph.add_node(VarNode::new(entry, kmer, maxhom)))
8888
.collect(),
8989
);
9090

src/kplib/vcf_traits.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ impl FromStr for Svtype {
3232

3333
/// Convert vcf::Record to kfeat
3434
pub trait KdpVcf {
35-
fn to_kfeat(&self, kmer: u8) -> (Vec<f32>, i64);
35+
fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec<f32>, i64);
3636
fn boundaries(&self) -> (u64, u64);
3737
fn size(&self) -> u64;
3838
fn is_filtered(&self) -> bool;
@@ -41,7 +41,7 @@ pub trait KdpVcf {
4141

4242
impl KdpVcf for vcf::Record {
4343
/// Convert variant sequence to Kfeat
44-
fn to_kfeat(&self, kmer: u8) -> (Vec<f32>, i64) {
44+
fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec<f32>, i64) {
4545
let ref_seq = self.reference_bases().to_string();
4646
let alt_seq = self
4747
.alternate_bases()
@@ -51,8 +51,8 @@ impl KdpVcf for vcf::Record {
5151

5252
let size = alt_seq.len() as i64 - ref_seq.len() as i64;
5353

54-
let m_ref = seq_to_kmer(&ref_seq.as_bytes()[1..], kmer, false);
55-
let m_alt = seq_to_kmer(&alt_seq.as_bytes()[1..], kmer, false);
54+
let m_ref = seq_to_kmer(&ref_seq.as_bytes()[1..], kmer, false, maxhom);
55+
let m_alt = seq_to_kmer(&alt_seq.as_bytes()[1..], kmer, false, maxhom);
5656

5757
let m_ret: Vec<_> = m_alt
5858
.iter()

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ fn main() {
6767
thread::spawn(move || {
6868
let mut m_bam = BamParser::new(m_args.io.bam, m_args.io.reference, m_args.kd.clone());
6969
for chunk in receiver.into_iter().flatten() {
70-
let mut m_graph = Variants::new(chunk, m_args.kd.kmer);
70+
let mut m_graph = Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom);
7171
let (haps, coverage) = m_bam.find_haps(&m_graph.chrom, m_graph.start, m_graph.end);
7272
let (h1, h2) = cluster_haplotypes(haps, coverage, &m_args.kd);
7373
let p1 = m_graph.apply_coverage(&h1, &m_args.kd);

0 commit comments

Comments
 (0)