@@ -97,7 +97,7 @@ pub fn raw_distance(
9797 // at this point we've exhausted one of the two sketches, but we may have
9898 // more counts in the other to compare if these were scaled sketches
9999 if scale > 0. {
100- let max_hash = u64:: max_value ( ) / scale. recip ( ) as u64 ;
100+ let max_hash = u64:: MAX / scale. recip ( ) as u64 ;
101101 while query_hashes
102102 . get ( i)
103103 . map ( |kmer_count| kmer_count. hash < max_hash)
@@ -125,6 +125,37 @@ pub fn raw_distance(
125125 ( containment, jaccard, common, total)
126126}
127127
128+ /// This computes set statistics from one set of hashes to another.
129+ ///
130+ /// Every hash in the reference set is considered while only those hashes in the
131+ /// query set that are in the same range as the reference set are compared. This
132+ /// should be a more accurate representation of the query set's containment in
133+ /// the reference set because we consider all of the reference set. In
134+ /// practice, there may be issues especially if the query is sketched to a
135+ /// different effective scale than the reference.
136+ pub fn old_distance ( query_sketch : & [ KmerCount ] , ref_sketch : & [ KmerCount ] ) -> ( f64 , f64 , u64 , u64 ) {
137+ let mut i: usize = 0 ;
138+ let mut common: u64 = 0 ;
139+ let mut total: u64 = 0 ;
140+
141+ for ref_hash in ref_sketch {
142+ while ( query_sketch[ i] . hash < ref_hash. hash ) && ( i < query_sketch. len ( ) - 1 ) {
143+ i += 1 ;
144+ }
145+
146+ if query_sketch[ i] . hash == ref_hash. hash {
147+ common += 1 ;
148+ }
149+
150+ total += 1 ;
151+ }
152+
153+ // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
154+ let containment: f64 = common as f64 / total as f64 ;
155+ let jaccard: f64 = common as f64 / ( common + 2 * ( total - common) ) as f64 ;
156+ ( containment, jaccard, common, total)
157+ }
158+
128159#[ cfg( test) ]
129160mod tests {
130161 use super :: * ;
@@ -306,37 +337,6 @@ mod tests {
306337 }
307338}
308339
309- /// This computes set statistics from one set of hashes to another.
310- ///
311- /// Every hash in the reference set is considered while only those hashes in the
312- /// query set that are in the same range as the reference set are compared. This
313- /// should be a more accurate representation of the query set's containment in
314- /// the reference set because we consider all of the reference set. In
315- /// practice, there may be issues especially if the query is sketched to a
316- /// different effective scale than the reference.
317- pub fn old_distance ( query_sketch : & [ KmerCount ] , ref_sketch : & [ KmerCount ] ) -> ( f64 , f64 , u64 , u64 ) {
318- let mut i: usize = 0 ;
319- let mut common: u64 = 0 ;
320- let mut total: u64 = 0 ;
321-
322- for ref_hash in ref_sketch {
323- while ( query_sketch[ i] . hash < ref_hash. hash ) && ( i < query_sketch. len ( ) - 1 ) {
324- i += 1 ;
325- }
326-
327- if query_sketch[ i] . hash == ref_hash. hash {
328- common += 1 ;
329- }
330-
331- total += 1 ;
332- }
333-
334- // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
335- let containment: f64 = common as f64 / total as f64 ;
336- let jaccard: f64 = common as f64 / ( common + 2 * ( total - common) ) as f64 ;
337- ( containment, jaccard, common, total)
338- }
339-
340340// TODO: add another method like this to allow 0's in ref sketch for hashes present in sketches?
341341// TODO: maybe we want to do NNLS on these matrices in Rust? for example code, see:
342342// https://github.com/igmanthony/fnnls/blob/master/src/fnnls.rs
0 commit comments