11/* bam2bcf.c -- variant calling.
22
33 Copyright (C) 2010-2012 Broad Institute.
4- Copyright (C) 2012-2021 Genome Research Ltd.
4+ Copyright (C) 2012-2022 Genome Research Ltd.
55
66 Author: Heng Li <[email protected] > 77
@@ -89,6 +89,39 @@ void bcf_call_destroy(bcf_callaux_t *bca)
8989 free (bca -> bases ); free (bca -> inscns ); free (bca );
9090}
9191
92+ static int get_aux_nm (bam1_t * rec , int32_t qpos , int is_ref )
93+ {
94+ uint8_t * nm_tag = bam_aux_get (rec , "NM" );
95+ if ( !nm_tag ) return -1 ;
96+ int64_t nm = bam_aux2i (nm_tag );
97+
98+ // Count indels as single events, not as the number of inserted/deleted
99+ // bases (which is what NM does). Add soft clips as mismatches.
100+ int i ;
101+ for (i = 0 ; i < rec -> core .n_cigar ; i ++ )
102+ {
103+ int val = bam_get_cigar (rec )[i ] & BAM_CIGAR_MASK ;
104+ if ( val == BAM_CSOFT_CLIP )
105+ {
106+ nm += bam_get_cigar (rec )[i ] >> BAM_CIGAR_SHIFT ;
107+ }
108+ else if ( val == BAM_CINS || val == BAM_CDEL )
109+ {
110+ val = bam_get_cigar (rec )[i ] >> BAM_CIGAR_SHIFT ;
111+ if ( val > 1 ) nm -= val - 1 ;
112+ }
113+ }
114+
115+ // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
116+ // http://www.genome.org/cgi/doi/10.1101/gr.239756.118
117+ nm -= is_ref ? 1 : 2 ;
118+
119+ if ( nm < 0 ) nm = 0 ;
120+ if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1 ;
121+
122+ return nm ;
123+ }
124+
92125// position in the sequence with respect to the aligned part of the read
93126static int get_position (const bam_pileup1_t * p , int * len ,
94127 int * sc_len , int * sc_dist ) {
@@ -158,6 +191,17 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
158191 if ( call -> ADF ) memset (call -> ADF ,0 ,sizeof (int32_t )* (call -> n + 1 )* B2B_MAX_ALLELES );
159192 if ( call -> ADR ) memset (call -> ADR ,0 ,sizeof (int32_t )* (call -> n + 1 )* B2B_MAX_ALLELES );
160193 if ( call -> SCR ) memset (call -> SCR ,0 ,sizeof (* call -> SCR )* (call -> n + 1 ));
194+ if ( call -> SCR ) memset (call -> SCR ,0 ,sizeof (* call -> SCR )* (call -> n + 1 ));
195+ if ( bca -> fmt_flag & B2B_FMT_NMBZ )
196+ {
197+ memset (call -> ref_nm ,0 ,sizeof (* call -> ref_nm )* (call -> n + 1 )* B2B_N_NM );
198+ memset (call -> alt_nm ,0 ,sizeof (* call -> alt_nm )* (call -> n + 1 )* B2B_N_NM );
199+ }
200+ else
201+ {
202+ memset (call -> ref_nm ,0 ,sizeof (* call -> ref_nm )* B2B_N_NM );
203+ memset (call -> alt_nm ,0 ,sizeof (* call -> alt_nm )* B2B_N_NM );
204+ }
161205 memset (call -> QS ,0 ,sizeof (* call -> QS )* call -> n * B2B_MAX_ALLELES );
162206 memset (bca -> ref_scl , 0 , 100 * sizeof (int ));
163207 memset (bca -> alt_scl , 0 , 100 * sizeof (int ));
@@ -309,28 +353,38 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
309353 if (sc_len > 99 ) sc_len = 99 ;
310354 }
311355 }
312-
313356 int imq = mapQ * nqual_over_60 ;
314357 int ibq = baseQ * nqual_over_60 ;
358+ int inm = get_aux_nm (p -> b ,p -> qpos ,is_diff ?0 :1 );
315359
316360 if ( bam_is_rev (p -> b ) )
317361 bca -> rev_mqs [imq ]++ ;
318362 else
319363 bca -> fwd_mqs [imq ]++ ;
320364
321- if ( bam_seqi ( bam_get_seq ( p -> b ), p -> qpos ) == ref_base )
365+ if ( ! is_diff )
322366 {
323367 bca -> ref_pos [epos ]++ ;
324368 bca -> ref_bq [ibq ]++ ;
325369 bca -> ref_mq [imq ]++ ;
326370 bca -> ref_scl [sc_len ]++ ;
371+ if ( inm >=0 )
372+ {
373+ bca -> ref_nm [inm ]++ ;
374+ if ( r -> ref_nm ) r -> ref_nm [inm ]++ ;
375+ }
327376 }
328377 else
329378 {
330379 bca -> alt_pos [epos ]++ ;
331380 bca -> alt_bq [ibq ]++ ;
332381 bca -> alt_mq [imq ]++ ;
333382 bca -> alt_scl [sc_len ]++ ;
383+ if ( inm >=0 )
384+ {
385+ bca -> alt_nm [inm ]++ ;
386+ if ( r -> alt_nm ) r -> alt_nm [inm ]++ ;
387+ }
334388 }
335389 }
336390
@@ -798,6 +852,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
798852 call -> n_alleles = j ;
799853 if (call -> n_alleles == 1 ) return -1 ; // no reliable supporting read. stop doing anything
800854 }
855+ int has_alt = (call -> n_alleles == 2 && call -> unseen != -1 ) ? 0 : 1 ;
801856 /*
802857 * Set the phread likelihood array (call->PL) This array is 15 entries long
803858 * for each sample because that is size of an upper or lower triangle of a
@@ -914,6 +969,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
914969 for (j = 0 ; j < 16 ; ++ j ) call -> anno [j ] += calls [i ].anno [j ];
915970 }
916971
972+ // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
973+ if ( !has_alt ) return 0 ;
974+
917975 calc_SegBias (calls , call );
918976
919977 // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
@@ -922,7 +980,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
922980
923981 if (bca -> fmt_flag & B2B_INFO_ZSCORE ) {
924982 // U z-normalised as +/- number of standard deviations from mean.
925- if (call -> ori_ref < 0 ) {
983+ if (call -> ori_ref < 0 ) { // indel
926984 if (bca -> fmt_flag & B2B_INFO_RPB )
927985 call -> mwu_pos = calc_mwu_biasZ (bca -> iref_pos , bca -> ialt_pos ,
928986 bca -> npos , 0 , 1 );
@@ -945,6 +1003,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
9451003 call -> mwu_sc = calc_mwu_biasZ (bca -> ref_scl , bca -> alt_scl ,
9461004 100 , 0 ,1 );
9471005 }
1006+ call -> mwu_nm [0 ] = calc_mwu_biasZ (bca -> ref_nm , bca -> alt_nm , B2B_N_NM ,0 ,1 );
1007+ if ( bca -> fmt_flag & B2B_FMT_NMBZ )
1008+ {
1009+ for (i = 0 ; i < n ; i ++ )
1010+ {
1011+ float val = calc_mwu_biasZ (calls [i ].ref_nm , calls [i ].alt_nm , B2B_N_NM ,0 ,1 );
1012+ call -> mwu_nm [i + 1 ] = val != HUGE_VAL ? val : 0 ;
1013+ }
1014+ }
9481015 } else {
9491016 // Old method; U as probability between 0 and 1
9501017 if ( bca -> fmt_flag & B2B_INFO_RPB )
@@ -976,7 +1043,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
9761043int bcf_call2bcf (bcf_call_t * bc , bcf1_t * rec , bcf_callret1_t * bcr , int fmt_flag , const bcf_callaux_t * bca , const char * ref )
9771044{
9781045 extern double kt_fisher_exact (int n11 , int n12 , int n21 , int n22 , double * _left , double * _right , double * two );
979- int i , j , nals = 1 ;
1046+ int i , j , nals = 1 , has_alt = 0 ;
9801047
9811048 bcf_hdr_t * hdr = bc -> bcf_hdr ;
9821049 rec -> rid = bc -> tid ;
@@ -1006,6 +1073,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10061073 for (j = 0 ; j < bca -> indelreg ; ++ j ) kputc (ref [bc -> pos + 1 + j ], & bc -> tmp );
10071074 }
10081075 nals ++ ;
1076+ has_alt = 1 ;
10091077 }
10101078 }
10111079 else // SNP
@@ -1016,7 +1084,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10161084 if (bc -> a [i ] < 0 ) break ;
10171085 kputc (',' , & bc -> tmp );
10181086 if ( bc -> unseen == i ) kputs ("<*>" , & bc -> tmp );
1019- else kputc ("ACGT" [bc -> a [i ]], & bc -> tmp );
1087+ else
1088+ {
1089+ kputc ("ACGT" [bc -> a [i ]], & bc -> tmp );
1090+ has_alt = 1 ;
1091+ }
10201092 nals ++ ;
10211093 }
10221094 }
@@ -1052,40 +1124,46 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10521124 bcf_update_info_float (hdr , rec , "I16" , tmpf , 16 );
10531125 bcf_update_info_float (hdr , rec , "QS" , bc -> qsum , nals );
10541126
1055- if ( bc -> vdb != HUGE_VAL ) bcf_update_info_float (hdr , rec , "VDB" , & bc -> vdb , 1 );
1056- if ( bc -> seg_bias != HUGE_VAL ) bcf_update_info_float (hdr , rec , "SGB" , & bc -> seg_bias , 1 );
1057-
1058- if (bca -> fmt_flag & B2B_INFO_ZSCORE ) {
1059- if ( bc -> mwu_pos != HUGE_VAL )
1060- bcf_update_info_float (hdr , rec , "RPBZ" , & bc -> mwu_pos , 1 );
1061- if ( bc -> mwu_mq != HUGE_VAL )
1062- bcf_update_info_float (hdr , rec , "MQBZ" , & bc -> mwu_mq , 1 );
1063- if ( bc -> mwu_mqs != HUGE_VAL )
1064- bcf_update_info_float (hdr , rec , "MQSBZ" , & bc -> mwu_mqs , 1 );
1065- if ( bc -> mwu_bq != HUGE_VAL )
1066- bcf_update_info_float (hdr , rec , "BQBZ" , & bc -> mwu_bq , 1 );
1067- if ( bc -> mwu_sc != HUGE_VAL )
1068- bcf_update_info_float (hdr , rec , "SCBZ" , & bc -> mwu_sc , 1 );
1069- } else {
1070- if ( bc -> mwu_pos != HUGE_VAL )
1071- bcf_update_info_float (hdr , rec , "RPB" , & bc -> mwu_pos , 1 );
1072- if ( bc -> mwu_mq != HUGE_VAL )
1073- bcf_update_info_float (hdr , rec , "MQB" , & bc -> mwu_mq , 1 );
1074- if ( bc -> mwu_mqs != HUGE_VAL )
1075- bcf_update_info_float (hdr , rec , "MQSB" , & bc -> mwu_mqs , 1 );
1076- if ( bc -> mwu_bq != HUGE_VAL )
1077- bcf_update_info_float (hdr , rec , "BQB" , & bc -> mwu_bq , 1 );
1078- }
1127+ if ( has_alt )
1128+ {
1129+ if ( bc -> vdb != HUGE_VAL ) bcf_update_info_float (hdr , rec , "VDB" , & bc -> vdb , 1 );
1130+ if ( bc -> seg_bias != HUGE_VAL ) bcf_update_info_float (hdr , rec , "SGB" , & bc -> seg_bias , 1 );
1131+
1132+ if (bca -> fmt_flag & B2B_INFO_ZSCORE ) {
1133+ if ( bc -> mwu_pos != HUGE_VAL )
1134+ bcf_update_info_float (hdr , rec , "RPBZ" , & bc -> mwu_pos , 1 );
1135+ if ( bc -> mwu_mq != HUGE_VAL )
1136+ bcf_update_info_float (hdr , rec , "MQBZ" , & bc -> mwu_mq , 1 );
1137+ if ( bc -> mwu_mqs != HUGE_VAL )
1138+ bcf_update_info_float (hdr , rec , "MQSBZ" , & bc -> mwu_mqs , 1 );
1139+ if ( bc -> mwu_bq != HUGE_VAL )
1140+ bcf_update_info_float (hdr , rec , "BQBZ" , & bc -> mwu_bq , 1 );
1141+ if ( bc -> mwu_nm [0 ] != HUGE_VAL )
1142+ bcf_update_info_float (hdr , rec , "NMBZ" , bc -> mwu_nm , 1 );
1143+ if ( bc -> mwu_sc != HUGE_VAL )
1144+ bcf_update_info_float (hdr , rec , "SCBZ" , & bc -> mwu_sc , 1 );
1145+ } else {
1146+ if ( bc -> mwu_pos != HUGE_VAL )
1147+ bcf_update_info_float (hdr , rec , "RPB" , & bc -> mwu_pos , 1 );
1148+ if ( bc -> mwu_mq != HUGE_VAL )
1149+ bcf_update_info_float (hdr , rec , "MQB" , & bc -> mwu_mq , 1 );
1150+ if ( bc -> mwu_mqs != HUGE_VAL )
1151+ bcf_update_info_float (hdr , rec , "MQSB" , & bc -> mwu_mqs , 1 );
1152+ if ( bc -> mwu_bq != HUGE_VAL )
1153+ bcf_update_info_float (hdr , rec , "BQB" , & bc -> mwu_bq , 1 );
1154+ }
10791155
1080- if ( bc -> strand_bias != HUGE_VAL )
1081- bcf_update_info_float (hdr , rec , "FS" , & bc -> strand_bias , 1 );
1156+ if ( bc -> strand_bias != HUGE_VAL )
1157+ bcf_update_info_float (hdr , rec , "FS" , & bc -> strand_bias , 1 );
10821158
10831159#if CDF_MWU_TESTS
1084- if ( bc -> mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "RPB2" , & bc -> mwu_pos_cdf , 1 );
1085- if ( bc -> mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "MQB2" , & bc -> mwu_mq_cdf , 1 );
1086- if ( bc -> mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "MQSB2" , & bc -> mwu_mqs_cdf , 1 );
1087- if ( bc -> mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "BQB2" , & bc -> mwu_bq_cdf , 1 );
1160+ if ( bc -> mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "RPB2" , & bc -> mwu_pos_cdf , 1 );
1161+ if ( bc -> mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "MQB2" , & bc -> mwu_mq_cdf , 1 );
1162+ if ( bc -> mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "MQSB2" , & bc -> mwu_mqs_cdf , 1 );
1163+ if ( bc -> mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float (hdr , rec , "BQB2" , & bc -> mwu_bq_cdf , 1 );
10881164#endif
1165+ }
1166+
10891167 tmpf [0 ] = bc -> ori_depth ? (float )bc -> mq0 /bc -> ori_depth : 0 ;
10901168 bcf_update_info_float (hdr , rec , "MQ0F" , tmpf , 1 );
10911169
@@ -1144,5 +1222,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
11441222 if ( fmt_flag & B2B_FMT_QS )
11451223 bcf_update_format_int32 (hdr , rec , "QS" , bc -> QS , rec -> n_sample * rec -> n_allele );
11461224
1225+ if ( has_alt )
1226+ {
1227+ if ( fmt_flag & B2B_FMT_NMBZ )
1228+ bcf_update_format_float (hdr , rec , "NMBZ" , bc -> mwu_nm + 1 , rec -> n_sample );
1229+ }
1230+
11471231 return 0 ;
11481232}
0 commit comments