@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
3636#include <signal.h>
3737#include <inttypes.h>
3838#include <unistd.h>
39+ #include <regex.h>
3940
4041#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4142#include "fuzz_settings.h"
@@ -3701,6 +3702,7 @@ int sam_set_threads(htsFile *fp, int nthreads) {
37013702 return 0 ;
37023703}
37033704
3705+ #define UMI_TAGS 5
37043706typedef struct {
37053707 kstring_t name ;
37063708 kstring_t comment ; // NB: pointer into name, do not free
@@ -3710,9 +3712,11 @@ typedef struct {
37103712 int aux ;
37113713 int rnum ;
37123714 char BC [3 ]; // aux tag ID for barcode
3715+ char UMI [UMI_TAGS ][3 ]; // aux tag list for UMIs.
37133716 khash_t (tag ) * tags ; // which aux tags to use (if empty, use all).
37143717 char nprefix ;
37153718 int sra_names ;
3719+ regex_t regex ;
37163720} fastq_state ;
37173721
37183722// Initialise fastq state.
@@ -3723,6 +3727,12 @@ static fastq_state *fastq_state_init(int name_char) {
37233727 return NULL ;
37243728 strcpy (x -> BC , "BC" );
37253729 x -> nprefix = name_char ;
3730+ // Default Illumina naming convention
3731+ char * re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]*)" ;
3732+ if (regcomp (& x -> regex , re , REG_EXTENDED ) != 0 ) {
3733+ free (x );
3734+ return NULL ;
3735+ }
37263736
37273737 return x ;
37283738}
@@ -3735,6 +3745,7 @@ void fastq_state_destroy(htsFile *fp) {
37353745 ks_free (& x -> name );
37363746 ks_free (& x -> seq );
37373747 ks_free (& x -> qual );
3748+ regfree (& x -> regex );
37383749 free (fp -> state );
37393750 }
37403751}
@@ -3795,6 +3806,52 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
37953806 break ;
37963807 }
37973808
3809+ case FASTQ_OPT_UMI : {
3810+ // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3811+ va_start (args , opt );
3812+ char * bc = va_arg (args , char * ), * bc_orig = bc ;
3813+ va_end (args );
3814+ if (!bc || strcmp (bc , "1" ) == 0 )
3815+ bc = "RX" ;
3816+ int ntags = 0 , err = 0 ;
3817+ for (ntags = 0 ; * bc && ntags < UMI_TAGS ; ntags ++ ) {
3818+ if (!isalpha (bc [0 ]) || !isalnum_c (bc [1 ])) {
3819+ err = 1 ;
3820+ break ;
3821+ }
3822+
3823+ strncpy (x -> UMI [ntags ], bc , 3 );
3824+ bc += 2 ;
3825+ if (* bc && * bc != ',' ) {
3826+ err = 1 ;
3827+ break ;
3828+ }
3829+ bc += (* bc == ',' );
3830+ x -> UMI [ntags ][2 ] = 0 ;
3831+ }
3832+ for (; ntags < UMI_TAGS ; ntags ++ )
3833+ x -> UMI [ntags ][0 ] = x -> UMI [ntags ][1 ] = x -> UMI [ntags ][2 ] = 0 ;
3834+
3835+
3836+ if (err )
3837+ hts_log_warning ("Bad UMI tag list '%s'; skipping option" , bc_orig );
3838+
3839+ break ;
3840+ }
3841+
3842+ case FASTQ_OPT_UMI_REGEX : {
3843+ va_start (args , opt );
3844+ char * re = va_arg (args , char * );
3845+ va_end (args );
3846+
3847+ regfree (& x -> regex );
3848+ if (regcomp (& x -> regex , re , REG_EXTENDED ) != 0 ) {
3849+ hts_log_error ("Regular expression '%s' is not supported" , re );
3850+ return -1 ;
3851+ }
3852+ break ;
3853+ }
3854+
37983855 case FASTQ_OPT_RNUM :
37993856 x -> rnum = 1 ;
38003857 break ;
@@ -3907,6 +3964,40 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39073964 x -> name .s [x -> name .l -= 2 ] = 0 ;
39083965 }
39093966
3967+ // Strip Illumina formatted UMI off read-name
3968+ char UMI_seq [256 ]; // maximum length in spec
3969+ size_t UMI_len = 0 ;
3970+ if (x -> UMI [0 ][0 ]) {
3971+ regmatch_t match [3 ];
3972+ if (regexec (& x -> regex , x -> name .s , 2 , match , 0 ) == 0
3973+ && match [0 ].rm_so >= 0 // whole regex
3974+ && match [1 ].rm_so >= 0 ) { // bracketted UMI component
3975+ UMI_len = match [1 ].rm_eo - match [1 ].rm_so ;
3976+ if (UMI_len > 255 ) {
3977+ hts_log_error ("SAM read name is too long" );
3978+ return -2 ;
3979+ }
3980+
3981+ // The SAMTags spec recommends (but not requires) separating
3982+ // barcodes with hyphen ('-').
3983+ size_t i ;
3984+ for (i = 0 ; i < UMI_len ; i ++ )
3985+ UMI_seq [i ] = isalpha_c (x -> name .s [i + match [1 ].rm_so ])
3986+ ? x -> name .s [i + match [1 ].rm_so ]
3987+ : '-' ;
3988+ UMI_seq [UMI_len ++ ] = 0 ;
3989+
3990+ // Move any trailing #num earlier in the name
3991+ x -> name .l = match [1 ].rm_so ;
3992+ if (x -> name .l > 0 && x -> name .s [x -> name .l - 1 ] == ':' )
3993+ x -> name .l -- ; // remove colon too
3994+ char * cp = x -> name .s + match [1 ].rm_eo ;
3995+ while (* cp )
3996+ x -> name .s [x -> name .l ++ ] = * cp ++ ;
3997+ x -> name .s [x -> name .l ] = 0 ;
3998+ }
3999+ }
4000+
39104001 // Convert to BAM
39114002 ret = bam_set1 (b ,
39124003 x -> name .s + x -> name .l - name , name ,
@@ -3918,6 +4009,12 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39184009 0 );
39194010 if (ret < 0 ) return -2 ;
39204011
4012+ // Add UMI tag if removed from read-name above
4013+ if (UMI_len ) {
4014+ if (bam_aux_append (b , x -> UMI [0 ], 'Z' , UMI_len , (uint8_t * )UMI_seq ) < 0 )
4015+ ret = -2 ;
4016+ }
4017+
39214018 // Identify Illumina CASAVA strings.
39224019 // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
39234020 char * barcode = NULL ;
@@ -4260,6 +4357,39 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
42604357 if (kputc (x -> nprefix , str ) == EOF || kputs (bam_get_qname (b ), str ) == EOF )
42614358 return -1 ;
42624359
4360+ // UMI tag
4361+ if (x && * x -> UMI [0 ]) {
4362+ // Temporary copy of '#num' if present
4363+ char plex [256 ];
4364+ size_t len = str -> l ;
4365+ while (len && str -> s [len ] != ':' && str -> s [len ] != '#' )
4366+ len -- ;
4367+
4368+ if (str -> s [len ] == '#' && str -> l - len < 255 ) {
4369+ memcpy (plex , & str -> s [len ], str -> l - len );
4370+ plex [str -> l - len ] = 0 ;
4371+ str -> l = len ;
4372+ } else {
4373+ * plex = 0 ;
4374+ }
4375+
4376+ uint8_t * bc = NULL ;
4377+ int n ;
4378+ for (n = 0 ; !bc && n < UMI_TAGS ; n ++ )
4379+ bc = bam_aux_get (b , x -> UMI [n ]);
4380+ if (bc && * bc == 'Z' ) {
4381+ int err = kputc (':' , str ) < 0 ;
4382+ // Replace any non-alpha with '+'
4383+ while (* ++ bc )
4384+ err |= kputc (isalpha_c (* bc ) ? toupper_c (* bc ) : '+' , str ) < 0 ;
4385+ if (err )
4386+ return -1 ;
4387+ }
4388+
4389+ if (* plex && kputs (plex , str ) < 0 )
4390+ return -1 ;
4391+ }
4392+
42634393 // /1 or /2 suffix
42644394 if (x && x -> rnum && (flag & BAM_FPAIRED )) {
42654395 int r12 = flag & (BAM_FREAD1 | BAM_FREAD2 );
0 commit comments