@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
3636#include <signal.h>
3737#include <inttypes.h>
3838#include <unistd.h>
39+ #include <regex.h>
3940
4041#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4142#include "fuzz_settings.h"
@@ -3715,6 +3716,7 @@ typedef struct {
37153716 khash_t (tag ) * tags ; // which aux tags to use (if empty, use all).
37163717 char nprefix ;
37173718 int sra_names ;
3719+ regex_t regex ;
37183720} fastq_state ;
37193721
37203722// Initialise fastq state.
@@ -3725,6 +3727,12 @@ static fastq_state *fastq_state_init(int name_char) {
37253727 return NULL ;
37263728 strcpy (x -> BC , "BC" );
37273729 x -> nprefix = name_char ;
3730+ // Default Illumina naming convention
3731+ char * re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#]+)" ;
3732+ if (regcomp (& x -> regex , re , REG_EXTENDED ) != 0 ) {
3733+ free (x );
3734+ return NULL ;
3735+ }
37283736
37293737 return x ;
37303738}
@@ -3737,6 +3745,7 @@ void fastq_state_destroy(htsFile *fp) {
37373745 ks_free (& x -> name );
37383746 ks_free (& x -> seq );
37393747 ks_free (& x -> qual );
3748+ regfree (& x -> regex );
37403749 free (fp -> state );
37413750 }
37423751}
@@ -3798,6 +3807,7 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
37983807 }
37993808
38003809 case FASTQ_OPT_UMI : {
3810+ // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
38013811 va_start (args , opt );
38023812 char * bc = va_arg (args , char * ), * bc_orig = bc ;
38033813 va_end (args );
@@ -3817,6 +3827,7 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
38173827 break ;
38183828 }
38193829 bc += (* bc == ',' );
3830+ x -> UMI [ntags ][2 ] = 0 ;
38203831 }
38213832 for (; ntags < UMI_TAGS ; ntags ++ )
38223833 x -> UMI [ntags ][0 ] = x -> UMI [ntags ][1 ] = x -> UMI [ntags ][2 ] = 0 ;
@@ -3828,6 +3839,19 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
38283839 break ;
38293840 }
38303841
3842+ case FASTQ_OPT_UMI_REGEX : {
3843+ va_start (args , opt );
3844+ char * re = va_arg (args , char * );
3845+ va_end (args );
3846+
3847+ regfree (& x -> regex );
3848+ if (regcomp (& x -> regex , re , REG_EXTENDED ) != 0 ) {
3849+ hts_log_error ("Regular expression '%s' is not supported" , re );
3850+ return -1 ;
3851+ }
3852+ break ;
3853+ }
3854+
38313855 case FASTQ_OPT_RNUM :
38323856 x -> rnum = 1 ;
38333857 break ;
@@ -3944,30 +3968,23 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39443968 char UMI_seq [256 ]; // maximum length in spec
39453969 size_t UMI_len = 0 ;
39463970 if (x -> UMI [0 ][0 ]) {
3947- // 7th colon
3948- char * cp = x -> name .s ;
3949- int nc = 0 ;
3950- for (nc = 0 ; nc < 7 ; nc ++ , cp ++ ) {
3951- if ((cp = strchr (cp , ':' )) == NULL )
3952- break ;
3953- }
3954- if (cp ) {
3955- x -> name .l = cp - x -> name .s - 1 ;
3956-
3957- // UMI ends at string end or '#num'.
3958- char * UMI_start = cp ;
3959- while (* cp && * cp != '#' ) {
3960- UMI_len ++ ;
3961- cp ++ ;
3962- }
3971+ regmatch_t match [3 ];
3972+ if (regexec (& x -> regex , x -> name .s , 2 , match , 0 ) == 0
3973+ && match [0 ].rm_so >= 0 // whole regex
3974+ && match [1 ].rm_so >= 0 ) { // bracketted UMI component
3975+ UMI_len = match [1 ].rm_eo - match [1 ].rm_so ;
39633976 if (UMI_len > 255 ) {
39643977 hts_log_error ("SAM read name is too long" );
39653978 return -2 ;
39663979 }
3967- memcpy (UMI_seq , UMI_start , UMI_len );
3980+ memcpy (UMI_seq , x -> name . s + match [ 1 ]. rm_so , UMI_len );
39683981 UMI_seq [UMI_len ++ ] = 0 ;
39693982
39703983 // Move any trailing #num earlier in the name
3984+ x -> name .l = match [1 ].rm_so ;
3985+ if (x -> name .l > 0 && x -> name .s [x -> name .l - 1 ] == ':' )
3986+ x -> name .l -- ; // remove colon too
3987+ char * cp = x -> name .s + match [1 ].rm_eo ;
39713988 while (* cp )
39723989 x -> name .s [x -> name .l ++ ] = * cp ++ ;
39733990 x -> name .s [x -> name .l ] = 0 ;
0 commit comments