Skip to content

Commit 653ac56

Browse files
committed
Switch to regexp for matching Illumina read names (FASTQ UMI).
The default regexp is still the 8th component: "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#]+)" This is changed using the fastq_umi_regex option.
1 parent 1d64016 commit 653ac56

File tree

3 files changed

+44
-17
lines changed

3 files changed

+44
-17
lines changed

hts.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,6 +1212,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) {
12121212
strcmp(o->arg, "FASTQ_UMI") == 0)
12131213
o->opt = FASTQ_OPT_UMI, o->val.s = val;
12141214

1215+
else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1216+
strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1217+
o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1218+
12151219
else {
12161220
hts_log_error("Unknown option '%s'", o->arg);
12171221
free(o->arg);
@@ -1255,6 +1259,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) {
12551259
case FASTQ_OPT_AUX:
12561260
case FASTQ_OPT_BARCODE:
12571261
case FASTQ_OPT_UMI:
1262+
case FASTQ_OPT_UMI_REGEX:
12581263
if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
12591264
return -1;
12601265
break;
@@ -1847,6 +1852,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
18471852

18481853
case FASTQ_OPT_BARCODE:
18491854
case FASTQ_OPT_UMI:
1855+
case FASTQ_OPT_UMI_REGEX:
18501856
if (fp->format.format == fastq_format ||
18511857
fp->format.format == fasta_format) {
18521858
va_start(args, opt);

htslib/hts.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,10 @@ enum hts_fmt_option {
372372
// On write, it queries the tags in turn and copies the first found
373373
// to the read name suffix, converting any non-alpha to "+".
374374
FASTQ_OPT_UMI,
375+
376+
// Regex to use for matching read name.
377+
// Def: "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#]+)"
378+
FASTQ_OPT_UMI_REGEX,
375379
};
376380

377381
// Profile options for encoding; primarily used at present in CRAM

sam.c

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
3636
#include <signal.h>
3737
#include <inttypes.h>
3838
#include <unistd.h>
39+
#include <regex.h>
3940

4041
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4142
#include "fuzz_settings.h"
@@ -3715,6 +3716,7 @@ typedef struct {
37153716
khash_t(tag) *tags; // which aux tags to use (if empty, use all).
37163717
char nprefix;
37173718
int sra_names;
3719+
regex_t regex;
37183720
} fastq_state;
37193721

37203722
// Initialise fastq state.
@@ -3725,6 +3727,12 @@ static fastq_state *fastq_state_init(int name_char) {
37253727
return NULL;
37263728
strcpy(x->BC, "BC");
37273729
x->nprefix = name_char;
3730+
// Default Illumina naming convention
3731+
char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#]+)";
3732+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3733+
free(x);
3734+
return NULL;
3735+
}
37283736

37293737
return x;
37303738
}
@@ -3737,6 +3745,7 @@ void fastq_state_destroy(htsFile *fp) {
37373745
ks_free(&x->name);
37383746
ks_free(&x->seq);
37393747
ks_free(&x->qual);
3748+
regfree(&x->regex);
37403749
free(fp->state);
37413750
}
37423751
}
@@ -3798,6 +3807,7 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
37983807
}
37993808

38003809
case FASTQ_OPT_UMI: {
3810+
// UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
38013811
va_start(args, opt);
38023812
char *bc = va_arg(args, char *), *bc_orig = bc;
38033813
va_end(args);
@@ -3817,6 +3827,7 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
38173827
break;
38183828
}
38193829
bc+=(*bc==',');
3830+
x->UMI[ntags][2] = 0;
38203831
}
38213832
for (; ntags < UMI_TAGS; ntags++)
38223833
x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
@@ -3828,6 +3839,19 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
38283839
break;
38293840
}
38303841

3842+
case FASTQ_OPT_UMI_REGEX: {
3843+
va_start(args, opt);
3844+
char *re = va_arg(args, char *);
3845+
va_end(args);
3846+
3847+
regfree(&x->regex);
3848+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3849+
hts_log_error("Regular expression '%s' is not supported", re);
3850+
return -1;
3851+
}
3852+
break;
3853+
}
3854+
38313855
case FASTQ_OPT_RNUM:
38323856
x->rnum = 1;
38333857
break;
@@ -3944,30 +3968,23 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39443968
char UMI_seq[256]; // maximum length in spec
39453969
size_t UMI_len = 0;
39463970
if (x->UMI[0][0]) {
3947-
// 7th colon
3948-
char *cp = x->name.s;
3949-
int nc = 0;
3950-
for (nc = 0; nc < 7; nc++, cp++) {
3951-
if ((cp = strchr(cp, ':')) == NULL)
3952-
break;
3953-
}
3954-
if (cp) {
3955-
x->name.l = cp - x->name.s - 1;
3956-
3957-
// UMI ends at string end or '#num'.
3958-
char *UMI_start = cp;
3959-
while (*cp && *cp != '#') {
3960-
UMI_len++;
3961-
cp++;
3962-
}
3971+
regmatch_t match[3];
3972+
if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
3973+
&& match[0].rm_so >= 0 // whole regex
3974+
&& match[1].rm_so >= 0) { // bracketted UMI component
3975+
UMI_len = match[1].rm_eo - match[1].rm_so;
39633976
if (UMI_len > 255) {
39643977
hts_log_error("SAM read name is too long");
39653978
return -2;
39663979
}
3967-
memcpy(UMI_seq, UMI_start, UMI_len);
3980+
memcpy(UMI_seq, x->name.s + match[1].rm_so, UMI_len);
39683981
UMI_seq[UMI_len++] = 0;
39693982

39703983
// Move any trailing #num earlier in the name
3984+
x->name.l = match[1].rm_so;
3985+
if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
3986+
x->name.l--; // remove colon too
3987+
char *cp = x->name.s + match[1].rm_eo;
39713988
while (*cp)
39723989
x->name.s[x->name.l++] = *cp++;
39733990
x->name.s[x->name.l] = 0;

0 commit comments

Comments
 (0)