Skip to content

Commit 710291e

Browse files
committed
Add UMI support to FASTQ input/output.
The fastq_umi option (FASTQ_OPT_UMI enum) is used to enable UMI parsing in read names. When reading FASTQ it converts the 8th Illumina field to an aux tag (default to RX). We may need to amend this if people require it to work on earlier Illumina naming systems, but for now we target the current software. When writing FASTQ we hunt for a series of tags and choose the first one found. RX is the usual one, but users may wish to also use OX if they potentially have error corrected data in RX and want to regenerate fastq from the original uncorrect tag. Complexities arrive when dealing with /1 or /2 and #num multi-plexing strings, meaning we have to use temporary buffers rather than simply truncating the read names. Note we convert dual-index UMIs of SEQ+SEQ to RX:Z:SEQ-SEQ as per the SAMtags recommendation. However it's a bit wild west out there and not everyone does this. (I've seen 10X outputs with CR using underscores for example.) This is a best efforts approach to reduce the likelihood of incompatibility with downstream pipelines.
1 parent 1e3cef4 commit 710291e

File tree

6 files changed

+196
-0
lines changed

6 files changed

+196
-0
lines changed

hts.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,6 +1208,14 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) {
12081208
strcmp(o->arg, "FASTQ_NAME2") == 0)
12091209
o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
12101210

1211+
else if (strcmp(o->arg, "fastq_umi") == 0 ||
1212+
strcmp(o->arg, "FASTQ_UMI") == 0)
1213+
o->opt = FASTQ_OPT_UMI, o->val.s = val;
1214+
1215+
else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1216+
strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1217+
o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1218+
12111219
else {
12121220
hts_log_error("Unknown option '%s'", o->arg);
12131221
free(o->arg);
@@ -1250,6 +1258,8 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) {
12501258
case HTS_OPT_FILTER:
12511259
case FASTQ_OPT_AUX:
12521260
case FASTQ_OPT_BARCODE:
1261+
case FASTQ_OPT_UMI:
1262+
case FASTQ_OPT_UMI_REGEX:
12531263
if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
12541264
return -1;
12551265
break;
@@ -1841,6 +1851,8 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
18411851
return 0;
18421852

18431853
case FASTQ_OPT_BARCODE:
1854+
case FASTQ_OPT_UMI:
1855+
case FASTQ_OPT_UMI_REGEX:
18441856
if (fp->format.format == fastq_format ||
18451857
fp->format.format == fasta_format) {
18461858
va_start(args, opt);

htslib/hts.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,16 @@ enum hts_fmt_option {
366366
// name to the second field and insert a constructed <run>.<number>
367367
// name in its place.
368368
FASTQ_OPT_NAME2,
369+
370+
// Process the UMI tag. Tag or Tag,tag,tag...
371+
// On read, this converts the last read-name element (Illumina) to the tag.
372+
// On write, it queries the tags in turn and copies the first found
373+
// to the read name suffix, converting any non-alpha to "+".
374+
FASTQ_OPT_UMI,
375+
376+
// Regex to use for matching read name.
377+
// Def: "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]*)"
378+
FASTQ_OPT_UMI_REGEX,
369379
};
370380

371381
// Profile options for encoding; primarily used at present in CRAM

sam.c

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
3636
#include <signal.h>
3737
#include <inttypes.h>
3838
#include <unistd.h>
39+
#include <regex.h>
3940

4041
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4142
#include "fuzz_settings.h"
@@ -3701,6 +3702,7 @@ int sam_set_threads(htsFile *fp, int nthreads) {
37013702
return 0;
37023703
}
37033704

3705+
#define UMI_TAGS 5
37043706
typedef struct {
37053707
kstring_t name;
37063708
kstring_t comment; // NB: pointer into name, do not free
@@ -3710,9 +3712,11 @@ typedef struct {
37103712
int aux;
37113713
int rnum;
37123714
char BC[3]; // aux tag ID for barcode
3715+
char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
37133716
khash_t(tag) *tags; // which aux tags to use (if empty, use all).
37143717
char nprefix;
37153718
int sra_names;
3719+
regex_t regex;
37163720
} fastq_state;
37173721

37183722
// Initialise fastq state.
@@ -3723,6 +3727,12 @@ static fastq_state *fastq_state_init(int name_char) {
37233727
return NULL;
37243728
strcpy(x->BC, "BC");
37253729
x->nprefix = name_char;
3730+
// Default Illumina naming convention
3731+
char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]*)";
3732+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3733+
free(x);
3734+
return NULL;
3735+
}
37263736

37273737
return x;
37283738
}
@@ -3735,6 +3745,7 @@ void fastq_state_destroy(htsFile *fp) {
37353745
ks_free(&x->name);
37363746
ks_free(&x->seq);
37373747
ks_free(&x->qual);
3748+
regfree(&x->regex);
37383749
free(fp->state);
37393750
}
37403751
}
@@ -3795,6 +3806,52 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
37953806
break;
37963807
}
37973808

3809+
case FASTQ_OPT_UMI: {
3810+
// UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3811+
va_start(args, opt);
3812+
char *bc = va_arg(args, char *), *bc_orig = bc;
3813+
va_end(args);
3814+
if (!bc || strcmp(bc, "1") == 0)
3815+
bc = "RX";
3816+
int ntags = 0, err = 0;
3817+
for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3818+
if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3819+
err = 1;
3820+
break;
3821+
}
3822+
3823+
strncpy(x->UMI[ntags], bc, 3);
3824+
bc += 2;
3825+
if (*bc && *bc != ',') {
3826+
err = 1;
3827+
break;
3828+
}
3829+
bc+=(*bc==',');
3830+
x->UMI[ntags][2] = 0;
3831+
}
3832+
for (; ntags < UMI_TAGS; ntags++)
3833+
x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3834+
3835+
3836+
if (err)
3837+
hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3838+
3839+
break;
3840+
}
3841+
3842+
case FASTQ_OPT_UMI_REGEX: {
3843+
va_start(args, opt);
3844+
char *re = va_arg(args, char *);
3845+
va_end(args);
3846+
3847+
regfree(&x->regex);
3848+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3849+
hts_log_error("Regular expression '%s' is not supported", re);
3850+
return -1;
3851+
}
3852+
break;
3853+
}
3854+
37983855
case FASTQ_OPT_RNUM:
37993856
x->rnum = 1;
38003857
break;
@@ -3907,6 +3964,43 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39073964
x->name.s[x->name.l-=2] = 0;
39083965
}
39093966

3967+
// Strip Illumina formatted UMI off read-name
3968+
char UMI_seq[256]; // maximum length in spec
3969+
size_t UMI_len = 0;
3970+
if (x->UMI[0][0]) {
3971+
regmatch_t match[3];
3972+
if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
3973+
&& match[0].rm_so >= 0 // whole regex
3974+
&& match[1].rm_so >= 0) { // bracketted UMI component
3975+
UMI_len = match[1].rm_eo - match[1].rm_so;
3976+
if (UMI_len > 255) {
3977+
hts_log_error("SAM read name is too long");
3978+
return -2;
3979+
}
3980+
3981+
// The SAMTags spec recommends (but not requires) separating
3982+
// barcodes with hyphen ('-').
3983+
size_t i;
3984+
for (i = 0; i < UMI_len; i++)
3985+
UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
3986+
? x->name.s[i+match[1].rm_so]
3987+
: '-';
3988+
3989+
// Move any trailing #num earlier in the name
3990+
if (UMI_len) {
3991+
UMI_seq[UMI_len++] = 0;
3992+
3993+
x->name.l = match[1].rm_so;
3994+
if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
3995+
x->name.l--; // remove colon too
3996+
char *cp = x->name.s + match[1].rm_eo;
3997+
while (*cp)
3998+
x->name.s[x->name.l++] = *cp++;
3999+
x->name.s[x->name.l] = 0;
4000+
}
4001+
}
4002+
}
4003+
39104004
// Convert to BAM
39114005
ret = bam_set1(b,
39124006
x->name.s + x->name.l - name, name,
@@ -3918,6 +4012,12 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39184012
0);
39194013
if (ret < 0) return -2;
39204014

4015+
// Add UMI tag if removed from read-name above
4016+
if (UMI_len) {
4017+
if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4018+
ret = -2;
4019+
}
4020+
39214021
// Identify Illumina CASAVA strings.
39224022
// <read>:<is_filtered>:<control_bits>:<barcode_sequence>
39234023
char *barcode = NULL;
@@ -4260,6 +4360,39 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
42604360
if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
42614361
return -1;
42624362

4363+
// UMI tag
4364+
if (x && *x->UMI[0]) {
4365+
// Temporary copy of '#num' if present
4366+
char plex[256];
4367+
size_t len = str->l;
4368+
while (len && str->s[len] != ':' && str->s[len] != '#')
4369+
len--;
4370+
4371+
if (str->s[len] == '#' && str->l - len < 255) {
4372+
memcpy(plex, &str->s[len], str->l - len);
4373+
plex[str->l - len] = 0;
4374+
str->l = len;
4375+
} else {
4376+
*plex = 0;
4377+
}
4378+
4379+
uint8_t *bc = NULL;
4380+
int n;
4381+
for (n = 0; !bc && n < UMI_TAGS; n++)
4382+
bc = bam_aux_get(b, x->UMI[n]);
4383+
if (bc && *bc == 'Z') {
4384+
int err = kputc(':', str) < 0;
4385+
// Replace any non-alpha with '+'
4386+
while (*++bc)
4387+
err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4388+
if (err)
4389+
return -1;
4390+
}
4391+
4392+
if (*plex && kputs(plex, str) < 0)
4393+
return -1;
4394+
}
4395+
42634396
// /1 or /2 suffix
42644397
if (x && x->rnum && (flag & BAM_FPAIRED)) {
42654398
int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);

test/fastq/UMI.fq

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
@HS25:09827:FID:2:1201:1505:59794
2+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
3+
+
4+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
5+
@HS25:09827:FID:2:1201:1505:59795:A
6+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
7+
+
8+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
9+
@HS25:09827:FID:2:1201:1505:59796:ACG+TTTTTA
10+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
11+
+
12+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
13+
@HS25:09827:FID:2:1201:1505:59797:TACG+TTTTTA/1
14+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
15+
+
16+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
17+
@HS25:09827:FID:2:1201:1505:59797:TACG+TTTTTA/2
18+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
19+
+
20+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
21+
@HS25:09827:FID:2:1201:1505:59798:TTA#49/1
22+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
23+
+
24+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
25+
@HS25:09827:FID:2:1201:1505:59798:TTA#49/2
26+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
27+
+
28+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI

test/fastq/UMI.sam

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
HS25:09827:FID:2:1201:1505:59794 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
2+
HS25:09827:FID:2:1201:1505:59795 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:A
3+
HS25:09827:FID:2:1201:1505:59796 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:ACG-TTTTTA
4+
HS25:09827:FID:2:1201:1505:59797 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TACG-TTTTTA
5+
HS25:09827:FID:2:1201:1505:59797 141 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TACG-TTTTTA
6+
HS25:09827:FID:2:1201:1505:59798#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TTA
7+
HS25:09827:FID:2:1201:1505:59798#49 141 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TTA

test/fastq/fastq.tst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ P r2-q.sam $tview -i fastq_aux r2.fa
8484
P name2.sam $tview -i fastq_name2 name2.fq
8585
P name2-q.sam $tview -i fastq_name2 name2.fa
8686

87+
# UMI barcodes
88+
P UMI.sam $tview -i fastq_umi=RX UMI.fq
89+
8790
# --------------------
8891
# Writing
8992

@@ -114,3 +117,6 @@ P r1.fq $tview -f -o fastq_aux -o fastq_rnum r1.sam
114117
P r2.fq $tview -f -o fastq_aux -o fastq_rnum r2.sam
115118
P r1.fa $tview -F -o fastq_aux -o fastq_rnum r1.sam
116119
P r2.fa $tview -F -o fastq_aux -o fastq_rnum r2.sam
120+
121+
# UMI barcodes
122+
P UMI.fq $tview -f -o fastq_rnum -o fastq_umi UMI.sam

0 commit comments

Comments
 (0)