Skip to content

Commit a532924

Browse files
authored
Add dedicated UUID4 (ONT) recognition to the name tokeniser (PR #134)
Improved existing code for spotting UUIDs in search_trie, and updated encode_name to store them when found. Also improves the compression of mixed data sets. This still isn't optimal as we'd need to start separating the name classes and adding NOP tokens, but it's often a 10-20% compression improvement. We still need to do more work on optimising name compression in a general manner, especially on mixed data sets, but this resolves a simple case while having no impact on any other name formats.
1 parent 5392126 commit a532924

File tree

1 file changed

+37
-18
lines changed

1 file changed

+37
-18
lines changed

htscodecs/tokenise_name3.c

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -610,12 +610,16 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
610610
prefix_len = 6; // IonTorrent
611611
*fixed_len = 6;
612612
*is_fixed = 1;
613-
} else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
614-
((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
615-
((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
613+
} else if (l >= 36
614+
&& d[f+8]=='-' && d[f+13]=='-' && d[f+18]=='-' && d[f+23]=='-'
615+
&& isxdigit((uint8_t)d[f+0]) && isxdigit((uint8_t)d[f+7])
616+
&& isxdigit((uint8_t)d[f+9]) && isxdigit((uint8_t)d[f+12])
617+
&& isxdigit((uint8_t)d[f+14]) && isxdigit((uint8_t)d[f+17])
618+
&& isxdigit((uint8_t)d[f+19]) && isxdigit((uint8_t)d[f+22])
619+
&& isxdigit((uint8_t)d[f+24]) && isxdigit((uint8_t)d[f+35])) {
616620
// ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
617-
prefix_len = 37;
618-
*fixed_len = 37;
621+
prefix_len = 36;
622+
*fixed_len = 36;
619623
*is_fixed = 1;
620624
} else {
621625
// Check Illumina and trim back to lane:tile:x:y.
@@ -638,7 +642,6 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
638642
*is_fixed = 0;
639643
}
640644
}
641-
//prefix_len = INT_MAX;
642645

643646
if (!ctx->t_head) {
644647
ctx->t_head = calloc(1, sizeof(*ctx->t_head));
@@ -647,6 +650,7 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
647650
}
648651

649652
// Find an item in the trie
653+
int from_punct = from;
650654
for (nlines = i = 0; i < len; i++, nlines++) {
651655
t = ctx->t_head;
652656
while (i < len && data[i] > '\n') {
@@ -661,24 +665,18 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
661665
x = x->sibling;
662666
t = x;
663667

664-
// t = t->next[c];
665-
666-
// if (!t)
667-
// return -1;
668-
669668
from = t->n;
669+
if ((ispunct(c) || isspace(c)) && t->n != n)
670+
from_punct = t->n;
670671
if (i == prefix_len) p3 = t->n;
671-
//if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
672-
//if (i == 60) p3 = t->n; // pacbio
673-
//if (i == 7) p3 = t->n; // iontorrent
674672
t->n = n;
675673
}
676674
}
677675

678676
//printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
679677

680678
*exact = (n != from) && len;
681-
return *exact ? from : p3;
679+
return *exact ? from : (p3 != -1 ? p3 : from_punct);
682680
}
683681

684682

@@ -729,10 +727,29 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
729727
if (!ctx->lc[cnum].last)
730728
return -1;
731729
encode_token_diff(ctx, cnum-pnum);
732-
733730
int ntok = 1;
734-
i = 0;
735-
if (is_fixed) {
731+
732+
if (fixed_len == 36) {
733+
// ONT uuid4 format data
734+
if (37 >= ctx->max_tok) {
735+
do {
736+
memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
737+
memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
738+
memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
739+
} while (ctx->max_tok++ < 37);
740+
}
741+
#ifdef ENC_DEBUG
742+
fprintf(stderr, "Tok %d (%d x uuid chr)", ntok, len);
743+
#endif
744+
for (i = 0; i < 36; i++, ntok++) {
745+
encode_token_char(ctx, ntok, name[i]);
746+
ctx->lc[cnum].last[ntok].token_int = name[i];
747+
ctx->lc[cnum].last[ntok].token_type = N_CHAR;
748+
}
749+
is_fixed = 0;
750+
i = 36;
751+
} else if (is_fixed) {
752+
// Other fixed length data
736753
if (ntok >= ctx->max_tok) {
737754
memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
738755
memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
@@ -752,6 +769,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
752769
ctx->lc[cnum].last[ntok].token_str = 0;
753770
ctx->lc[cnum].last[ntok++].token_type = N_ALPHA;
754771
i = fixed_len;
772+
} else {
773+
i = 0;
755774
}
756775

757776
for (; i < len; i++) {

0 commit comments

Comments
 (0)