Skip to content

Commit 40b8c82

Browse files
committed
Improve the UUID4 detection.
We already had code for spotting this in search_trie, so this commit improves that check a little and uses it in encode_name instead of doing a second scan. It also improves the compression of mixed data sets. This still isn't optimal, as we'd need to start separating the name classes and adding NOP tokens, but it's often a 10-20% compression improvement.
1 parent c666474 commit 40b8c82

File tree

1 file changed

+22
-34
lines changed

1 file changed

+22
-34
lines changed

htscodecs/tokenise_name3.c

Lines changed: 22 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -610,12 +610,16 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
610610
prefix_len = 6; // IonTorrent
611611
*fixed_len = 6;
612612
*is_fixed = 1;
613-
} else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
614-
((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
615-
((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
613+
} else if (l >= 36
614+
&& d[f+8]=='-' && d[f+13]=='-' && d[f+18]=='-' && d[f+23]=='-'
615+
&& isxdigit((uint8_t)d[f+0]) && isxdigit((uint8_t)d[f+7])
616+
&& isxdigit((uint8_t)d[f+9]) && isxdigit((uint8_t)d[f+12])
617+
&& isxdigit((uint8_t)d[f+14]) && isxdigit((uint8_t)d[f+17])
618+
&& isxdigit((uint8_t)d[f+19]) && isxdigit((uint8_t)d[f+22])
619+
&& isxdigit((uint8_t)d[f+24]) && isxdigit((uint8_t)d[f+35])) {
616620
// ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
617-
prefix_len = 37;
618-
*fixed_len = 37;
621+
prefix_len = 36;
622+
*fixed_len = 36;
619623
*is_fixed = 1;
620624
} else {
621625
// Check Illumina and trim back to lane:tile:x:y.
@@ -638,7 +642,6 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
638642
*is_fixed = 0;
639643
}
640644
}
641-
//prefix_len = INT_MAX;
642645

643646
if (!ctx->t_head) {
644647
ctx->t_head = calloc(1, sizeof(*ctx->t_head));
@@ -647,6 +650,7 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
647650
}
648651

649652
// Find an item in the trie
653+
int from_punct = from;
650654
for (nlines = i = 0; i < len; i++, nlines++) {
651655
t = ctx->t_head;
652656
while (i < len && data[i] > '\n') {
@@ -661,24 +665,18 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
661665
x = x->sibling;
662666
t = x;
663667

664-
// t = t->next[c];
665-
666-
// if (!t)
667-
// return -1;
668-
669668
from = t->n;
669+
if ((ispunct(c) || isspace(c)) && t->n != n)
670+
from_punct = t->n;
670671
if (i == prefix_len) p3 = t->n;
671-
//if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
672-
//if (i == 60) p3 = t->n; // pacbio
673-
//if (i == 7) p3 = t->n; // iontorrent
674672
t->n = n;
675673
}
676674
}
677675

678676
//printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
679677

680678
*exact = (n != from) && len;
681-
return *exact ? from : p3;
679+
return *exact ? from : (p3 != -1 ? p3 : from_punct);
682680
}
683681

684682

@@ -731,17 +729,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
731729
encode_token_diff(ctx, cnum-pnum);
732730
int ntok = 1;
733731

734-
// Look for common form of UUID4 names and special case them
735-
i = 0;
736-
if (len == 36) {
737-
for (i = 0; i < len; i++) {
738-
if (!(isxdigit((uint8_t)name[i]) || name[i] == '-'))
739-
break;
740-
}
741-
}
742-
743-
// Is uuid4 (eg ONT).
744-
if (i == len) {
732+
if (fixed_len == 36) {
733+
// ONT uuid4 format data
745734
if (37 >= ctx->max_tok) {
746735
do {
747736
memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
@@ -752,17 +741,15 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
752741
#ifdef ENC_DEBUG
753742
fprintf(stderr, "Tok %d (%d x uuid chr)", ntok, len);
754743
#endif
755-
//encode_token_nop(ctx, ntok++);
756-
for (i = 0; i < len; i++, ntok++) {
744+
for (i = 0; i < 36; i++, ntok++) {
757745
encode_token_char(ctx, ntok, name[i]);
758746
ctx->lc[cnum].last[ntok].token_int = name[i];
759747
ctx->lc[cnum].last[ntok].token_type = N_CHAR;
760748
}
761-
goto end;
762-
}
763-
764-
i = 0;
765-
if (is_fixed) {
749+
is_fixed = 0;
750+
i = 36;
751+
} else if (is_fixed) {
752+
// Other fixed length data
766753
if (ntok >= ctx->max_tok) {
767754
memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
768755
memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
@@ -782,6 +769,8 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
782769
ctx->lc[cnum].last[ntok].token_str = 0;
783770
ctx->lc[cnum].last[ntok++].token_type = N_ALPHA;
784771
i = fixed_len;
772+
} else {
773+
i = 0;
785774
}
786775

787776
for (; i < len; i++) {
@@ -997,7 +986,6 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
997986
//putchar(' ');
998987
}
999988

1000-
end:
1001989
#ifdef ENC_DEBUG
1002990
fprintf(stderr, "Tok %d (end)\n", N_END);
1003991
#endif

0 commit comments

Comments
 (0)