samtools · rhpvorderman · May 7, 2025 · May 12, 2025 · May 12, 2025 · May 12, 2025
diff --git a/htscodecs/tokenise_name3.c b/htscodecs/tokenise_name3.c
@@ -681,6 +681,51 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
     return *exact ? from : p3;
 }
 
+enum char_types {
+    DIGIT = 0,          // 0b0000 0000; OR-ing it in leaves a token type unchanged
+    HEXDIGIT_LOWER = 1, // 0b0000 0001; a-f in CHAR_TO_TYPE
+    HEXDIGIT_UPPER = 2, // 0b0000 0010; A-F in CHAR_TO_TYPE; LOWER | UPPER == STRING
+    STRING = 3,         // 0b0000 0011; absorbs every type above when OR-ed
+    SEPARATION = 7,     // 0b0000 0111; ends the token scan, never OR-ed into a type
+};
+
+#define D DIGIT // one-letter aliases used to write the CHAR_TO_TYPE rows
+#define L HEXDIGIT_LOWER
+#define U HEXDIGIT_UPPER
+#define S STRING 
+#define P SEPARATION // NOTE(review): no #undef for D/L/U/S/P is visible in this hunk - confirm
+
+/* Byte -> char_types lookup used by the name tokeniser (index with a uint8_t).
+Unlike `ispunct`, some punctuation is classified as STRING rather than
+SEPARATION: '$', '!', '?' and '@' occur inside ordinary values, and '+' and
+'&' often join multiple values in one field. NOTE(review): ',' was listed with
+'+' and '&' but is P below - confirm which is intended. Read-only, hence const. */
+static const uint8_t CHAR_TO_TYPE[256] = {
+// Control characters
+    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
+    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
+//   , !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /
+    P, S, P, P, S, P, S, P, P, P, P, S, P, P, P, P,
+//  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, ;, <, =, >, ?
+    D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, S,
+//  @, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
+    S, U, U, U, U, U, U, S, S, S, S, S, S, S, S, S,
+//  P, Q, R, S, T, U, V, W, X, Y, Z, [, \, ], ^, _
+    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
+//  `, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
+    P, L, L, L, L, L, L, S, S, S, S, S, S, S, S, S,
+//  p, q, r, s, t, u, v, w, x, y, z, {, |, }, ~, DEL
+    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
+// Assume all non-ASCII characters (0x80-0xFF) are strings
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+};
 
 //-----------------------------------------------------------------------------
 // Name encoder
@@ -764,121 +809,159 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             ctx->max_tok = ntok+1;
         }
 
-        /* Determine data type of this segment */
-        if (isalpha((uint8_t)name[i])) {
-            int s = i+1;
-//          int S = i+1;
-
-//          // FIXME: try which of these is best.  alnum is good sometimes.
-//          while (s < len && isalpha((uint8_t)name[s]))
-            while (s < len && (isalpha((uint8_t)name[s]) ||
-                               ispunct((uint8_t)name[s])))
-//          while (s < len && name[s] != ':')
-//          while (s < len && !isdigit((uint8_t)name[s]) && name[s] != ':')
-                s++;
-
-//          if (!is_fixed) {
-//              while (S < len && isalnum((uint8_t)name[S]))
-//                  S++;
-//              if (s < S)
-//                  s = S;
-//          }
+        uint8_t first_char = (uint8_t)name[i]; 
+        if (CHAR_TO_TYPE[first_char] == SEPARATION) {
+            /* Treat separation characters as separate tokens. */
+            goto n_char;
+        }
+
+        /* Determine segment length and data type */ 
+        int s = i; 
+        uint8_t token_type = DIGIT;
+        while (s < len) {
+            uint8_t c = (uint8_t)name[s];
+            uint8_t char_type = CHAR_TO_TYPE[c];
+            if (char_type == SEPARATION) {
+                break;
+            }
+            token_type |= char_type;
+            s+=1;
+        }
+        char *token = name + i;
+        int token_length = s - i; 
+        int token_starts_with_zero = first_char == '0';
+        int token_is_number = token_type == DIGIT;
+        uint32_t v = 0;
+        if (token_type == DIGIT) {
+            /* Do not encode larger numbers because of uint32_t limits */
+            if (token_length > 9) {
+                token_length = 9;
+            }
+            for (int j=0; j < token_length; j++) {
+                v = v * 10 + token[j] - '0';
+            }
+        }
+        if (token_type == HEXDIGIT_LOWER || token_type == HEXDIGIT_UPPER) {
+            if (token_length > 16) {
+                // Hexadecimal values wider than 64 bits are unlikely; this is most
+                // likely just a string.
+                token_type = STRING;
+            }
+        }
+        if (token_type == STRING && token_length > 1) {
+            // Determine if it is prefix<number> or <number>suffix
+            size_t digit_start = token_length;
+            size_t i = 0;
+            for (;i < token_length; i++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[i]] == DIGIT) {
+                    digit_start = i;
+                    break;
+                }
+            }
+            for (; i < token_length; i++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[i]] != DIGIT) {
+                    break;
+                }
+            }
+            int digit_end = i;
+            int remainder_all_string = 1;
+            for (; i < token_length; i++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[i] == DIGIT]) {
+                    remainder_all_string = 0;
+                }
+            }
+            if (digit_start == 0 && remainder_all_string) {
+                // <number>suffix
+                token_is_number = 1; 
+                token_length = digit_end - digit_start;
+            }
+            if (digit_end == token_length && digit_start != token_length) {
+                // prefix<number>
+                // Only encode the prefix as a string and come back for the number later.
+                token_length = digit_start;
+            }
+        }
+
+        if (!token_is_number) {
 
             // Single byte strings are better encoded as chars.
-            if (s-i == 1) goto n_char;
+            if (token_length == 1) goto n_char;
 
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) {
-                if (s-i == ctx->lc[pnum].last[ntok].token_int &&
-                    memcmp(&name[i], 
+                if (token_length == ctx->lc[pnum].last[ntok].token_int &&
+                    memcmp(token, 
                            &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str],
-                           s-i) == 0) {
+                           token_length) == 0) {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
+                    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, token_length, token);
 #endif
                     if (encode_token_match(ctx, ntok) < 0) return -1;
                 } else {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
-                            s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], s-i, &name[i]);
+                            token_length, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], token_length, token);
 #endif
                     // same token/length, but mismatches
-                    if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
+                    if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
+                fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, token_length, token);
 #endif
-                if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
+                if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
             }
 
-            ctx->lc[cnum].last[ntok].token_int = s-i;
+            ctx->lc[cnum].last[ntok].token_int = token_length;
             ctx->lc[cnum].last[ntok].token_str = i;
             ctx->lc[cnum].last[ntok].token_type = N_ALPHA;
 
             i = s-1;
-        } else if (name[i] == '0') digits0: {
+        } else if (token_starts_with_zero && token_is_number) digits0: {
             // Digits starting with zero; encode length + value
-            uint32_t s = i;
-            uint32_t v = 0;
             int d = 0;
-
-            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
-                v = v*10 + name[s] - '0';
-                //putchar(name[s]);
-                s++;
-            }
-
             // TODO: optimise choice over whether to switch from DIGITS to DELTA
             // regularly vs all DIGITS, also MATCH vs DELTA 0.
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS0) {
                 d = v - ctx->lc[pnum].last[ntok].token_int;
-                if (d == 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
+                if (d == 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
 #endif
                     if (encode_token_match(ctx, ntok) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=0;
-                } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
+                } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (dig0-delta, %d / %d)\n", N_DDELTA0, ctx->lc[pnum].last[ntok].token_int, v);
 #endif
-                    //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                    //if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                     if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=1;
                 } else {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, s-i);
+                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, token_length);
 #endif
-                    if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                    if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                     if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=0;
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, s-i);
+                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, token_length);
 #endif
-                if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
                 //ctx->lc[pnum].last[ntok].token_delta=0;
             }
 
-            ctx->lc[cnum].last[ntok].token_str = s-i; // length
+            ctx->lc[cnum].last[ntok].token_str = token_length;
             ctx->lc[cnum].last[ntok].token_int = v;
             ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
 
-            i = s-1;
-        } else if (isdigit((uint8_t)name[i])) {
+            i = i + token_length - 1;
+        } else if (token_is_number) {
             // digits starting 1-9; encode value
-            uint32_t s = i;
-            uint32_t v = 0;
             int d = 0;
 
-            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
-                v = v*10 + name[s] - '0';
-                //putchar(name[s]);
-                s++;
-            }
-
             // dataset/10/K562_cytosol_LID8465_TopHat_v2.names
             // col 4 is Illumina lane - we don't want match & delta in there
             // as it has multiple lanes (so not ALL match) and delta is just
@@ -892,7 +975,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             // width, sometimes with leading zeros.
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
                 ctx->lc[pnum].last[ntok].token_type == N_DIGITS0 &&
-                ctx->lc[pnum].last[ntok].token_str == s-i)
+                ctx->lc[pnum].last[ntok].token_str == token_length)
                 goto digits0;
 
             // TODO: optimise choice over whether to switch from DIGITS to DELTA
@@ -936,7 +1019,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             ctx->lc[cnum].last[ntok].token_int = v;
             ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
 
-            i = s-1;
+            i = i + token_length - 1;
         } else {
         n_char:
             //if (!isalpha((uint8_t)name[i])) putchar(name[i]);