Try to ensure CSI indexes are built with valid parameters

daviesrob · whitwham · commit 0c6c30b14a3e · 2025-11-17T11:48:33.000Z
The genome range that a CSI index can cover is set by the combination of min_shift (the size of each bin) and n_lvls (the number of levels in the binning index, which sets the number of smallest bins present). The index code attempted to adjust n_lvls so that it's high enough to cover the range needed, however setting it to a value of ten or more resulted in a broken index because the resulting bin numbers overflow a 32-bit signed integer. Such an overflow could easily happen when min_shift was set to less than 10, and the file being indexed did not include reference lengths so the indexer used its default length of 100 Gbases (chosen to be bigger than any known reference sequence). This rewrites the n_lvls setting code so that the value chosen will never be higher than nine. If necessary, min_shift is adjusted instead to give the desired range, and if that happens a warning is printed as it's likely to have overridden a user setting. The code to do this is moved to hts.c so it can be called by all of the SAM/BAM, VCF/BCF and tabix indexers. For the case where there are no contig lengths, n_lvls is chosen to give an indexable length of at least 100G if min_shift >= 10, or otherwise n_lvls is set to the maximum allowed (9) to give the longest range permitted by the requested min_shift. This should work for all but the longest genomes; should the length limit be hit, indexing will fail and the user will see an error message suggesting they use a larger min_shift value (see hts_idx_check_range). Fixes #1966 (CSI access runtime issue with m=9)
diff --git a/hts.c b/hts.c
@@ -2364,6 +2364,39 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of
     return 0;
 }
 
+void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_)
+{
+    const int max_n_lvls = 9; // n_lvls >= 10 overflows 32-bit bin numbers
+    int min_shift = *min_shift_;
+    int n_lvls = *n_lvls_;
+    int64_t max_len = max_len_in + 256, maxpos; // same +256 margin as old code
+
+    // Prefer raising n_lvls; only touch min_shift if max_n_lvls is not enough
+    if (max_len <= hts_bin_maxpos(min_shift, max_n_lvls)) {
+        // Range fits within max_n_lvls: grow n_lvls until max_len is covered
+        maxpos = hts_bin_maxpos(min_shift, n_lvls);
+        while (max_len > maxpos) {
+            ++n_lvls;
+            maxpos *= 8; // one more level gives eight times the range
+        }
+        *n_lvls_ = n_lvls; // NOTE(review): incoming n_lvls > 9 is not clamped
+    } else {
+        // No room to change n_lvls - pin it at the cap and raise min_shift.
+        // min_shift was likely user-supplied so warn about the change too.
+        n_lvls = max_n_lvls;
+        maxpos = hts_bin_maxpos(min_shift, n_lvls);
+        while (max_len > maxpos) {
+            ++min_shift;
+            maxpos *= 2; // one more bit of min_shift doubles the range
+        }
+        hts_log_warning("Adjusted min_shift from %d to %d"
+                        " due to longest reference of %"PRId64" bases.",
+                        *min_shift_, min_shift, max_len_in);
+        *n_lvls_ = n_lvls;
+        *min_shift_ = min_shift;
+    }
+}
+
 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
 {
     hts_idx_t *idx;
diff --git a/hts_internal.h b/hts_internal.h
@@ -48,6 +48,19 @@ struct hts_json_token {
 
 struct cram_fd;
 
+/*
+ * Adjust CSI index parameters to support max_len_in bases
+ *
+ * @param max_len_in         Maximum position to be indexed
+ * @param min_shift_[in,out] min_shift parameter
+ * @param n_lvls_[in,out]    n_lvls parameter
+ *
+ * Adjusts *n_lvls_ (preferred) or *min_shift_ so that the resulting values
+ * can be passed to hts_idx_init(..., HTS_FMT_CSI, ...) in order to make an
+ * index that can store positions up to max_len_in bases.
+ */
+void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_);
+
 /*
  * Check the existence of a local index file using part of the alignment file name.
  * The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai
diff --git a/sam.c b/sam.c
@@ -995,13 +995,13 @@ static hts_idx_t *sam_index(htsFile *fp, int min_shift)
     h = sam_hdr_read(fp);
     if (h == NULL) return NULL;
     if (min_shift > 0) {
-        hts_pos_t max_len = 0, s;
+        hts_pos_t max_len = 0;
         for (i = 0; i < h->n_targets; ++i) {
             hts_pos_t len = sam_hdr_tid2len(h, i);
             if (max_len < len) max_len = len;
         }
-        max_len += 256;
-        for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
+        n_lvls = 0;
+        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
         fmt = HTS_FMT_CSI;
     } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
     idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
@@ -1093,13 +1093,12 @@ int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
         (fp->format.format == sam && fp->format.compression == bgzf)) {
         int n_lvls, fmt = HTS_FMT_CSI;
         if (min_shift > 0) {
-            int64_t max_len = 0, s;
+            int64_t max_len = 0;
             int i;
             for (i = 0; i < h->n_targets; ++i)
                 if (max_len < h->target_len[i]) max_len = h->target_len[i];
-            max_len += 256;
-            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
-
+            n_lvls = 0;
+            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
         } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
 
         fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
diff --git a/tbx.c b/tbx.c
@@ -434,16 +434,6 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
     if (*max_ref_len < len) *max_ref_len = len;
 }
 
-// Adjusts number of levels if not big enough.  This can happen for
-// files with very large contigs.
-static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
-{
-    int64_t s = hts_bin_maxpos(min_shift, n_lvls);
-    max_len += 256;
-    for (; max_len > s; ++n_lvls, s <<= 3) {}
-    return n_lvls;
-}
-
 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
 {
     tbx_t *tbx;
@@ -478,9 +468,19 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
         }
         if (first == 0) {
             if (fmt == HTS_FMT_CSI) {
-                if (!max_ref_len)
-                    max_ref_len = (int64_t)100*1024*1024*1024; // 100G default
-                n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
+                if (max_ref_len) {
+                    hts_adjust_csi_settings(max_ref_len, &min_shift, &n_lvls);
+                } else {
+                    // This will give a maximum reference length of at
+                    // least 100Gbases for min_shift >= 10, and the
+                    // maximum possible for min_shift < 10.
+                    const int max_n_lvls = 9; // To prevent bin number overflow
+                    n_lvls = (min_shift < 10
+                              ? max_n_lvls
+                              : (min_shift < 25
+                                 ? max_n_lvls - (min_shift - 10) / 3
+                                 : 4));
+                }
             }
             tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
             if (!tbx->idx) goto fail;
diff --git a/vcf.c b/vcf.c
@@ -4633,11 +4633,11 @@ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
 
 // Calculate number of index levels given min_shift and the header contig
 // list.  Also returns number of contigs in *nids_out.
-static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
+static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
                                int starting_n_lvls, int *nids_out)
 {
-    int n_lvls, i, nids = 0;
-    int64_t max_len = 0, s;
+    int n_lvls = starting_n_lvls, i, nids = 0;
+    int64_t max_len = 0;
 
     for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
     {
@@ -4647,9 +4647,8 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
         nids++;
     }
     if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
-    max_len += 256;
-    s = hts_bin_maxpos(min_shift, starting_n_lvls);
-    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
+
+    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
 
     if (nids_out) *nids_out = nids;
     return n_lvls;
@@ -4665,7 +4664,7 @@ hts_idx_t *bcf_index(htsFile *fp, int min_shift)
     h = bcf_hdr_read(fp);
     if ( !h ) return NULL;
     int nids = 0;
-    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
+    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
     idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
     if (!idx) goto fail;
     b = bcf_init1();
@@ -4765,7 +4764,7 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn
         // Set initial n_lvls to match tbx_index()
         int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
         // Increase if necessary
-        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
+        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
         fmt = HTS_FMT_CSI;
     }
 
@@ -4807,7 +4806,7 @@ int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
     if (!min_shift)
         min_shift = 14;
 
-    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
+    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
 
     fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
     if (!fp->idx) return -1;