Skip to content

Commit 0c6c30b

Browse files
daviesrobwhitwham
authored andcommitted
Try to ensure CSI indexes are built with valid parameters
The genome range that a CSI index can cover is set by the combination of min_shift (the size of each bin) and n_lvls (the number of levels in the binning index, which sets the number of smallest bins present). The index code attempted to adjust n_lvls so that it's high enough to cover the range needed, however setting it to a value of ten or more resulted in a broken index because the resulting bin numbers overflow a 32-bit signed integer. Such an overflow could easily happen when min_shift was set less than 10, and the file being indexed did not include reference lengths so the indexer used its default length of 100 Gbases (chosen to be bigger than any known reference sequence). This rewrites the n_lvls setting code so that the value chosen will never be higher than nine. If necessary, min_shift is adjusted instead to give the desired range and if that happens a warning is printed as it's likely to have overridden a user setting. The code to do this is moved to hts.c so it can be called by all of the SAM/BAM, VCF/BCF and tabix indexers. For the case where there are no contig lengths, n_lvls is chosen to give an indexable length of at least 100G if min_shift >= 10, or otherwise n_lvls is set to the maximum allowed (9) to give the longest range permitted by the requested min_shift. This should work for all be the longest genomes; should the length limit be hit, indexing will fail and the user will see an error message suggesting they use a larger min_shift value (see hts_idx_check_range). Fixes #1966 (CSI access runtime issue with m=9)
1 parent 8594ac2 commit 0c6c30b

File tree

5 files changed

+73
-29
lines changed

5 files changed

+73
-29
lines changed

hts.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2364,6 +2364,39 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of
23642364
return 0;
23652365
}
23662366

2367+
void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_)
2368+
{
2369+
const int max_n_lvls = 9; // To prevent bin number overflow
2370+
int min_shift = *min_shift_;
2371+
int n_lvls = *n_lvls_;
2372+
int64_t max_len = max_len_in + 256, maxpos;
2373+
2374+
// Check if we need to adjust n_lvls or min_shift to get the range needed
2375+
if (max_len <= hts_bin_maxpos(min_shift, max_n_lvls)) {
2376+
// Can get required range by adjusting n_lvls
2377+
maxpos = hts_bin_maxpos(min_shift, n_lvls);
2378+
while (max_len > maxpos) {
2379+
++n_lvls;
2380+
maxpos *= 8;
2381+
}
2382+
*n_lvls_ = n_lvls;
2383+
} else {
2384+
// No room to change n_lvls - adjust min_shift instead
2385+
// This was likely user-supplied so warn about the change too.
2386+
n_lvls = max_n_lvls;
2387+
maxpos = hts_bin_maxpos(min_shift, n_lvls);
2388+
while (max_len > maxpos) {
2389+
++min_shift;
2390+
maxpos *= 2;
2391+
}
2392+
hts_log_warning("Adjusted min_shift from %d to %d"
2393+
" due to longest reference of %"PRId64" bases.",
2394+
*min_shift_, min_shift, max_len_in);
2395+
*n_lvls_ = n_lvls;
2396+
*min_shift_ = min_shift;
2397+
}
2398+
}
2399+
23672400
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
23682401
{
23692402
hts_idx_t *idx;

hts_internal.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,19 @@ struct hts_json_token {
4848

4949
struct cram_fd;
5050

51+
/*
52+
* Adjust CSI index parameters to support max_len_in bases
53+
*
54+
* @param max_len_in Maximum position to be indexed
55+
* @param min_shift_[in,out] min_shift parameter
56+
* @param n_lvls_[in,out] n_lvls parameter
57+
*
58+
* Adjusts *n_lvls_ (preferred) or *min_shift_ so that the resulting values
59+
* can be passed to hts_idx_init(, HTS_FMT_CSI, ...) in order to make an
60+
* index that can store positions up to max_len_in bases.
61+
*/
62+
void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_);
63+
5164
/*
5265
* Check the existence of a local index file using part of the alignment file name.
5366
* The order is alignment.bam.csi, alignment.csi, alignment.bam.bai, alignment.bai

sam.c

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -995,13 +995,13 @@ static hts_idx_t *sam_index(htsFile *fp, int min_shift)
995995
h = sam_hdr_read(fp);
996996
if (h == NULL) return NULL;
997997
if (min_shift > 0) {
998-
hts_pos_t max_len = 0, s;
998+
hts_pos_t max_len = 0;
999999
for (i = 0; i < h->n_targets; ++i) {
10001000
hts_pos_t len = sam_hdr_tid2len(h, i);
10011001
if (max_len < len) max_len = len;
10021002
}
1003-
max_len += 256;
1004-
for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1003+
n_lvls = 0;
1004+
hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
10051005
fmt = HTS_FMT_CSI;
10061006
} else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
10071007
idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
@@ -1093,13 +1093,12 @@ int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
10931093
(fp->format.format == sam && fp->format.compression == bgzf)) {
10941094
int n_lvls, fmt = HTS_FMT_CSI;
10951095
if (min_shift > 0) {
1096-
int64_t max_len = 0, s;
1096+
int64_t max_len = 0;
10971097
int i;
10981098
for (i = 0; i < h->n_targets; ++i)
10991099
if (max_len < h->target_len[i]) max_len = h->target_len[i];
1100-
max_len += 256;
1101-
for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1102-
1100+
n_lvls = 0;
1101+
hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
11031102
} else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
11041103

11051104
fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);

tbx.c

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -434,16 +434,6 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
434434
if (*max_ref_len < len) *max_ref_len = len;
435435
}
436436

437-
// Adjusts number of levels if not big enough. This can happen for
438-
// files with very large contigs.
439-
static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
440-
{
441-
int64_t s = hts_bin_maxpos(min_shift, n_lvls);
442-
max_len += 256;
443-
for (; max_len > s; ++n_lvls, s <<= 3) {}
444-
return n_lvls;
445-
}
446-
447437
tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
448438
{
449439
tbx_t *tbx;
@@ -478,9 +468,19 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
478468
}
479469
if (first == 0) {
480470
if (fmt == HTS_FMT_CSI) {
481-
if (!max_ref_len)
482-
max_ref_len = (int64_t)100*1024*1024*1024; // 100G default
483-
n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
471+
if (max_ref_len) {
472+
hts_adjust_csi_settings(max_ref_len, &min_shift, &n_lvls);
473+
} else {
474+
// This will give a maximum reference length of at
475+
// least 100Gbases for min_shift >= 10, and the
476+
// maximum possible for min_shift < 10.
477+
const int max_n_lvls = 9; // To prevent bin number overflow
478+
n_lvls = (min_shift < 10
479+
? max_n_lvls
480+
: (min_shift < 25
481+
? max_n_lvls - (min_shift - 10) / 3
482+
: 4));
483+
}
484484
}
485485
tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
486486
if (!tbx->idx) goto fail;

vcf.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4633,11 +4633,11 @@ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
46334633

46344634
// Calculate number of index levels given min_shift and the header contig
46354635
// list. Also returns number of contigs in *nids_out.
4636-
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4636+
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
46374637
int starting_n_lvls, int *nids_out)
46384638
{
4639-
int n_lvls, i, nids = 0;
4640-
int64_t max_len = 0, s;
4639+
int n_lvls = starting_n_lvls, i, nids = 0;
4640+
int64_t max_len = 0;
46414641

46424642
for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
46434643
{
@@ -4647,9 +4647,8 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
46474647
nids++;
46484648
}
46494649
if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken.
4650-
max_len += 256;
4651-
s = hts_bin_maxpos(min_shift, starting_n_lvls);
4652-
for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4650+
4651+
hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
46534652

46544653
if (nids_out) *nids_out = nids;
46554654
return n_lvls;
@@ -4665,7 +4664,7 @@ hts_idx_t *bcf_index(htsFile *fp, int min_shift)
46654664
h = bcf_hdr_read(fp);
46664665
if ( !h ) return NULL;
46674666
int nids = 0;
4668-
n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4667+
n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
46694668
idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
46704669
if (!idx) goto fail;
46714670
b = bcf_init1();
@@ -4765,7 +4764,7 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn
47654764
// Set initial n_lvls to match tbx_index()
47664765
int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
47674766
// Increase if necessary
4768-
n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4767+
n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
47694768
fmt = HTS_FMT_CSI;
47704769
}
47714770

@@ -4807,7 +4806,7 @@ int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
48074806
if (!min_shift)
48084807
min_shift = 14;
48094808

4810-
n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4809+
n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
48114810

48124811
fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
48134812
if (!fp->idx) return -1;

0 commit comments

Comments
 (0)