@@ -71,10 +71,16 @@ typedef struct {
7171 uint8_t * block ;
7272 int64_t end_offset ;
7373} cache_t ;
74+
7475#include "htslib/khash.h"
7576KHASH_MAP_INIT_INT64 (cache , cache_t )
7677#endif
7778
79+ struct bgzf_cache_t {
80+ khash_t (cache ) * h ;
81+ khint_t last_pos ;
82+ };
83+
7884#ifdef BGZF_MT
7985
8086typedef struct bgzf_job {
@@ -215,7 +221,16 @@ static BGZF *bgzf_read_init(hFILE *hfpr)
215221 fp -> is_compressed = (n == 18 && magic [0 ]== 0x1f && magic [1 ]== 0x8b );
216222 fp -> is_gzip = ( !fp -> is_compressed || ((magic [3 ]& 4 ) && memcmp (& magic [12 ], "BC\2\0" ,4 )== 0 ) ) ? 0 : 1 ;
217223#ifdef BGZF_CACHE
218- fp -> cache = kh_init (cache );
224+ if (!(fp -> cache = malloc (sizeof (* fp -> cache )))) {
225+ free (fp );
226+ return NULL ;
227+ }
228+ if (!(fp -> cache -> h = kh_init (cache ))) {
229+ free (fp -> cache );
230+ free (fp );
231+ return NULL ;
232+ }
233+ fp -> cache -> last_pos = 0 ;
219234#endif
220235 return fp ;
221236}
@@ -524,27 +539,27 @@ static int check_header(const uint8_t *header)
524539static void free_cache (BGZF * fp )
525540{
526541 khint_t k ;
527- khash_t (cache ) * h = (khash_t (cache )* )fp -> cache ;
528542 if (fp -> is_write ) return ;
543+ khash_t (cache ) * h = fp -> cache -> h ;
529544 for (k = kh_begin (h ); k < kh_end (h ); ++ k )
530545 if (kh_exist (h , k )) free (kh_val (h , k ).block );
531546 kh_destroy (cache , h );
547+ free (fp -> cache );
532548}
533549
534550static int load_block_from_cache (BGZF * fp , int64_t block_address )
535551{
536552 khint_t k ;
537553 cache_t * p ;
538554
539- khash_t (cache ) * h = ( khash_t ( cache ) * ) fp -> cache ;
555+ khash_t (cache ) * h = fp -> cache -> h ;
540556 k = kh_get (cache , h , block_address );
541557 if (k == kh_end (h )) return 0 ;
542558 p = & kh_val (h , k );
543559 if (fp -> block_length != 0 ) fp -> block_offset = 0 ;
544560 fp -> block_address = block_address ;
545561 fp -> block_length = p -> size ;
546- // FIXME: why BGZF_MAX_BLOCK_SIZE and not p->size?
547- memcpy (fp -> uncompressed_block , p -> block , BGZF_MAX_BLOCK_SIZE );
562+ memcpy (fp -> uncompressed_block , p -> block , p -> size );
548563 if ( hseek (fp -> fp , p -> end_offset , SEEK_SET ) < 0 )
549564 {
550565 // todo: move the error up
@@ -557,29 +572,48 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address)
557572static void cache_block (BGZF * fp , int size )
558573{
559574 int ret ;
560- khint_t k ;
575+ khint_t k , k_orig ;
576+ uint8_t * block = NULL ;
561577 cache_t * p ;
562578 //fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address);
563- khash_t (cache ) * h = ( khash_t ( cache ) * ) fp -> cache ;
579+ khash_t (cache ) * h = fp -> cache -> h ;
564580 if (BGZF_MAX_BLOCK_SIZE >= fp -> cache_size ) return ;
581+ if (fp -> block_length < 0 || fp -> block_length > BGZF_MAX_BLOCK_SIZE ) return ;
565582 if ((kh_size (h ) + 1 ) * BGZF_MAX_BLOCK_SIZE > (uint32_t )fp -> cache_size ) {
566- /* A better way would be to remove the oldest block in the
567- * cache, but here we remove a random one for simplicity. This
568- * should not have a big impact on performance. */
569- for (k = kh_begin (h ); k < kh_end (h ); ++ k )
570- if (kh_exist (h , k )) break ;
571- if (k < kh_end (h )) {
572- free (kh_val (h , k ).block );
583+ /* Remove uniformly from any position in the hash by a simple
584+ * round-robin approach. An alternative strategy would be to
585+ * remove the least recently accessed block, but the round-robin
586+ * removal is simpler and is not expected to have a big impact
587+ * on performance */
588+ if (fp -> cache -> last_pos >= kh_end (h )) fp -> cache -> last_pos = kh_begin (h );
589+ k_orig = k = fp -> cache -> last_pos ;
590+ if (++ k >= kh_end (h )) k = kh_begin (h );
591+ while (k != k_orig ) {
592+ if (kh_exist (h , k ))
593+ break ;
594+ if (++ k == kh_end (h ))
595+ k = kh_begin (h );
596+ }
597+ fp -> cache -> last_pos = k ;
598+
599+ if (k != k_orig ) {
600+ block = kh_val (h , k ).block ;
573601 kh_del (cache , h , k );
574602 }
603+ } else {
604+ block = (uint8_t * )malloc (BGZF_MAX_BLOCK_SIZE );
575605 }
606+ if (!block ) return ;
576607 k = kh_put (cache , h , fp -> block_address , & ret );
577- if (ret == 0 ) return ; // if this happens, a bug!
608+ if (ret <= 0 ) { // kh_put failed, or in there already (shouldn't happen)
609+ free (block );
610+ return ;
611+ }
578612 p = & kh_val (h , k );
579613 p -> size = fp -> block_length ;
580614 p -> end_offset = fp -> block_address + size ;
581- p -> block = ( uint8_t * ) malloc ( BGZF_MAX_BLOCK_SIZE ) ;
582- memcpy (kh_val ( h , k ). block , fp -> uncompressed_block , BGZF_MAX_BLOCK_SIZE );
615+ p -> block = block ;
616+ memcpy (p -> block , fp -> uncompressed_block , p -> size );
583617}
584618#else
585619static void free_cache (BGZF * fp ) {}
@@ -1489,7 +1523,7 @@ int bgzf_close(BGZF* fp)
14891523
14901524void bgzf_set_cache_size (BGZF * fp , int cache_size )
14911525{
1492- if (fp ) fp -> cache_size = cache_size ;
1526+ if (fp && fp -> cache ) fp -> cache_size = cache_size ;
14931527}
14941528
14951529int bgzf_check_EOF (BGZF * fp ) {
0 commit comments