From 7ab76f963eea36b42e269f2f77a5a2861bc87587 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 3 Mar 2025 16:06:57 +0100 Subject: [PATCH 01/10] Make ConcurentCache inherit lru_cache. A ConcurrentCache is a specific lru_cache. --- src/concurrent_cache.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h index a2dd0bd2f..0f014e003 100644 --- a/src/concurrent_cache.h +++ b/src/concurrent_cache.h @@ -40,7 +40,7 @@ namespace zim available. */ template -class ConcurrentCache +class ConcurrentCache: private lru_cache> { private: // types typedef std::shared_future ValuePlaceholder; @@ -48,7 +48,7 @@ class ConcurrentCache public: // types explicit ConcurrentCache(size_t maxEntries) - : impl_(maxEntries) + : Impl(maxEntries) {} // Gets the entry corresponding to the given key. If the entry is not in the @@ -65,7 +65,7 @@ class ConcurrentCache { std::promise valuePromise; std::unique_lock l(lock_); - const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); + const auto x = Impl::getOrPut(key, valuePromise.get_future().share()); l.unlock(); if ( x.miss() ) { try { @@ -82,26 +82,25 @@ class ConcurrentCache bool drop(const Key& key) { std::unique_lock l(lock_); - return impl_.drop(key); + return Impl::drop(key); } size_t getMaxSize() const { std::unique_lock l(lock_); - return impl_.getMaxSize(); + return Impl::getMaxSize(); } size_t getCurrentSize() const { std::unique_lock l(lock_); - return impl_.size(); + return Impl::size(); } void setMaxSize(size_t newSize) { std::unique_lock l(lock_); - return impl_.setMaxSize(newSize); + return Impl::setMaxSize(newSize); } private: // data - Impl impl_; mutable std::mutex lock_; }; From a21ee25b6f5d31b887a883155709701c84df3032 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 7 Feb 2025 11:55:25 +0100 Subject: [PATCH 02/10] Make caches open to a cost of each item different than 1 At cache level, "size" is renamed to "cost", 
representing how much a cache can store (and how much an item cost). "size" is keep to count the number of items. At higher level, we keep the "size" sementics as we are speaking about the size of the cache, whatever this is. This is the first step to a cache limited by memory usage. --- src/concurrent_cache.h | 60 ++++++++++++++++++----- src/dirent_accessor.h | 8 +-- src/fileimpl.cpp | 6 +-- src/fileimpl.h | 3 +- src/lrucache.h | 108 +++++++++++++++++++++++++++++++++-------- test/lrucache.cpp | 97 +++++++++++++++++++++++++++++------- 6 files changed, 227 insertions(+), 55 deletions(-) diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h index 0f014e003..b000de744 100644 --- a/src/concurrent_cache.h +++ b/src/concurrent_cache.h @@ -23,6 +23,7 @@ #include "lrucache.h" +#include #include #include #include @@ -30,6 +31,30 @@ namespace zim { +template +struct FutureToValueCostEstimation { + template + static size_t cost(const std::shared_future& future) { + // The future is the value in the cache. + // When calling getOrPut, if the key is not in the cache, + // we add a future and then we compute the value and set the future. + // But lrucache call us when we add the future, meaning before we have + // computed the value. If we wait here (or use future.get), we will dead lock + // as we need to exit before setting the value. + // So in this case, we return 0. `ConcurrentCache::getOrPut` will correctly increase + // the current cache size when it have an actual value. + // We still need to compute the size of the value if the future has a value as it + // is also use to decrease the cache size when the value is drop. 
+ std::future_status status = future.wait_for(std::chrono::nanoseconds::zero()); + if (status == std::future_status::ready) { + return CostEstimation::cost(future.get()); + } else { + return 0; + } + } + +}; + /** ConcurrentCache implements a concurrent thread-safe cache @@ -39,16 +64,16 @@ namespace zim safe, and, in case of a cache miss, will block until that element becomes available. */ -template -class ConcurrentCache: private lru_cache> +template +class ConcurrentCache: private lru_cache, FutureToValueCostEstimation> { private: // types typedef std::shared_future ValuePlaceholder; - typedef lru_cache Impl; + typedef lru_cache> Impl; public: // types - explicit ConcurrentCache(size_t maxEntries) - : Impl(maxEntries) + explicit ConcurrentCache(size_t maxCost) + : Impl(maxCost) {} // Gets the entry corresponding to the given key. If the entry is not in the @@ -70,6 +95,19 @@ class ConcurrentCache: private lru_cache> if ( x.miss() ) { try { valuePromise.set_value(f()); + auto cost = CostEstimation::cost(x.value().get()); + // There is a small window when the valuePromise may be drop from lru cache after + // we set the value but before we increase the size of the cache. + // In this case decrease the size of `cost` before increasing it. + // First of all it should be pretty rare as we have just put the future in the cache so it + // should not be the least used item. + // If it happens, this should not be a problem if current_size is bigger than `cost` (most of the time) + // For the really rare specific case of current cach size being lower than `cost` (if possible), + // `decreaseCost` will clamp the new size to 0. 
+ { + std::unique_lock l(lock_); + Impl::increaseCost(cost); + } } catch (std::exception& e) { drop(key); throw; @@ -85,19 +123,19 @@ class ConcurrentCache: private lru_cache> return Impl::drop(key); } - size_t getMaxSize() const { + size_t getMaxCost() const { std::unique_lock l(lock_); - return Impl::getMaxSize(); + return Impl::getMaxCost(); } - size_t getCurrentSize() const { + size_t getCurrentCost() const { std::unique_lock l(lock_); - return Impl::size(); + return Impl::cost(); } - void setMaxSize(size_t newSize) { + void setMaxCost(size_t newSize) { std::unique_lock l(lock_); - return Impl::setMaxSize(newSize); + return Impl::setMaxCost(newSize); } private: // data diff --git a/src/dirent_accessor.h b/src/dirent_accessor.h index ef3953128..2156e1033 100644 --- a/src/dirent_accessor.h +++ b/src/dirent_accessor.h @@ -55,9 +55,9 @@ class LIBZIM_PRIVATE_API DirectDirentAccessor std::shared_ptr getDirent(entry_index_t idx) const; entry_index_t getDirentCount() const { return m_direntCount; } - size_t getMaxCacheSize() const { return m_direntCache.getMaxSize(); } - size_t getCurrentCacheSize() const { return m_direntCache.size(); } - void setMaxCacheSize(size_t nbDirents) const { m_direntCache.setMaxSize(nbDirents); } + size_t getMaxCacheSize() const { return m_direntCache.getMaxCost(); } + size_t getCurrentCacheSize() const { return m_direntCache.cost(); } + void setMaxCacheSize(size_t nbDirents) const { m_direntCache.setMaxCost(nbDirents); } private: // functions std::shared_ptr readDirent(offset_t) const; @@ -67,7 +67,7 @@ class LIBZIM_PRIVATE_API DirectDirentAccessor std::unique_ptr mp_pathPtrReader; entry_index_t m_direntCount; - mutable lru_cache> m_direntCache; + mutable lru_cache, UnitCostEstimation> m_direntCache; mutable std::mutex m_direntCacheLock; mutable std::vector m_bufferDirentZone; diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 3d63f3e8f..66f582e07 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -792,13 +792,13 @@ bool 
checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type size_t FileImpl::getClusterCacheMaxSize() const { - return clusterCache.getMaxSize(); + return clusterCache.getMaxCost(); } size_t FileImpl::getClusterCacheCurrentSize() const { - return clusterCache.getCurrentSize(); + return clusterCache.getCurrentCost(); } void FileImpl::setClusterCacheMaxSize(size_t nbClusters) { - clusterCache.setMaxSize(nbClusters); + clusterCache.setMaxCost(nbClusters); } size_t FileImpl::getDirentCacheMaxSize() const { diff --git a/src/fileimpl.h b/src/fileimpl.h index 45452fe37..9614e12cd 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -36,6 +36,7 @@ #include "file_reader.h" #include "file_compound.h" #include "fileheader.h" +#include "lrucache.h" #include "zim_types.h" #include "direntreader.h" @@ -55,7 +56,7 @@ namespace zim std::unique_ptr mp_titleDirentAccessor; typedef std::shared_ptr ClusterHandle; - ConcurrentCache clusterCache; + ConcurrentCache clusterCache; const bool m_hasFrontArticlesIndex; const entry_index_t m_startUserEntry; diff --git a/src/lrucache.h b/src/lrucache.h index 03a3644e6..f3a0b1686 100644 --- a/src/lrucache.h +++ b/src/lrucache.h @@ -43,10 +43,38 @@ #include #include #include +#include namespace zim { -template +struct UnitCostEstimation { + template + static size_t cost(const value_t& value) { + return 1; + } +}; + +/** + * A lru cache where the cost of each item can be different than 1. + * + * Most lru cache is limited by the number of items stored. + * This implementation may have a different "size" per item, so the current size of + * this lru is not the number of item but the sum of all items' size. + * + * This implementation used is pretty simple (dumb) and have few limitations: + * - We consider than size of a item do not change over time. Especially the size of a + * item when we put it MUST be equal to the size of the same item when we drop it. 
+ * - Cache eviction is still a Least Recently Used (LRU), so we drop the least used item(s) util + * we have enough space. No other consideration is used to select which item to drop. + * + * This lru is parametrized by a CostEstimation type. The type must have a static method `cost` + * taking a reference to a `value_t` and returing its "cost". As already said, this method must + * always return the same cost for the same value. + * + * While cost could be any kind of value, this implemention is intended to be used only with + * `UnitCostEstimation` (classic lru) and `FutureToValueCostEstimation`. + */ +template class lru_cache { public: // types typedef typename std::pair key_value_pair_t; @@ -81,9 +109,10 @@ class lru_cache { }; public: // functions - explicit lru_cache(size_t max_size) : - _max_size(max_size) { - } + explicit lru_cache(size_t max_cost) : + _max_cost(max_cost), + _current_cost(0) + {} // If 'key' is present in the cache, returns the associated value, // otherwise puts the given value into the cache (and returns it with @@ -103,6 +132,8 @@ class lru_cache { auto it = _cache_items_map.find(key); if (it != _cache_items_map.end()) { _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + decreaseCost(CostEstimation::cost(it->second->second)); + increaseCost(CostEstimation::cost(value)); it->second->second = value; } else { putMissing(key, value); @@ -120,37 +151,72 @@ class lru_cache { } bool drop(const key_t& key) { + list_iterator_t list_it; try { - auto list_it = _cache_items_map.at(key); - _cache_items_list.erase(list_it); - _cache_items_map.erase(key); - return true; + list_it = _cache_items_map.at(key); } catch (std::out_of_range& e) { return false; } + decreaseCost(CostEstimation::cost(list_it->second)); + _cache_items_list.erase(list_it); + _cache_items_map.erase(key); + return true; } bool exists(const key_t& key) const { return _cache_items_map.find(key) != _cache_items_map.end(); } - size_t size() const { - 
return _cache_items_map.size(); + size_t cost() const { + return _current_cost; } - size_t getMaxSize() const { - return _max_size; + size_t getMaxCost() const { + return _max_cost; } - void setMaxSize(size_t newSize) { - while (newSize < this->size()) { + void setMaxCost(size_t newMaxCost) { + while (newMaxCost < this->cost()) { dropLast(); } - _max_size = newSize; + _max_cost = newMaxCost; + } + + protected: + + void increaseCost(size_t extra_cost) { + // increaseSize is called after we have added a value to the cache to update + // the size of the current cache. + // We must ensure that we don't drop the value we just added. + // While it is technically ok to keep no value if max cache size is 0 (or memory size < of the size of one cluster) + // it will make recreate the value all the time. + // Let's be nice with our user and be tolerent to misconfiguration. + if (!extra_cost) { + // Don't try to remove an item if we have new size == 0. + // This is the case when concurent cache add a future without value. + // We will handle the real increase size when concurent cache will directly call us. 
+ return; + } + _current_cost += extra_cost; + while (_current_cost > _max_cost && size() > 1) { + dropLast(); + } + } + + void decreaseCost(size_t costToRemove) { + if (costToRemove > _current_cost) { + std::cerr << "WARNING: We have detected inconsistant cache management, trying to remove " << costToRemove << " from a cache with size " << _current_cost << std::endl; + std::cerr << "Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use" << std::endl; + _current_cost = 0; + } else { + _current_cost -= costToRemove; + } } private: // functions void dropLast() { + auto list_it = _cache_items_list.back(); + decreaseCost(CostEstimation::cost(list_it.second)); _cache_items_map.erase(_cache_items_list.back().first); _cache_items_list.pop_back(); } @@ -159,15 +225,19 @@ class lru_cache { assert(_cache_items_map.find(key) == _cache_items_map.end()); _cache_items_list.push_front(key_value_pair_t(key, value)); _cache_items_map[key] = _cache_items_list.begin(); - if (_cache_items_map.size() > _max_size) { - dropLast(); - } + increaseCost(CostEstimation::cost(value)); + } + + size_t size() const { + return _cache_items_map.size(); } + private: // data std::list _cache_items_list; std::map _cache_items_map; - size_t _max_size; + size_t _max_cost; + size_t _current_cost; }; } // namespace zim diff --git a/test/lrucache.cpp b/test/lrucache.cpp index 67dba943a..e43b914c5 100644 --- a/test/lrucache.cpp +++ b/test/lrucache.cpp @@ -38,47 +38,110 @@ const unsigned int TEST2_CACHE_CAPACITY = 50u; const unsigned int TEST2_CACHE_CAPACITY_SMALL = 10u; TEST(CacheTest, SimplePut) { - zim::lru_cache cache_lru(1); + zim::lru_cache cache_lru(1); cache_lru.put(7, 777); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(777, cache_lru.get(7)); - EXPECT_EQ(1u, cache_lru.size()); + EXPECT_EQ(1u, cache_lru.cost()); } TEST(CacheTest, OverwritingPut) { - zim::lru_cache cache_lru(1); + zim::lru_cache cache_lru(1); cache_lru.put(7, 777); cache_lru.put(7, 
222); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(222, cache_lru.get(7)); - EXPECT_EQ(1u, cache_lru.size()); + EXPECT_EQ(1u, cache_lru.cost()); } TEST(CacheTest, MissingValue) { - zim::lru_cache cache_lru(1); + zim::lru_cache cache_lru(1); EXPECT_TRUE(cache_lru.get(7).miss()); EXPECT_FALSE(cache_lru.get(7).hit()); EXPECT_THROW(cache_lru.get(7).value(), std::range_error); } TEST(CacheTest, DropValue) { - zim::lru_cache cache_lru(3); + zim::lru_cache cache_lru(3); cache_lru.put(7, 777); cache_lru.put(8, 888); cache_lru.put(9, 999); - EXPECT_EQ(3u, cache_lru.size()); + EXPECT_EQ(3u, cache_lru.cost()); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(777, cache_lru.get(7)); EXPECT_TRUE(cache_lru.drop(7)); - EXPECT_EQ(2u, cache_lru.size()); + EXPECT_EQ(2u, cache_lru.cost()); EXPECT_FALSE(cache_lru.exists(7)); EXPECT_THROW(cache_lru.get(7).value(), std::range_error); EXPECT_FALSE(cache_lru.drop(7)); } +struct IdCost { + static size_t cost(size_t value ) { + return value; + } +}; + +TEST(CacheTest, VariableCost) { + zim::lru_cache cache_lru(100); + + cache_lru.put(1, 11); + cache_lru.put(2, 22); + cache_lru.put(3, 33); + EXPECT_EQ(66u, cache_lru.cost()); + + cache_lru.put(4, 44); + EXPECT_EQ(99u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_TRUE(cache_lru.exists(2)); + EXPECT_TRUE(cache_lru.exists(3)); + EXPECT_TRUE(cache_lru.exists(4)); + + cache_lru.put(5, 55); + EXPECT_EQ(99u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); + EXPECT_TRUE(cache_lru.exists(4)); + EXPECT_TRUE(cache_lru.exists(5)); + + cache_lru.put(1, 11); + EXPECT_EQ(66u, cache_lru.cost()); + EXPECT_TRUE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); + EXPECT_FALSE(cache_lru.exists(4)); + EXPECT_TRUE(cache_lru.exists(5)); +} + +TEST(CacheTest, TooBigValue) { + zim::lru_cache cache_lru(10); + + cache_lru.put(1, 11); + EXPECT_EQ(11u, cache_lru.cost()); + 
EXPECT_TRUE(cache_lru.exists(1)); + + cache_lru.put(2, 22); + EXPECT_EQ(22u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_TRUE(cache_lru.exists(2)); + + cache_lru.put(3, 33); + EXPECT_EQ(33u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_TRUE(cache_lru.exists(3)); + + cache_lru.put(1, 11); + EXPECT_EQ(11u, cache_lru.cost()); + EXPECT_TRUE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); +} + #define EXPECT_RANGE_MISSING_FROM_CACHE(CACHE, START, END) \ for (unsigned i = START; i < END; ++i) { \ EXPECT_FALSE(CACHE.exists(i)); \ @@ -93,7 +156,7 @@ for (unsigned i = START; i < END; ++i) { \ TEST(CacheTest1, KeepsAllValuesWithinCapacity) { - zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); + zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, i); @@ -103,37 +166,37 @@ TEST(CacheTest1, KeepsAllValuesWithinCapacity) { EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1) - size_t size = cache_lru.size(); + size_t size = cache_lru.cost(); EXPECT_EQ(TEST2_CACHE_CAPACITY, size); } TEST(CacheTest1, ChangeCacheCapacity) { - zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); + zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, i); } - EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.size()); + EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.cost()); EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1) - cache_lru.setMaxSize(TEST2_CACHE_CAPACITY_SMALL); - EXPECT_EQ(TEST2_CACHE_CAPACITY_SMALL, cache_lru.size()); + cache_lru.setMaxCost(TEST2_CACHE_CAPACITY_SMALL); + EXPECT_EQ(TEST2_CACHE_CAPACITY_SMALL, cache_lru.cost()); 
EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY_SMALL)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY_SMALL), NUM_OF_TEST2_RECORDS, 1) - cache_lru.setMaxSize(TEST2_CACHE_CAPACITY); + cache_lru.setMaxCost(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, 1000*i); } - EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.size()); + EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.cost()); EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1000) } TEST(ConcurrentCacheTest, handleException) { - zim::ConcurrentCache cache(1); + zim::ConcurrentCache cache(1); auto val = cache.getOrPut(7, []() { return 777; }); EXPECT_EQ(val, 777); EXPECT_THROW(cache.getOrPut(8, []() { throw std::runtime_error("oups"); return 0; }), std::runtime_error); From 71a496f4ef9d8fcea587e28bd73e3c5706aa7011 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 11 Feb 2025 10:24:27 +0100 Subject: [PATCH 03/10] Add `getMemorySize` on `IStreamReader` and `Reader`. We need to know the size of our input reader. Especially for compression stream where we need to know the size of the compression state. 
--- src/buffer_reader.cpp | 5 +++++ src/buffer_reader.h | 1 + src/compression.cpp | 13 +++++++++++++ src/compression.h | 2 ++ src/decoderstreamreader.h | 5 +++++ src/file_reader.h | 1 + src/istreamreader.h | 3 +++ src/rawstreamreader.h | 4 ++++ src/reader.h | 1 + test/istreamreader.cpp | 15 +++++++++++---- 10 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/buffer_reader.cpp b/src/buffer_reader.cpp index 3b64ca137..0d08518a6 100644 --- a/src/buffer_reader.cpp +++ b/src/buffer_reader.cpp @@ -44,6 +44,11 @@ zsize_t BufferReader::size() const return source.size(); } +size_t BufferReader::getMemorySize() const +{ + return source.size().v; +} + offset_t BufferReader::offset() const { return offset_t((offset_type)(static_cast(source.data(offset_t(0))))); diff --git a/src/buffer_reader.h b/src/buffer_reader.h index f0972c375..27a128fcd 100644 --- a/src/buffer_reader.h +++ b/src/buffer_reader.h @@ -31,6 +31,7 @@ class LIBZIM_PRIVATE_API BufferReader : public Reader { virtual ~BufferReader() {}; zsize_t size() const override; + size_t getMemorySize() const override; offset_t offset() const override; const Buffer get_buffer(offset_t offset, zsize_t size) const override; diff --git a/src/compression.cpp b/src/compression.cpp index f5d3352c8..c3359b9ba 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -60,6 +60,11 @@ void LZMA_INFO::stream_end_decode(stream_t* stream) lzma_end(stream); } +size_t LZMA_INFO::state_size(const stream_t& stream) +{ + return lzma_memusage(&stream); +} + const std::string ZSTD_INFO::name = "zstd"; @@ -170,3 +175,11 @@ void ZSTD_INFO::stream_end_decode(stream_t* stream) void ZSTD_INFO::stream_end_encode(stream_t* stream) { } + +size_t ZSTD_INFO::state_size(const stream_t& stream) { + if (stream.decoder_stream) { + return ZSTD_sizeof_DStream(stream.decoder_stream); + } else { + return ZSTD_sizeof_CStream(stream.encoder_stream); + } +} diff --git a/src/compression.h b/src/compression.h index 4daba33b5..b27a86531 100644 --- 
a/src/compression.h +++ b/src/compression.h @@ -65,6 +65,7 @@ struct LZMA_INFO { static CompStatus stream_run_decode(stream_t* stream, CompStep step); static CompStatus stream_run(stream_t* stream, CompStep step); static void stream_end_decode(stream_t* stream); + static size_t state_size(const stream_t& stream); }; @@ -94,6 +95,7 @@ struct LIBZIM_PRIVATE_API ZSTD_INFO { static CompStatus stream_run_decode(stream_t* stream, CompStep step); static void stream_end_encode(stream_t* stream); static void stream_end_decode(stream_t* stream); + static size_t state_size(const stream_t& stream); }; diff --git a/src/decoderstreamreader.h b/src/decoderstreamreader.h index d48582b7e..43f2b6256 100644 --- a/src/decoderstreamreader.h +++ b/src/decoderstreamreader.h @@ -23,6 +23,7 @@ #include "compression.h" #include "istreamreader.h" +#include namespace zim { @@ -49,6 +50,10 @@ class DecoderStreamReader : public IStreamReader Decoder::stream_end_decode(&m_decoderState); } + size_t getMemorySize() const override { + return m_encodedDataReader->getMemorySize() + m_encodedDataChunk.size().v + Decoder::state_size(m_decoderState); + } + private: // functions void readNextChunk() { diff --git a/src/file_reader.h b/src/file_reader.h index 817e24c31..b247f21cb 100644 --- a/src/file_reader.h +++ b/src/file_reader.h @@ -34,6 +34,7 @@ class LIBZIM_PRIVATE_API BaseFileReader : public Reader { : _offset(offset), _size(size) {} ~BaseFileReader() = default; zsize_t size() const override { return _size; }; + size_t getMemorySize() const override { return 0; }; offset_t offset() const override { return _offset; }; virtual const Buffer get_mmap_buffer(offset_t offset, diff --git a/src/istreamreader.h b/src/istreamreader.h index a0a5349b2..d2c937974 100644 --- a/src/istreamreader.h +++ b/src/istreamreader.h @@ -59,6 +59,9 @@ class LIBZIM_PRIVATE_API IStreamReader // Reads a blob of the specified size from the stream virtual std::unique_ptr sub_reader(zsize_t size); + // Get the total size occuped 
by the reader + virtual size_t getMemorySize() const = 0; + private: // virtual methods // Reads exactly 'nbytes' bytes into the provided buffer 'buf' // (which must be at least that big). Throws an exception if diff --git a/src/rawstreamreader.h b/src/rawstreamreader.h index 504f67e63..36c30f38c 100644 --- a/src/rawstreamreader.h +++ b/src/rawstreamreader.h @@ -35,6 +35,10 @@ class RawStreamReader : public IStreamReader m_readerPos(0) {} + size_t getMemorySize() const override { + return m_reader->getMemorySize(); + } + void readImpl(char* buf, zsize_t nbytes) override { m_reader->read(buf, m_readerPos, zsize_t(nbytes)); diff --git a/src/reader.h b/src/reader.h index 8a4dc655c..5fdf7de6b 100644 --- a/src/reader.h +++ b/src/reader.h @@ -36,6 +36,7 @@ class LIBZIM_PRIVATE_API Reader { public: Reader() {}; virtual zsize_t size() const = 0; + virtual size_t getMemorySize() const = 0; virtual ~Reader() {}; void read(char* dest, offset_t offset, zsize_t size) const { diff --git a/test/istreamreader.cpp b/test/istreamreader.cpp index a96b59156..78b2f1135 100644 --- a/test/istreamreader.cpp +++ b/test/istreamreader.cpp @@ -34,18 +34,25 @@ using namespace zim; // Implement the IStreamReader interface in the simplest way class InfiniteZeroStream : public IStreamReader { - void readImpl(char* buf, zim::zsize_t nbytes) { memset(buf, 0, nbytes.v); } + void readImpl(char* buf, zim::zsize_t nbytes) override { memset(buf, 0, nbytes.v); } + size_t getMemorySize() const override { + return 0; + } }; class InfiniteIncreasingStream: public IStreamReader { zim::offset_type current_offset = 0; - void readImpl(char* buf, zim::zsize_t nbytes) { + void readImpl(char* buf, zim::zsize_t nbytes) override { for (size_type i=0; iget_buffer(zim::offset_t(0), zim::zsize_t(N)); EXPECT_EQ(buffer.size().v, N); EXPECT_EQ(0, memcmp(buffer.data(), refbuf, N)); - + buffer = subReader->get_buffer(zim::offset_t(5), zim::zsize_t(N-5)); EXPECT_EQ(buffer.size().v, N-5); EXPECT_EQ(0, memcmp(buffer.data(), 
refbuf+5, N-5)); From 1add1d3dc7a9b84f8cdfc2f14902c6a7564d449f Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 11 Feb 2025 10:25:44 +0100 Subject: [PATCH 04/10] Limit the cluster cache by memory instead of numbers of items. --- include/zim/archive.h | 8 +++---- meson_options.txt | 4 ++-- src/archive.cpp | 4 ++-- src/cluster.cpp | 45 ++++++++++++++++++++++++++++++++++--- src/cluster.h | 10 +++++++++ src/fileimpl.cpp | 14 ++++++------ src/fileimpl.h | 2 +- test/archive.cpp | 52 ++++++++++++++++++++++++++++--------------- 8 files changed, 102 insertions(+), 37 deletions(-) diff --git a/include/zim/archive.h b/include/zim/archive.h index a5985d1d9..cad5e66ff 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -536,13 +536,13 @@ namespace zim /** Get the maximum size of the cluster cache. * - * @return The maximum number of clusters stored in the cache. + * @return The maximum memory size used the cluster cache. */ size_t getClusterCacheMaxSize() const; /** Get the current size of the cluster cache. * - * @return The number of clusters currently stored in the cache. + * @return The current memory size used by the cluster cache. */ size_t getClusterCacheCurrentSize() const; @@ -551,9 +551,9 @@ namespace zim * If the new size is lower than the number of currently stored clusters * some clusters will be dropped from cache to respect the new size. * - * @param nbClusters The maximum number of clusters stored in the cache. + * @param sizeInB The memory limit (in bytes) for the cluster cache. */ - void setClusterCacheMaxSize(size_t nbClusters); + void setClusterCacheMaxSize(size_t sizeInB); /** Get the size of the dirent cache. 
* diff --git a/meson_options.txt b/meson_options.txt index e23118f6e..6c6d4f021 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1,5 +1,5 @@ -option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', - description : 'set cluster cache size to number (default:16)') +option('CLUSTER_CACHE_SIZE', type : 'string', value : '67108864', + description : 'set cluster cache size to number (default:64MB)') option('DIRENT_CACHE_SIZE', type : 'string', value : '512', description : 'set dirent cache size to number (default:512)') option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', diff --git a/src/archive.cpp b/src/archive.cpp index e370a9bf2..e39fb1331 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -514,9 +514,9 @@ namespace zim return m_impl->getClusterCacheCurrentSize(); } - void Archive::setClusterCacheMaxSize(size_t nbClusters) + void Archive::setClusterCacheMaxSize(size_t sizeInB) { - m_impl->setClusterCacheMaxSize(nbClusters); + m_impl->setClusterCacheMaxSize(sizeInB); } size_t Archive::getDirentCacheMaxSize() const diff --git a/src/cluster.cpp b/src/cluster.cpp index 214af0e38..b587b8a19 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -31,8 +31,6 @@ #include "log.h" -#include "config.h" - log_define("zim.cluster") #define log_debug1(e) @@ -86,7 +84,8 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* Cluster::Cluster(std::unique_ptr reader_, Compression comp, bool isExtended) : compression(comp), isExtended(isExtended), - m_reader(std::move(reader_)) + m_reader(std::move(reader_)), + m_memorySize(0) { if (isExtended) { read_header(); @@ -179,4 +178,44 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* } } + // This function must return a constant size for a given cluster. + // This is important as we want to remove the same size that what we add when we remove + // the cluster from the cache. 
+ // However, because of partial decompression, this size can change: + // - As we advance in the compression, we can create new blob readers in `m_blobReaders` + // - The stream itself may allocate memory. + // To solve this, we take the average and say a cluster's blob readers will half be created and + // so we assume a readers size of half the full uncompressed cluster data size. + // If cluster is not compressed, we never store its content (mmap is created on demand and not cached), + // so we use a size of 0 for the readers. + // It also appears that when we get the size of the stream, we reach a state where no + // futher allocation will be done by it. Probably because: + // - We already started to decompress the stream to read the offsets + // - Cluster data size is smaller than window size associated to compression level (?) + // We anyway check that and print a warning if this is not the case, hopping that user will create + // an issue allowing us for further analysis. + // Note: + // - No need to protect this method from concurent access as it will be called by the concurent_cache which will + // have a lock (on lru cache) to ensure only one thread access it in the same time. + size_t Cluster::getMemorySize() const { + if (!m_memorySize) { + auto offsets_size = sizeof(offset_t) * m_blobOffsets.size(); + auto readers_size = 0; + if (isCompressed()) { + readers_size = m_blobOffsets.back().v / 2; + } + m_streamSize = m_reader->getMemorySize(); + // Compression level define a huge window and make decompression stream allocate a huge memory to store it. + // However, the used memory will not be greater than the content itself, even if window is bigger. + // On linux (at least), the real used memory will be the actual memory used, not the one allocated. + // So, let's clamm the the stream size to the size of the content itself. 
+ m_memorySize = offsets_size + readers_size + std::min(m_streamSize, m_blobOffsets.back().v); + } + auto streamSize = m_reader->getMemorySize(); + if (streamSize != m_streamSize) { + std::cerr << "WARNING: stream size have changed from " << m_streamSize << " to " << streamSize << std::endl; + std::cerr << "Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use" << std::endl; + } + return m_memorySize; + } } diff --git a/src/cluster.h b/src/cluster.h index db1be37c1..0c1a6b553 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -70,6 +70,8 @@ namespace zim mutable std::mutex m_readerAccessMutex; mutable BlobReaders m_blobReaders; + mutable size_t m_memorySize; + mutable size_t m_streamSize; template @@ -90,9 +92,17 @@ namespace zim Blob getBlob(blob_index_t n) const; Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; + size_t getMemorySize() const; + static std::shared_ptr read(const Reader& zimReader, offset_t clusterOffset); }; + struct ClusterMemorySize { + static size_t cost(const std::shared_ptr& cluster) { + return cluster->getMemorySize(); + } + }; + } #endif // ZIM_CLUSTER_H diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 66f582e07..aacf8153a 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -566,11 +566,11 @@ class Grouping struct zim_MD5_CTX md5ctx; zim_MD5Init(&md5ctx); - + unsigned char ch[CHUNK_SIZE]; offset_type checksumPos = header.getChecksumPos(); offset_type toRead = checksumPos; - + for(auto part = zimFile->begin(); part != zimFile->end(); part++) { @@ -580,7 +580,7 @@ class Grouping zim_MD5Update(&md5ctx, ch, CHUNK_SIZE); toRead-=CHUNK_SIZE; } - + // Previous read was good, so we have exited the previous `while` because // `toRead(ch),toRead); } - + // It updates the checksum with the remaining amount of data when we // reach the end of the file or part zim_MD5Update(&md5ctx, ch, stream.gcount()); toRead-=stream.gcount(); - + if (stream.bad()) { perror("error while 
reading file"); return false; @@ -797,8 +797,8 @@ bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type size_t FileImpl::getClusterCacheCurrentSize() const { return clusterCache.getCurrentCost(); } - void FileImpl::setClusterCacheMaxSize(size_t nbClusters) { - clusterCache.setMaxCost(nbClusters); + void FileImpl::setClusterCacheMaxSize(size_t sizeInB) { + clusterCache.setMaxCost(sizeInB); } size_t FileImpl::getDirentCacheMaxSize() const { diff --git a/src/fileimpl.h b/src/fileimpl.h index 9614e12cd..c31ce289d 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -56,7 +56,7 @@ namespace zim std::unique_ptr mp_titleDirentAccessor; typedef std::shared_ptr ClusterHandle; - ConcurrentCache clusterCache; + ConcurrentCache clusterCache; const bool m_hasFrontArticlesIndex; const entry_index_t m_startUserEntry; diff --git a/test/archive.cpp b/test/archive.cpp index 4d7c63ff1..da6edc0d5 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -318,16 +318,16 @@ TEST(ZimArchive, cacheDontImpactReading) { const TestCacheConfig cacheConfigs[] = { {0, 0, 0}, - {1, 1, 1}, - {2, 2, 2}, - {10, 10, 10}, - {1000, 2000, 1000}, - {0, 2000, 1000}, - {1000, 0, 1000}, - {1000, 2000, 0}, - {1, 2000, 1000}, - {1000, 1, 1000}, - {1000, 2000, 1}, + {1, 1<<20, 1}, + {2, 2<<20, 2}, + {10, 10<<20, 10}, + {1000, 2000<<20, 1000}, + {0, 2000<<20, 1000}, + {1000, 0<<20, 1000}, + {1000, 2000<<20, 0}, + {1, 2000<<20, 1000}, + {1000, 1<<20, 1000}, + {1000, 2000<<20, 1}, }; for (auto& testfile: getDataFilePath("small.zim")) { @@ -351,32 +351,48 @@ TEST(ZimArchive, cacheDontImpactReading) TEST(ZimArchive, cacheChange) { - for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) { + // We test only one variant here. 
+ // Each variant has cluster of different size (especially the old "withns" which + // have a cluster compressed with a algorithm/compression level making input stream + // having a size of 64MB), + // this make all the following reasoning about cluster size a bit too complex. + // As the test here don't test that we can read all variant, we don't have too. + for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", {"noTitleListingV0"})) { + // wikibooks has only 2 clusters. One of 492121 bytes and one of 823716 bytes. + // For a total of 1315837 bytes. + // Has we try to keep one cluster in the cache, any size under the size of one + // cluster will not be respected. + // So we will define 2 limits: + // 850<<10 : size higher than a cluster size but under 2 + // 2 << 20 : size higher than two clusters + const size_t L1_SIZE = 850 << 10; + const size_t L2_SIZE = 2 << 20; + auto ref_archive = zim::Archive(testfile.path); auto archive = zim::Archive(testfile.path); archive.setDirentCacheMaxSize(30); - archive.setClusterCacheMaxSize(5); + archive.setClusterCacheMaxSize(L2_SIZE); auto range = ref_archive.iterEfficient(); auto ref_it = range.begin(); ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) EXPECT_EQ(archive.getDirentCacheCurrentSize(), 30); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 2); // Only 2 clusters in the file + EXPECT_LE(archive.getClusterCacheCurrentSize(), L2_SIZE); // Only 2 clusters in the file // Reduce cache size archive.setDirentCacheMaxSize(10); - archive.setClusterCacheMaxSize(1); + archive.setClusterCacheMaxSize(L1_SIZE); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); // We want to test change of cache while we are iterating on the archive. // So we don't reset the ref_it to `range.begin()`. 
ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); // Clean cache // (More than testing the value, this is needed as we want to be sure the cache is actually populated later) @@ -388,13 +404,13 @@ TEST(ZimArchive, cacheChange) // Increase the cache archive.setDirentCacheMaxSize(20); - archive.setClusterCacheMaxSize(1); + archive.setClusterCacheMaxSize(L1_SIZE); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); ASSERT_ARCHIVE_EQUIVALENT(ref_archive, archive) EXPECT_EQ(archive.getDirentCacheCurrentSize(), 20); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); } } From dc82b88451e3a56f6bb46905cfed3a2040655eca Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 14 Feb 2025 14:16:53 +0100 Subject: [PATCH 05/10] Drop all zim clusters from cache at zim file destruction. This is useless now as we destroy the cache itself just after but it will be useful when cache will be global. 
--- src/concurrent_cache.h | 6 ++++++ src/fileimpl.cpp | 5 +++++ src/fileimpl.h | 1 + src/lrucache.h | 16 ++++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h index b000de744..f0dc4764f 100644 --- a/src/concurrent_cache.h +++ b/src/concurrent_cache.h @@ -123,6 +123,12 @@ class ConcurrentCache: private lru_cache, FutureT return Impl::drop(key); } + template + void dropAll(F f) { + std::unique_lock l(lock_); + Impl::dropAll(f); + } + size_t getMaxCost() const { std::unique_lock l(lock_); return Impl::getMaxCost(); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index aacf8153a..93be54566 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -263,6 +263,11 @@ class Grouping readMimeTypes(); } + FileImpl::~FileImpl() { + // We have to clean the global cache for our clusters. + clusterCache.dropAll([=](const cluster_index_type key) {return true;}); + } + std::unique_ptr FileImpl::getTitleAccessorV1(const entry_index_t idx) { auto dirent = mp_pathDirentAccessor->getDirent(idx); diff --git a/src/fileimpl.h b/src/fileimpl.h index c31ce289d..cf50790fa 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -107,6 +107,7 @@ namespace zim explicit FileImpl(FdInput fd); explicit FileImpl(const std::vector& fds); #endif + ~FileImpl(); time_t getMTime() const; diff --git a/src/lrucache.h b/src/lrucache.h index f3a0b1686..7b768cf32 100644 --- a/src/lrucache.h +++ b/src/lrucache.h @@ -43,6 +43,7 @@ #include #include #include +#include #include namespace zim { @@ -163,6 +164,21 @@ class lru_cache { return true; } + template + void dropAll(F f) { + std::vector keys_to_drop; + for (auto key_iter:_cache_items_map) { + key_t key = key_iter.first; + if (f(key)) { + keys_to_drop.push_back(key); + } + } + + for(auto key:keys_to_drop) { + drop(key); + } + } + bool exists(const key_t& key) const { return _cache_items_map.find(key) != _cache_items_map.end(); } From a5032d2f3441461811ba735a8544e8f2b4880ef9 Mon Sep 17 00:00:00 2001 
From: Matthieu Gautier Date: Mon, 17 Feb 2025 16:57:04 +0100 Subject: [PATCH 06/10] Test archive against a preloaded content. Cluster cache will be global. So we cannot test content againt a ref_archive as getting the data from it will also be impacted by cache configuration. So we "preload" all the content first and then to the test. --- test/archive.cpp | 81 ++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/test/archive.cpp b/test/archive.cpp index da6edc0d5..48c922c25 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -286,33 +286,49 @@ struct TestCacheConfig { size_t direntLookupCacheSize; }; +struct RefEntry { + void test_is_equal(const zim::Entry& entry) { + ASSERT_EQ(path, entry.getPath()); + ASSERT_EQ(title, entry.getTitle()); + ASSERT_EQ(isRedirect, entry.isRedirect()); + if (isRedirect) { + zim::entry_index_type redirectId = redirect_or_hash; + ASSERT_EQ(redirectId, entry.getRedirectEntryIndex()); + } else { + auto hash = std::hash{}(std::string(entry.getItem().getData())); + ASSERT_EQ(redirect_or_hash, hash); + } + } -#define ASSERT_ARCHIVE_EQUIVALENT(REF_ARCHIVE, TEST_ARCHIVE) \ - ASSERT_ARCHIVE_EQUIVALENT_LIMIT(REF_ARCHIVE, TEST_ARCHIVE, REF_ARCHIVE.getEntryCount()) + std::string path; + std::string title; + bool isRedirect; + // size_t is either 32 or 64 bits and entry_index_type (redirect id) is always 32 bits. + size_t redirect_or_hash; +}; -#define ASSERT_ARCHIVE_EQUIVALENT_LIMIT(REF_ARCHIVE, TEST_ARCHIVE, LIMIT) \ - { \ - auto range = REF_ARCHIVE.iterEfficient(); \ - auto ref_it = range.begin(); \ - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), TEST_ARCHIVE, LIMIT) \ +struct RefArchiveContent { + RefArchiveContent(const zim::Archive& archive) { + for (auto entry:archive.iterEfficient()) { + RefEntry ref_entry = { + entry.getPath(), + entry.getTitle(), + entry.isRedirect(), + entry.isRedirect() ? 
entry.getRedirectEntryIndex() : std::hash{}(std::string(entry.getItem().getData())), + }; + ref_entries.push_back(ref_entry); + } } - -#define ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(REF_IT, REF_END, TEST_ARCHIVE, LIMIT) \ - for (auto i = 0U; igetPath()); \ - ASSERT_EQ(REF_IT->getPath(), test_entry.getPath()); \ - ASSERT_EQ(REF_IT->getTitle(), test_entry.getTitle()); \ - ASSERT_EQ(REF_IT->isRedirect(), test_entry.isRedirect()); \ - if (REF_IT->isRedirect()) { \ - ASSERT_EQ(REF_IT->getRedirectEntryIndex(), test_entry.getRedirectEntryIndex()); \ - } \ - auto ref_item = REF_IT->getItem(true); \ - auto test_item = test_entry.getItem(true); \ - ASSERT_EQ(ref_item.getClusterIndex(), test_item.getClusterIndex()); \ - ASSERT_EQ(ref_item.getBlobIndex(), test_item.getBlobIndex()); \ - ASSERT_EQ(ref_item.getData(), test_item.getData()); \ + void test_is_equal(const zim::Archive& archive) { + for (auto ref_entry:ref_entries) { + auto entry = archive.getEntryByPath(ref_entry.path); + ref_entry.test_is_equal(entry); + } } + std::vector ref_entries; +}; + TEST(ZimArchive, cacheDontImpactReading) { @@ -331,7 +347,7 @@ TEST(ZimArchive, cacheDontImpactReading) }; for (auto& testfile: getDataFilePath("small.zim")) { - auto ref_archive = zim::Archive(testfile.path); + RefArchiveContent ref_archive(zim::Archive(testfile.path)); for (auto cacheConfig: cacheConfigs) { auto test_archive = zim::Archive(testfile.path); @@ -343,7 +359,7 @@ TEST(ZimArchive, cacheDontImpactReading) EXPECT_EQ(test_archive.getDirentLookupCacheMaxSize(), cacheConfig.direntLookupCacheSize); EXPECT_EQ(test_archive.getClusterCacheMaxSize(), cacheConfig.clusterCacheSize); - ASSERT_ARCHIVE_EQUIVALENT(ref_archive, test_archive) + ref_archive.test_is_equal(test_archive); } } } @@ -368,15 +384,17 @@ TEST(ZimArchive, cacheChange) const size_t L1_SIZE = 850 << 10; const size_t L2_SIZE = 2 << 20; - auto ref_archive = zim::Archive(testfile.path); + RefArchiveContent ref_archive(zim::Archive(testfile.path)); auto archive = 
zim::Archive(testfile.path); archive.setDirentCacheMaxSize(30); archive.setClusterCacheMaxSize(L2_SIZE); - auto range = ref_archive.iterEfficient(); - auto ref_it = range.begin(); - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) + auto ref_it = ref_archive.ref_entries.begin(); + for (auto i = 0; i<50 && ref_it != ref_archive.ref_entries.end(); i++, ref_it++) { + auto entry = archive.getEntryByPath(ref_it->path); + ref_it->test_is_equal(entry); + } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 30); EXPECT_LE(archive.getClusterCacheCurrentSize(), L2_SIZE); // Only 2 clusters in the file @@ -389,7 +407,10 @@ TEST(ZimArchive, cacheChange) // We want to test change of cache while we are iterating on the archive. // So we don't reset the ref_it to `range.begin()`. - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) + for (auto i = 0; i<50 && ref_it != ref_archive.ref_entries.end(); i++, ref_it++) { + auto entry = archive.getEntryByPath(ref_it->path); + ref_it->test_is_equal(entry); + } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); @@ -408,7 +429,7 @@ TEST(ZimArchive, cacheChange) EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); - ASSERT_ARCHIVE_EQUIVALENT(ref_archive, archive) + ref_archive.test_is_equal(archive); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 20); EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); } From 61c863efc3238985a7c35a64ae8bbc596a4b494c Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 14 Feb 2025 14:18:45 +0100 Subject: [PATCH 07/10] Make the cluster cache global. 
--- include/zim/archive.h | 42 ++++++++--------- meson_options.txt | 4 +- src/archive.cpp | 13 +++--- src/fileimpl.cpp | 31 ++++++------- src/fileimpl.h | 13 +++--- test/archive.cpp | 105 +++++++++++++++++++++++++++++++++++++----- 6 files changed, 141 insertions(+), 67 deletions(-) diff --git a/include/zim/archive.h b/include/zim/archive.h index cad5e66ff..d19b99838 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -42,6 +42,27 @@ namespace zim efficientOrder }; + /** Get the maximum size of the cluster cache. + * + * @return The maximum memory size used the cluster cache. + */ + size_t LIBZIM_API getClusterCacheMaxSize(); + + /** Get the current size of the cluster cache. + * + * @return The current memory size used by the cluster cache. + */ + size_t LIBZIM_API getClusterCacheCurrentSize(); + + /** Set the size of the cluster cache. + * + * If the new size is lower than the number of currently stored clusters + * some clusters will be dropped from cache to respect the new size. + * + * @param sizeInB The memory limit (in bytes) for the cluster cache. + */ + void LIBZIM_API setClusterCacheMaxSize(size_t sizeInB); + /** * The Archive class to access content in a zim file. * @@ -534,27 +555,6 @@ namespace zim */ std::shared_ptr getImpl() const { return m_impl; } - /** Get the maximum size of the cluster cache. - * - * @return The maximum memory size used the cluster cache. - */ - size_t getClusterCacheMaxSize() const; - - /** Get the current size of the cluster cache. - * - * @return The current memory size used by the cluster cache. - */ - size_t getClusterCacheCurrentSize() const; - - /** Set the size of the cluster cache. - * - * If the new size is lower than the number of currently stored clusters - * some clusters will be dropped from cache to respect the new size. - * - * @param sizeInB The memory limit (in bytes) for the cluster cache. - */ - void setClusterCacheMaxSize(size_t sizeInB); - /** Get the size of the dirent cache. 
* * @return The maximum number of dirents stored in the cache. diff --git a/meson_options.txt b/meson_options.txt index 6c6d4f021..936fdf8c5 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1,5 +1,5 @@ -option('CLUSTER_CACHE_SIZE', type : 'string', value : '67108864', - description : 'set cluster cache size to number (default:64MB)') +option('CLUSTER_CACHE_SIZE', type : 'string', value : '536870912', + description : 'set cluster cache size to number (default:512MB)') option('DIRENT_CACHE_SIZE', type : 'string', value : '512', description : 'set dirent cache size to number (default:512)') option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', diff --git a/src/archive.cpp b/src/archive.cpp index e39fb1331..5cf1836b7 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -504,19 +504,19 @@ namespace zim return m_impl->hasNewNamespaceScheme(); } - size_t Archive::getClusterCacheMaxSize() const + size_t getClusterCacheMaxSize() { - return m_impl->getClusterCacheMaxSize(); + return getClusterCache().getMaxCost(); } - size_t Archive::getClusterCacheCurrentSize() const + size_t getClusterCacheCurrentSize() { - return m_impl->getClusterCacheCurrentSize(); + return getClusterCache().getCurrentCost(); } - void Archive::setClusterCacheMaxSize(size_t sizeInB) + void setClusterCacheMaxSize(size_t sizeInB) { - m_impl->setClusterCacheMaxSize(sizeInB); + getClusterCache().setMaxCost(sizeInB); } size_t Archive::getDirentCacheMaxSize() const @@ -534,7 +534,6 @@ namespace zim m_impl->setDirentCacheMaxSize(nbDirents); } - size_t Archive::getDirentLookupCacheMaxSize() const { return m_impl->getDirentLookupCacheMaxSize(); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 93be54566..ae487a59d 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -20,6 +20,7 @@ */ #include "dirent_lookup.h" +#include "zim/zim.h" #include "zim_types.h" #include #define CHUNK_SIZE 1024 @@ -31,7 +32,6 @@ #include "buffer_reader.h" #include #include -#include #include #include 
#include @@ -162,6 +162,11 @@ class Grouping } //unnamed namespace + ClusterCache& getClusterCache() { + static ClusterCache clusterCache(CLUSTER_CACHE_SIZE); + return clusterCache; + } + ////////////////////////////////////////////////////////////////////// // FileImpl // @@ -187,7 +192,6 @@ class Grouping : zimFile(_zimFile), zimReader(makeFileReader(zimFile)), direntReader(new DirentReader(zimReader)), - clusterCache(CLUSTER_CACHE_SIZE), m_hasFrontArticlesIndex(true), m_startUserEntry(0), m_endUserEntry(0), @@ -265,7 +269,7 @@ class Grouping FileImpl::~FileImpl() { // We have to clean the global cache for our clusters. - clusterCache.dropAll([=](const cluster_index_type key) {return true;}); + getClusterCache().dropAll([=](const std::tuple& key) {return std::get<0>(key) == this;}); } std::unique_ptr FileImpl::getTitleAccessorV1(const entry_index_t idx) @@ -470,19 +474,21 @@ class Grouping return entry_index_t(m_articleListByCluster[idx.v]); } - FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx) + ClusterHandle FileImpl::readCluster(cluster_index_t idx) { offset_t clusterOffset(getClusterOffset(idx)); log_debug("read cluster " << idx << " from offset " << clusterOffset); return Cluster::read(*zimReader, clusterOffset); } - std::shared_ptr FileImpl::getCluster(cluster_index_t idx) + ClusterHandle FileImpl::getCluster(cluster_index_t idx) { if (idx >= getCountClusters()) throw ZimFileFormatError("cluster index out of range"); - auto cluster = clusterCache.getOrPut(idx.v, [=](){ return readCluster(idx); }); + auto cluster_index_type = idx.v; + auto key = std::make_tuple(this, cluster_index_type); + auto cluster = getClusterCache().getOrPut(key, [=](){ return readCluster(idx); }); #if ENV32BIT // There was a bug in the way we create the zim files using ZSTD compression. // We were using a too hight compression level and so a window of 128Mb. 
@@ -497,7 +503,7 @@ class Grouping // 5.0 is not a perfect way to detect faulty zim file (it will generate false // positives) but it should be enough. if (header.getMajorVersion() == 5 && header.getMinorVersion() == 0) { - clusterCache.drop(idx.v); + getClusterCache().drop(key); } } #endif @@ -795,17 +801,6 @@ bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type return true; } - - size_t FileImpl::getClusterCacheMaxSize() const { - return clusterCache.getMaxCost(); - } - size_t FileImpl::getClusterCacheCurrentSize() const { - return clusterCache.getCurrentCost(); - } - void FileImpl::setClusterCacheMaxSize(size_t sizeInB) { - clusterCache.setMaxCost(sizeInB); - } - size_t FileImpl::getDirentCacheMaxSize() const { return mp_pathDirentAccessor->getMaxCacheSize(); } diff --git a/src/fileimpl.h b/src/fileimpl.h index cf50790fa..f64fbc2fb 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -36,13 +37,17 @@ #include "file_reader.h" #include "file_compound.h" #include "fileheader.h" -#include "lrucache.h" #include "zim_types.h" #include "direntreader.h" namespace zim { + class FileImpl; + typedef std::shared_ptr ClusterHandle; + typedef ConcurrentCache, ClusterHandle, ClusterMemorySize> ClusterCache; + ClusterCache& getClusterCache(); + class FileImpl { std::shared_ptr zimFile; @@ -55,9 +60,6 @@ namespace zim std::shared_ptr mp_pathDirentAccessor; std::unique_ptr mp_titleDirentAccessor; - typedef std::shared_ptr ClusterHandle; - ConcurrentCache clusterCache; - const bool m_hasFrontArticlesIndex; const entry_index_t m_startUserEntry; const entry_index_t m_endUserEntry; @@ -155,9 +157,6 @@ namespace zim bool checkIntegrity(IntegrityCheck checkType); - size_t getClusterCacheMaxSize() const; - size_t getClusterCacheCurrentSize() const; - void setClusterCacheMaxSize(size_t nbClusters); size_t getDirentCacheMaxSize() const; size_t getDirentCacheCurrentSize() const; void 
setDirentCacheMaxSize(size_t nbDirents); diff --git a/test/archive.cpp b/test/archive.cpp index 48c922c25..bb3673570 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -353,17 +353,35 @@ TEST(ZimArchive, cacheDontImpactReading) auto test_archive = zim::Archive(testfile.path); test_archive.setDirentCacheMaxSize(cacheConfig.direntCacheSize); test_archive.setDirentLookupCacheMaxSize(cacheConfig.direntLookupCacheSize); - test_archive.setClusterCacheMaxSize(cacheConfig.clusterCacheSize); + zim::setClusterCacheMaxSize(cacheConfig.clusterCacheSize); EXPECT_EQ(test_archive.getDirentCacheMaxSize(), cacheConfig.direntCacheSize); EXPECT_EQ(test_archive.getDirentLookupCacheMaxSize(), cacheConfig.direntLookupCacheSize); - EXPECT_EQ(test_archive.getClusterCacheMaxSize(), cacheConfig.clusterCacheSize); + EXPECT_EQ(zim::getClusterCacheMaxSize(), cacheConfig.clusterCacheSize); ref_archive.test_is_equal(test_archive); } } } +TEST(ZimArchive, cacheClean) { + for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) { + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache + { + auto archive = zim::Archive(testfile.path); + auto range = archive.iterEfficient(); + auto it = range.begin(); + for (auto i = 0; i<50 && it != range.end(); i++, it++) { + // Be sure to search by path to populate the dirent cache + auto entry = archive.getEntryByPath(it->getPath()); + auto item = entry.getItem(true); + auto data = item.getData(); + } + EXPECT_GT(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache + } + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache + } +} TEST(ZimArchive, cacheChange) { @@ -384,11 +402,12 @@ TEST(ZimArchive, cacheChange) const size_t L1_SIZE = 850 << 10; const size_t L2_SIZE = 2 << 20; + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); RefArchiveContent ref_archive(zim::Archive(testfile.path)); auto archive = zim::Archive(testfile.path); archive.setDirentCacheMaxSize(30); - 
archive.setClusterCacheMaxSize(L2_SIZE); + zim::setClusterCacheMaxSize(L2_SIZE); auto ref_it = ref_archive.ref_entries.begin(); for (auto i = 0; i<50 && ref_it != ref_archive.ref_entries.end(); i++, ref_it++) { @@ -396,14 +415,14 @@ TEST(ZimArchive, cacheChange) ref_it->test_is_equal(entry); } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 30); - EXPECT_LE(archive.getClusterCacheCurrentSize(), L2_SIZE); // Only 2 clusters in the file + EXPECT_LE(zim::getClusterCacheCurrentSize(), L2_SIZE); // Only 2 clusters in the file // Reduce cache size archive.setDirentCacheMaxSize(10); - archive.setClusterCacheMaxSize(L1_SIZE); + zim::setClusterCacheMaxSize(L1_SIZE); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); // We want to test change of cache while we are iterating on the archive. // So we don't reset the ref_it to `range.begin()`. @@ -413,26 +432,88 @@ TEST(ZimArchive, cacheChange) } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); // Clean cache // (More than testing the value, this is needed as we want to be sure the cache is actually populated later) archive.setDirentCacheMaxSize(0); - archive.setClusterCacheMaxSize(0); + zim::setClusterCacheMaxSize(0); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // Increase the cache archive.setDirentCacheMaxSize(20); - archive.setClusterCacheMaxSize(L1_SIZE); + zim::setClusterCacheMaxSize(L1_SIZE); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); ref_archive.test_is_equal(archive); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 20); - 
EXPECT_LE(archive.getClusterCacheCurrentSize(), L1_SIZE); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); + } +} + + +TEST(ZimArchive, MultiZimCache) +{ + // Get a list of several zim files to open (whatever the variant) + std::vector zimPaths; + const char* const zimfiles[] = { + "wikibooks_be_all_nopic_2017-02.zim", + "wikibooks_be_all_nopic_2017-02_splitted.zim", + "wikipedia_en_climate_change_mini_2024-06.zim" + }; + + for ( const std::string fname : zimfiles ) { + for (auto& testfile: getDataFilePath(fname)) { + zimPaths.push_back(testfile.path); + } + } + + + const size_t SMALL_LIMIT = 5 << 20; + const size_t BIG_LIMIT = 200 << 20; + zim::setClusterCacheMaxSize(BIG_LIMIT); + + std::vector archives; + for (auto path:zimPaths) { + auto archive = zim::Archive(path); + for (auto entry:archive.iterEfficient()) { + auto item = entry.getItem(true); + auto data = item.getData(); + } + archives.push_back(archive); + } + + EXPECT_LE(zim::getClusterCacheCurrentSize(), BIG_LIMIT); + zim::setClusterCacheMaxSize(SMALL_LIMIT); + EXPECT_LE(zim::getClusterCacheCurrentSize(), SMALL_LIMIT); + + // Opening an archive should increase the cluster cache + zim::setClusterCacheMaxSize(BIG_LIMIT); + auto current_limit = zim::getClusterCacheCurrentSize(); + { + auto archive = zim::Archive(zimPaths[0]); + for (auto entry:archive.iterEfficient()) { + auto item = entry.getItem(true); + auto data = item.getData(); + } + EXPECT_GT(zim::getClusterCacheCurrentSize(), current_limit); + current_limit = zim::getClusterCacheCurrentSize(); + } + // Destroying an archive should decrease the cluster cache + EXPECT_LT(zim::getClusterCacheCurrentSize(), current_limit); + + // Be sure that decreasing the number of archives open also decrease the + // current cache size, until we reach 0. 
+ current_limit = zim::getClusterCacheCurrentSize(); + while (!archives.empty()) { + archives.pop_back(); + EXPECT_LE(zim::getClusterCacheCurrentSize(), current_limit); + current_limit = zim::getClusterCacheCurrentSize(); } + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); } TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) From 8c91f85dbcef1fe3f1e0d8a403a9ad2c128472f7 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 14 Feb 2025 14:24:11 +0100 Subject: [PATCH 08/10] Reset global cache size before each archive test. --- test/archive.cpp | 59 ++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/test/archive.cpp b/test/archive.cpp index bb3673570..c8cdcaae7 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -43,6 +43,15 @@ using zim::unittests::TempFile; using zim::unittests::TestItem; using zim::unittests::IsFrontArticle; +class ZimArchive: public testing::Test { + protected: + void SetUp() override { + zim::setClusterCacheMaxSize(0); + zim::setClusterCacheMaxSize(CLUSTER_CACHE_SIZE); + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); + } +}; + using TestContextImpl = std::vector >; struct TestContext : TestContextImpl { TestContext(const std::initializer_list& il) @@ -80,7 +89,7 @@ emptyZimArchiveContent() return content; } -TEST(ZimArchive, openingAnInvalidZimArchiveFails) +TEST_F(ZimArchive, openingAnInvalidZimArchiveFails) { const char* const prefixes[] = { "ZIM\x04", "" }; const unsigned char bytes[] = {0x00, 0x01, 0x11, 0x30, 0xFF}; @@ -101,7 +110,7 @@ TEST(ZimArchive, openingAnInvalidZimArchiveFails) } } -TEST(ZimArchive, openingAnEmptyZimArchiveSucceeds) +TEST_F(ZimArchive, openingAnEmptyZimArchiveSucceeds) { const auto tmpfile = makeTempFile("empty_zim_file", emptyZimArchiveContent()); @@ -122,7 +131,7 @@ bool isNastyOffset(int offset) { return true; } -TEST(ZimArchive, nastyEmptyZimArchive) +TEST_F(ZimArchive, nastyEmptyZimArchive) { const std::string correctContent = 
emptyZimArchiveContent(); for ( int offset = 0; offset < 80; ++offset ) { @@ -136,7 +145,7 @@ TEST(ZimArchive, nastyEmptyZimArchive) } } -TEST(ZimArchive, wrongChecksumInEmptyZimArchive) +TEST_F(ZimArchive, wrongChecksumInEmptyZimArchive) { std::string zimfileContent = emptyZimArchiveContent(); zimfileContent[85] = '\xff'; @@ -147,7 +156,7 @@ TEST(ZimArchive, wrongChecksumInEmptyZimArchive) } -TEST(ZimArchive, openCreatedArchive) +TEST_F(ZimArchive, openCreatedArchive) { TempFile temp("zimfile"); auto tempPath = temp.path(); @@ -245,7 +254,7 @@ TEST(ZimArchive, openCreatedArchive) } #if WITH_TEST_DATA -TEST(ZimArchive, openRealZimArchive) +TEST_F(ZimArchive, openRealZimArchive) { const char* const zimfiles[] = { "small.zim", @@ -266,7 +275,7 @@ TEST(ZimArchive, openRealZimArchive) } } -TEST(ZimArchive, openSplitZimArchive) +TEST_F(ZimArchive, openSplitZimArchive) { const char* fname = "wikibooks_be_all_nopic_2017-02_splitted.zim"; @@ -330,7 +339,7 @@ struct RefArchiveContent { }; -TEST(ZimArchive, cacheDontImpactReading) +TEST_F(ZimArchive, cacheDontImpactReading) { const TestCacheConfig cacheConfigs[] = { {0, 0, 0}, @@ -364,7 +373,7 @@ TEST(ZimArchive, cacheDontImpactReading) } } -TEST(ZimArchive, cacheClean) { +TEST_F(ZimArchive, cacheClean) { for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) { EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache { @@ -383,7 +392,7 @@ TEST(ZimArchive, cacheClean) { } } -TEST(ZimArchive, cacheChange) +TEST_F(ZimArchive, cacheChange) { // We test only one variant here. 
// Each variant has cluster of different size (especially the old "withns" which @@ -455,7 +464,7 @@ TEST(ZimArchive, cacheChange) } -TEST(ZimArchive, MultiZimCache) +TEST_F(ZimArchive, MultiZimCache) { // Get a list of several zim files to open (whatever the variant) std::vector zimPaths; @@ -516,7 +525,7 @@ TEST(ZimArchive, MultiZimCache) EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); } -TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) +TEST_F(ZimArchive, openDontFallbackOnNonSplitZimArchive) { const char* fname = "wikibooks_be_all_nopic_2017-02.zim"; @@ -532,7 +541,7 @@ TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) } } -TEST(ZimArchive, openNonExistantZimArchive) +TEST_F(ZimArchive, openNonExistantZimArchive) { const std::string fname = "non_existant.zim"; @@ -545,7 +554,7 @@ TEST(ZimArchive, openNonExistantZimArchive) } } -TEST(ZimArchive, openNonExistantZimSplitArchive) +TEST_F(ZimArchive, openNonExistantZimSplitArchive) { const std::string fname = "non_existant.zimaa"; @@ -558,7 +567,7 @@ TEST(ZimArchive, openNonExistantZimSplitArchive) } } -TEST(ZimArchive, randomEntry) +TEST_F(ZimArchive, randomEntry) { const char* const zimfiles[] = { "wikibooks_be_all_nopic_2017-02.zim", @@ -583,7 +592,7 @@ TEST(ZimArchive, randomEntry) } } -TEST(ZimArchive, illustration) +TEST_F(ZimArchive, illustration) { const char* const zimfiles[] = { "small.zim", @@ -628,7 +637,7 @@ struct TestDataInfo { } }; -TEST(ZimArchive, articleNumber) +TEST_F(ZimArchive, articleNumber) { TestDataInfo zimfiles[] = { // Name mediaCount, withns nons noTitleListingV0 @@ -694,7 +703,7 @@ for(auto& testfile: getDataFilePath(ZIMNAME, CAT)) {EXPECT_BROKEN_ZIMFILE(testfi #define WITH_TITLE_IDX_CAT {"withns", "nons"} #if WITH_TEST_DATA -TEST(ZimArchive, validate) +TEST_F(ZimArchive, validate) { zim::IntegrityCheckList all; all.set(); @@ -892,7 +901,7 @@ void checkEquivalence(const zim::Archive& archive1, const zim::Archive& archive2 #endif } -TEST(ZimArchive, multipart) 
+TEST_F(ZimArchive, multipart) { auto nonSplittedZims = getDataFilePath("wikibooks_be_all_nopic_2017-02.zim"); auto splittedZims = getDataFilePath("wikibooks_be_all_nopic_2017-02_splitted.zim"); @@ -921,7 +930,7 @@ TEST(ZimArchive, multipart) #endif #ifndef _WIN32 -TEST(ZimArchive, openByFD) +TEST_F(ZimArchive, openByFD) { for(auto& testfile: getDataFilePath("small.zim")) { const zim::Archive archive1(testfile.path); @@ -932,7 +941,7 @@ TEST(ZimArchive, openByFD) } } -TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) +TEST_F(ZimArchive, openZIMFileEmbeddedInAnotherFile) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded"); @@ -948,7 +957,7 @@ TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) } } -TEST(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) +TEST_F(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded.multi"); @@ -991,7 +1000,7 @@ zim::Blob readItemData(const zim::Item::DirectAccessInfo& dai, zim::size_type si return zim::Blob(data, size); } -TEST(ZimArchive, getDirectAccessInformation) +TEST_F(ZimArchive, getDirectAccessInformation) { for(auto& testfile:getDataFilePath("small.zim")) { const zim::Archive archive(testfile.path); @@ -1012,7 +1021,7 @@ TEST(ZimArchive, getDirectAccessInformation) } #ifndef _WIN32 -TEST(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) +TEST_F(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) { for(auto& testfile:getDataFilePath("small.zim")) { const int fd = OPEN_READ_ONLY(testfile.path); @@ -1033,7 +1042,7 @@ TEST(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) } } -TEST(ZimArchive, getDirectAccessInformationFromEmbeddedArchive) +TEST_F(ZimArchive, getDirectAccessInformationFromEmbeddedArchive) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded"); From 
9b9e229a30a701d218c9ea3cd5b963edd4d840c8 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 14 Feb 2025 15:05:29 +0100 Subject: [PATCH 09/10] Ensure cluster cache is properly cleaned in case of exception in constructor. --- src/fileimpl.cpp | 41 ++++++++++++++++++++++++++--------------- src/fileimpl.h | 2 ++ test/archive.cpp | 6 +++++- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index ae487a59d..46751a488 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -248,30 +248,41 @@ class Grouping const_cast(m_endUserEntry) = getCountArticles(); } - auto result = tmpDirentLookup.find('X', "listing/titleOrdered/v1"); - if (result.first) { - mp_titleDirentAccessor = getTitleAccessorV1(result.second); - } + // The following code may create clusters and we want to remove them from the cache + // if something goes wrong. + try { + auto result = tmpDirentLookup.find('X', "listing/titleOrdered/v1"); + if (result.first) { + mp_titleDirentAccessor = getTitleAccessorV1(result.second); + } - if (!mp_titleDirentAccessor) { - if (!header.hasTitleListingV0()) { - throw ZimFileFormatError("Zim file doesn't contain a title ordered index"); + if (!mp_titleDirentAccessor) { + if (!header.hasTitleListingV0()) { + throw ZimFileFormatError("Zim file doesn't contain a title ordered index"); + } + offset_t titleOffset(header.getTitleIdxPos()); + zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); + mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); + const_cast(m_hasFrontArticlesIndex) = false; } - offset_t titleOffset(header.getTitleIdxPos()); - zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); - mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); - const_cast(m_hasFrontArticlesIndex) = false; - } - m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); + m_byTitleDirentLookup.reset(new 
ByTitleDirentLookup(mp_titleDirentAccessor.get())); - readMimeTypes(); + readMimeTypes(); + } catch (...) { + dropCachedClusters(); + throw; + } } FileImpl::~FileImpl() { - // We have to clean the global cache for our clusters. + dropCachedClusters(); + } + + void FileImpl::dropCachedClusters() const { getClusterCache().dropAll([=](const std::tuple& key) {return std::get<0>(key) == this;}); } + std::unique_ptr FileImpl::getTitleAccessorV1(const entry_index_t idx) { auto dirent = mp_pathDirentAccessor->getDirent(idx); diff --git a/src/fileimpl.h b/src/fileimpl.h index f64fbc2fb..17cae5aec 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -166,6 +166,8 @@ namespace zim explicit FileImpl(std::shared_ptr zimFile); FileImpl(std::shared_ptr zimFile, offset_t offset, zsize_t size); + void dropCachedClusters() const; + std::unique_ptr getTitleAccessorV1(const entry_index_t idx); std::unique_ptr getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name); diff --git a/test/archive.cpp b/test/archive.cpp index c8cdcaae7..571190c53 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -50,6 +50,9 @@ class ZimArchive: public testing::Test { zim::setClusterCacheMaxSize(CLUSTER_CACHE_SIZE); ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); } + void TearDown() override { + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); + } }; using TestContextImpl = std::vector >; @@ -691,7 +694,8 @@ class CapturedStderr #define EXPECT_BROKEN_ZIMFILE(ZIMPATH, EXPECTED_STDERROR_TEXT) \ CapturedStderr stderror; \ EXPECT_FALSE(zim::validate(ZIMPATH, checksToRun)); \ - EXPECT_EQ(EXPECTED_STDERROR_TEXT, std::string(stderror)) << ZIMPATH; + EXPECT_EQ(EXPECTED_STDERROR_TEXT, std::string(stderror)) << ZIMPATH; \ + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); #define TEST_BROKEN_ZIM_NAME(ZIMNAME, EXPECTED) \ for(auto& testfile: getDataFilePath(ZIMNAME)) {EXPECT_BROKEN_ZIMFILE(testfile.path, EXPECTED)} From f896c0f15fb8bd62eefd4630b96234d49d08a0a1 Mon Sep 17 00:00:00 2001 
From: Matthieu Gautier Date: Wed, 5 Mar 2025 18:05:30 +0100 Subject: [PATCH 10/10] Check if shared_future has been dropped while we compute the value. If creation of the value takes a lot of time, we may have moved a lot of values to the front of the lru cache and dropped the shared_future before we set it. So before blindly increasing the cost, we check that the promise is still there and re-add it otherwise. --- src/concurrent_cache.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h index f0dc4764f..d7b4a57c7 100644 --- a/src/concurrent_cache.h +++ b/src/concurrent_cache.h @@ -90,7 +90,8 @@ class ConcurrentCache: private lru_cache, FutureT { std::promise valuePromise; std::unique_lock l(lock_); - const auto x = Impl::getOrPut(key, valuePromise.get_future().share()); + auto shared_future = valuePromise.get_future().share(); + const auto x = Impl::getOrPut(key, shared_future); l.unlock(); if ( x.miss() ) { try { @@ -98,7 +99,7 @@ class ConcurrentCache: private lru_cache, FutureT auto cost = CostEstimation::cost(x.value().get()); // There is a small window when the valuePromise may be drop from lru cache after // we set the value but before we increase the size of the cache. - // In this case decrease the size of `cost` before increasing it. + // In this case we decrease the size of `cost` before increasing it. // First of all it should be pretty rare as we have just put the future in the cache so it // should not be the least used item. // If it happens, this should not be a problem if current_size is bigger than `cost` (most of the time) @@ -106,7 +107,15 @@ class ConcurrentCache: private lru_cache, FutureT // `decreaseCost` will clamp the new size to 0. { std::unique_lock l(lock_); - Impl::increaseCost(cost); + // There is a window when the shared_future is dropped from the cache while we are computing the value. + // If this is the case, we re-add the shared_future to the cache. 
+ if (!Impl::exists(key)) { + // We don't have to increase the cache cost as the future is already set, so the cost will be valid. + Impl::put(key, shared_future); + } else { + // We just have to increase the cost as we used 0 for an unset future. + Impl::increaseCost(cost); + } } } catch (std::exception& e) { drop(key);