diff --git a/include/zim/archive.h b/include/zim/archive.h index a5985d1d9..d19b99838 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -42,6 +42,27 @@ namespace zim efficientOrder }; + /** Get the maximum size of the cluster cache. + * + * @return The maximum memory size used the cluster cache. + */ + size_t LIBZIM_API getClusterCacheMaxSize(); + + /** Get the current size of the cluster cache. + * + * @return The current memory size used by the cluster cache. + */ + size_t LIBZIM_API getClusterCacheCurrentSize(); + + /** Set the size of the cluster cache. + * + * If the new size is lower than the number of currently stored clusters + * some clusters will be dropped from cache to respect the new size. + * + * @param sizeInB The memory limit (in bytes) for the cluster cache. + */ + void LIBZIM_API setClusterCacheMaxSize(size_t sizeInB); + /** * The Archive class to access content in a zim file. * @@ -534,27 +555,6 @@ namespace zim */ std::shared_ptr getImpl() const { return m_impl; } - /** Get the maximum size of the cluster cache. - * - * @return The maximum number of clusters stored in the cache. - */ - size_t getClusterCacheMaxSize() const; - - /** Get the current size of the cluster cache. - * - * @return The number of clusters currently stored in the cache. - */ - size_t getClusterCacheCurrentSize() const; - - /** Set the size of the cluster cache. - * - * If the new size is lower than the number of currently stored clusters - * some clusters will be dropped from cache to respect the new size. - * - * @param nbClusters The maximum number of clusters stored in the cache. - */ - void setClusterCacheMaxSize(size_t nbClusters); - /** Get the size of the dirent cache. * * @return The maximum number of dirents stored in the cache. 
diff --git a/meson_options.txt b/meson_options.txt index e23118f6e..936fdf8c5 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -1,5 +1,5 @@ -option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', - description : 'set cluster cache size to number (default:16)') +option('CLUSTER_CACHE_SIZE', type : 'string', value : '536870912', + description : 'set cluster cache size to number (default:512MB)') option('DIRENT_CACHE_SIZE', type : 'string', value : '512', description : 'set dirent cache size to number (default:512)') option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', diff --git a/src/archive.cpp b/src/archive.cpp index e370a9bf2..5cf1836b7 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -504,19 +504,19 @@ namespace zim return m_impl->hasNewNamespaceScheme(); } - size_t Archive::getClusterCacheMaxSize() const + size_t getClusterCacheMaxSize() { - return m_impl->getClusterCacheMaxSize(); + return getClusterCache().getMaxCost(); } - size_t Archive::getClusterCacheCurrentSize() const + size_t getClusterCacheCurrentSize() { - return m_impl->getClusterCacheCurrentSize(); + return getClusterCache().getCurrentCost(); } - void Archive::setClusterCacheMaxSize(size_t nbClusters) + void setClusterCacheMaxSize(size_t sizeInB) { - m_impl->setClusterCacheMaxSize(nbClusters); + getClusterCache().setMaxCost(sizeInB); } size_t Archive::getDirentCacheMaxSize() const @@ -534,7 +534,6 @@ namespace zim m_impl->setDirentCacheMaxSize(nbDirents); } - size_t Archive::getDirentLookupCacheMaxSize() const { return m_impl->getDirentLookupCacheMaxSize(); diff --git a/src/buffer_reader.cpp b/src/buffer_reader.cpp index 3b64ca137..0d08518a6 100644 --- a/src/buffer_reader.cpp +++ b/src/buffer_reader.cpp @@ -44,6 +44,11 @@ zsize_t BufferReader::size() const return source.size(); } +size_t BufferReader::getMemorySize() const +{ + return source.size().v; +} + offset_t BufferReader::offset() const { return offset_t((offset_type)(static_cast(source.data(offset_t(0))))); 
diff --git a/src/buffer_reader.h b/src/buffer_reader.h index f0972c375..27a128fcd 100644 --- a/src/buffer_reader.h +++ b/src/buffer_reader.h @@ -31,6 +31,7 @@ class LIBZIM_PRIVATE_API BufferReader : public Reader { virtual ~BufferReader() {}; zsize_t size() const override; + size_t getMemorySize() const override; offset_t offset() const override; const Buffer get_buffer(offset_t offset, zsize_t size) const override; diff --git a/src/cluster.cpp b/src/cluster.cpp index 214af0e38..b587b8a19 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -31,8 +31,6 @@ #include "log.h" -#include "config.h" - log_define("zim.cluster") #define log_debug1(e) @@ -86,7 +84,8 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* Cluster::Cluster(std::unique_ptr reader_, Compression comp, bool isExtended) : compression(comp), isExtended(isExtended), - m_reader(std::move(reader_)) + m_reader(std::move(reader_)), + m_memorySize(0) { if (isExtended) { read_header(); @@ -179,4 +178,44 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* } } + // This function must return a constant size for a given cluster. + // This is important as we want to remove the same size that what we add when we remove + // the cluster from the cache. + // However, because of partial decompression, this size can change: + // - As we advance in the compression, we can create new blob readers in `m_blobReaders` + // - The stream itself may allocate memory. + // To solve this, we take the average and say a cluster's blob readers will half be created and + // so we assume a readers size of half the full uncompressed cluster data size. + // If cluster is not compressed, we never store its content (mmap is created on demand and not cached), + // so we use a size of 0 for the readers. + // It also appears that when we get the size of the stream, we reach a state where no + // futher allocation will be done by it. 
Probably because: + // - We already started to decompress the stream to read the offsets + // - Cluster data size is smaller than window size associated to compression level (?) + // We anyway check that and print a warning if this is not the case, hoping that user will create + // an issue allowing us for further analysis. + // Note: + // - No need to protect this method from concurrent access as it will be called by the concurrent_cache which will + // have a lock (on lru cache) to ensure only one thread accesses it at the same time. + size_t Cluster::getMemorySize() const { + if (!m_memorySize) { + auto offsets_size = sizeof(offset_t) * m_blobOffsets.size(); + auto readers_size = 0; + if (isCompressed()) { + readers_size = m_blobOffsets.back().v / 2; + } + m_streamSize = m_reader->getMemorySize(); + // Compression level defines a huge window and makes the decompression stream allocate a huge memory to store it. + // However, the used memory will not be greater than the content itself, even if window is bigger. + // On linux (at least), the real used memory will be the actual memory used, not the one allocated. + // So, let's clamp the stream size to the size of the content itself.
+ m_memorySize = offsets_size + readers_size + std::min(m_streamSize, m_blobOffsets.back().v); + } + auto streamSize = m_reader->getMemorySize(); + if (streamSize != m_streamSize) { + std::cerr << "WARNING: stream size have changed from " << m_streamSize << " to " << streamSize << std::endl; + std::cerr << "Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use" << std::endl; + } + return m_memorySize; + } } diff --git a/src/cluster.h b/src/cluster.h index db1be37c1..0c1a6b553 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -70,6 +70,8 @@ namespace zim mutable std::mutex m_readerAccessMutex; mutable BlobReaders m_blobReaders; + mutable size_t m_memorySize; + mutable size_t m_streamSize; template @@ -90,9 +92,17 @@ namespace zim Blob getBlob(blob_index_t n) const; Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; + size_t getMemorySize() const; + static std::shared_ptr read(const Reader& zimReader, offset_t clusterOffset); }; + struct ClusterMemorySize { + static size_t cost(const std::shared_ptr& cluster) { + return cluster->getMemorySize(); + } + }; + } #endif // ZIM_CLUSTER_H diff --git a/src/compression.cpp b/src/compression.cpp index f5d3352c8..c3359b9ba 100644 --- a/src/compression.cpp +++ b/src/compression.cpp @@ -60,6 +60,11 @@ void LZMA_INFO::stream_end_decode(stream_t* stream) lzma_end(stream); } +size_t LZMA_INFO::state_size(const stream_t& stream) +{ + return lzma_memusage(&stream); +} + const std::string ZSTD_INFO::name = "zstd"; @@ -170,3 +175,11 @@ void ZSTD_INFO::stream_end_decode(stream_t* stream) void ZSTD_INFO::stream_end_encode(stream_t* stream) { } + +size_t ZSTD_INFO::state_size(const stream_t& stream) { + if (stream.decoder_stream) { + return ZSTD_sizeof_DStream(stream.decoder_stream); + } else { + return ZSTD_sizeof_CStream(stream.encoder_stream); + } +} diff --git a/src/compression.h b/src/compression.h index 4daba33b5..b27a86531 100644 --- a/src/compression.h +++ 
b/src/compression.h @@ -65,6 +65,7 @@ struct LZMA_INFO { static CompStatus stream_run_decode(stream_t* stream, CompStep step); static CompStatus stream_run(stream_t* stream, CompStep step); static void stream_end_decode(stream_t* stream); + static size_t state_size(const stream_t& stream); }; @@ -94,6 +95,7 @@ struct LIBZIM_PRIVATE_API ZSTD_INFO { static CompStatus stream_run_decode(stream_t* stream, CompStep step); static void stream_end_encode(stream_t* stream); static void stream_end_decode(stream_t* stream); + static size_t state_size(const stream_t& stream); }; diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h index a2dd0bd2f..d7b4a57c7 100644 --- a/src/concurrent_cache.h +++ b/src/concurrent_cache.h @@ -23,6 +23,7 @@ #include "lrucache.h" +#include #include #include #include @@ -30,6 +31,30 @@ namespace zim { +template +struct FutureToValueCostEstimation { + template + static size_t cost(const std::shared_future& future) { + // The future is the value in the cache. + // When calling getOrPut, if the key is not in the cache, + // we add a future and then we compute the value and set the future. + // But lrucache call us when we add the future, meaning before we have + // computed the value. If we wait here (or use future.get), we will dead lock + // as we need to exit before setting the value. + // So in this case, we return 0. `ConcurrentCache::getOrPut` will correctly increase + // the current cache size when it have an actual value. + // We still need to compute the size of the value if the future has a value as it + // is also use to decrease the cache size when the value is drop. 
+ std::future_status status = future.wait_for(std::chrono::nanoseconds::zero()); + if (status == std::future_status::ready) { + return CostEstimation::cost(future.get()); + } else { + return 0; + } + } + +}; + /** ConcurrentCache implements a concurrent thread-safe cache @@ -39,16 +64,16 @@ namespace zim safe, and, in case of a cache miss, will block until that element becomes available. */ -template -class ConcurrentCache +template +class ConcurrentCache: private lru_cache, FutureToValueCostEstimation> { private: // types typedef std::shared_future ValuePlaceholder; - typedef lru_cache Impl; + typedef lru_cache> Impl; public: // types - explicit ConcurrentCache(size_t maxEntries) - : impl_(maxEntries) + explicit ConcurrentCache(size_t maxCost) + : Impl(maxCost) {} // Gets the entry corresponding to the given key. If the entry is not in the @@ -65,11 +90,33 @@ class ConcurrentCache { std::promise valuePromise; std::unique_lock l(lock_); - const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); + auto shared_future = valuePromise.get_future().share(); + const auto x = Impl::getOrPut(key, shared_future); l.unlock(); if ( x.miss() ) { try { valuePromise.set_value(f()); + auto cost = CostEstimation::cost(x.value().get()); + // There is a small window when the valuePromise may be drop from lru cache after + // we set the value but before we increase the size of the cache. + // In this case we decrease the size of `cost` before increasing it. + // First of all it should be pretty rare as we have just put the future in the cache so it + // should not be the least used item. + // If it happens, this should not be a problem if current_size is bigger than `cost` (most of the time) + // For the really rare specific case of current cach size being lower than `cost` (if possible), + // `decreaseCost` will clamp the new size to 0. + { + std::unique_lock l(lock_); + // There is a window when the shared_future is drop from the cache while we are computing the value. 
+ // If this is the case, we readd the shared_future in the cache. + if (!Impl::exists(key)) { + // We don't have have to increase the cache as the future is already set, so the cost will be valid. + Impl::put(key, shared_future); + } else { + // We just have to increase the cost as we used 0 for unset future. + Impl::increaseCost(cost); + } + } } catch (std::exception& e) { drop(key); throw; @@ -82,26 +129,31 @@ class ConcurrentCache bool drop(const Key& key) { std::unique_lock l(lock_); - return impl_.drop(key); + return Impl::drop(key); + } + + template + void dropAll(F f) { + std::unique_lock l(lock_); + Impl::dropAll(f); } - size_t getMaxSize() const { + size_t getMaxCost() const { std::unique_lock l(lock_); - return impl_.getMaxSize(); + return Impl::getMaxCost(); } - size_t getCurrentSize() const { + size_t getCurrentCost() const { std::unique_lock l(lock_); - return impl_.size(); + return Impl::cost(); } - void setMaxSize(size_t newSize) { + void setMaxCost(size_t newSize) { std::unique_lock l(lock_); - return impl_.setMaxSize(newSize); + return Impl::setMaxCost(newSize); } private: // data - Impl impl_; mutable std::mutex lock_; }; diff --git a/src/decoderstreamreader.h b/src/decoderstreamreader.h index d48582b7e..43f2b6256 100644 --- a/src/decoderstreamreader.h +++ b/src/decoderstreamreader.h @@ -23,6 +23,7 @@ #include "compression.h" #include "istreamreader.h" +#include namespace zim { @@ -49,6 +50,10 @@ class DecoderStreamReader : public IStreamReader Decoder::stream_end_decode(&m_decoderState); } + size_t getMemorySize() const override { + return m_encodedDataReader->getMemorySize() + m_encodedDataChunk.size().v + Decoder::state_size(m_decoderState); + } + private: // functions void readNextChunk() { diff --git a/src/dirent_accessor.h b/src/dirent_accessor.h index ef3953128..2156e1033 100644 --- a/src/dirent_accessor.h +++ b/src/dirent_accessor.h @@ -55,9 +55,9 @@ class LIBZIM_PRIVATE_API DirectDirentAccessor std::shared_ptr getDirent(entry_index_t 
idx) const; entry_index_t getDirentCount() const { return m_direntCount; } - size_t getMaxCacheSize() const { return m_direntCache.getMaxSize(); } - size_t getCurrentCacheSize() const { return m_direntCache.size(); } - void setMaxCacheSize(size_t nbDirents) const { m_direntCache.setMaxSize(nbDirents); } + size_t getMaxCacheSize() const { return m_direntCache.getMaxCost(); } + size_t getCurrentCacheSize() const { return m_direntCache.cost(); } + void setMaxCacheSize(size_t nbDirents) const { m_direntCache.setMaxCost(nbDirents); } private: // functions std::shared_ptr readDirent(offset_t) const; @@ -67,7 +67,7 @@ class LIBZIM_PRIVATE_API DirectDirentAccessor std::unique_ptr mp_pathPtrReader; entry_index_t m_direntCount; - mutable lru_cache> m_direntCache; + mutable lru_cache, UnitCostEstimation> m_direntCache; mutable std::mutex m_direntCacheLock; mutable std::vector m_bufferDirentZone; diff --git a/src/file_reader.h b/src/file_reader.h index 817e24c31..b247f21cb 100644 --- a/src/file_reader.h +++ b/src/file_reader.h @@ -34,6 +34,7 @@ class LIBZIM_PRIVATE_API BaseFileReader : public Reader { : _offset(offset), _size(size) {} ~BaseFileReader() = default; zsize_t size() const override { return _size; }; + size_t getMemorySize() const override { return 0; }; offset_t offset() const override { return _offset; }; virtual const Buffer get_mmap_buffer(offset_t offset, diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 3d63f3e8f..46751a488 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -20,6 +20,7 @@ */ #include "dirent_lookup.h" +#include "zim/zim.h" #include "zim_types.h" #include #define CHUNK_SIZE 1024 @@ -31,7 +32,6 @@ #include "buffer_reader.h" #include #include -#include #include #include #include @@ -162,6 +162,11 @@ class Grouping } //unnamed namespace + ClusterCache& getClusterCache() { + static ClusterCache clusterCache(CLUSTER_CACHE_SIZE); + return clusterCache; + } + ////////////////////////////////////////////////////////////////////// // FileImpl 
// @@ -187,7 +192,6 @@ class Grouping : zimFile(_zimFile), zimReader(makeFileReader(zimFile)), direntReader(new DirentReader(zimReader)), - clusterCache(CLUSTER_CACHE_SIZE), m_hasFrontArticlesIndex(true), m_startUserEntry(0), m_endUserEntry(0), @@ -244,25 +248,41 @@ class Grouping const_cast(m_endUserEntry) = getCountArticles(); } - auto result = tmpDirentLookup.find('X', "listing/titleOrdered/v1"); - if (result.first) { - mp_titleDirentAccessor = getTitleAccessorV1(result.second); - } + // Following code will may create cluster and we want to remove them from cache + // if something goes wrong. + try { + auto result = tmpDirentLookup.find('X', "listing/titleOrdered/v1"); + if (result.first) { + mp_titleDirentAccessor = getTitleAccessorV1(result.second); + } - if (!mp_titleDirentAccessor) { - if (!header.hasTitleListingV0()) { - throw ZimFileFormatError("Zim file doesn't contain a title ordered index"); + if (!mp_titleDirentAccessor) { + if (!header.hasTitleListingV0()) { + throw ZimFileFormatError("Zim file doesn't contain a title ordered index"); + } + offset_t titleOffset(header.getTitleIdxPos()); + zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); + mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); + const_cast(m_hasFrontArticlesIndex) = false; } - offset_t titleOffset(header.getTitleIdxPos()); - zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); - mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); - const_cast(m_hasFrontArticlesIndex) = false; + m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); + + readMimeTypes(); + } catch (...) 
{ + dropCachedClusters(); + throw; } - m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); + } - readMimeTypes(); + FileImpl::~FileImpl() { + dropCachedClusters(); } + void FileImpl::dropCachedClusters() const { + getClusterCache().dropAll([=](const std::tuple& key) {return std::get<0>(key) == this;}); + } + + std::unique_ptr FileImpl::getTitleAccessorV1(const entry_index_t idx) { auto dirent = mp_pathDirentAccessor->getDirent(idx); @@ -465,19 +485,21 @@ class Grouping return entry_index_t(m_articleListByCluster[idx.v]); } - FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx) + ClusterHandle FileImpl::readCluster(cluster_index_t idx) { offset_t clusterOffset(getClusterOffset(idx)); log_debug("read cluster " << idx << " from offset " << clusterOffset); return Cluster::read(*zimReader, clusterOffset); } - std::shared_ptr FileImpl::getCluster(cluster_index_t idx) + ClusterHandle FileImpl::getCluster(cluster_index_t idx) { if (idx >= getCountClusters()) throw ZimFileFormatError("cluster index out of range"); - auto cluster = clusterCache.getOrPut(idx.v, [=](){ return readCluster(idx); }); + auto cluster_index_type = idx.v; + auto key = std::make_tuple(this, cluster_index_type); + auto cluster = getClusterCache().getOrPut(key, [=](){ return readCluster(idx); }); #if ENV32BIT // There was a bug in the way we create the zim files using ZSTD compression. // We were using a too hight compression level and so a window of 128Mb. @@ -492,7 +514,7 @@ class Grouping // 5.0 is not a perfect way to detect faulty zim file (it will generate false // positives) but it should be enough. 
if (header.getMajorVersion() == 5 && header.getMinorVersion() == 0) { - clusterCache.drop(idx.v); + getClusterCache().drop(key); } } #endif @@ -566,11 +588,11 @@ class Grouping struct zim_MD5_CTX md5ctx; zim_MD5Init(&md5ctx); - + unsigned char ch[CHUNK_SIZE]; offset_type checksumPos = header.getChecksumPos(); offset_type toRead = checksumPos; - + for(auto part = zimFile->begin(); part != zimFile->end(); part++) { @@ -580,7 +602,7 @@ class Grouping zim_MD5Update(&md5ctx, ch, CHUNK_SIZE); toRead-=CHUNK_SIZE; } - + // Previous read was good, so we have exited the previous `while` because // `toRead(ch),toRead); } - + // It updates the checksum with the remaining amount of data when we // reach the end of the file or part zim_MD5Update(&md5ctx, ch, stream.gcount()); toRead-=stream.gcount(); - + if (stream.bad()) { perror("error while reading file"); return false; @@ -790,17 +812,6 @@ bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type return true; } - - size_t FileImpl::getClusterCacheMaxSize() const { - return clusterCache.getMaxSize(); - } - size_t FileImpl::getClusterCacheCurrentSize() const { - return clusterCache.getCurrentSize(); - } - void FileImpl::setClusterCacheMaxSize(size_t nbClusters) { - clusterCache.setMaxSize(nbClusters); - } - size_t FileImpl::getDirentCacheMaxSize() const { return mp_pathDirentAccessor->getMaxCacheSize(); } diff --git a/src/fileimpl.h b/src/fileimpl.h index 45452fe37..17cae5aec 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -42,6 +43,11 @@ namespace zim { + class FileImpl; + typedef std::shared_ptr ClusterHandle; + typedef ConcurrentCache, ClusterHandle, ClusterMemorySize> ClusterCache; + ClusterCache& getClusterCache(); + class FileImpl { std::shared_ptr zimFile; @@ -54,9 +60,6 @@ namespace zim std::shared_ptr mp_pathDirentAccessor; std::unique_ptr mp_titleDirentAccessor; - typedef std::shared_ptr ClusterHandle; - ConcurrentCache 
clusterCache; - const bool m_hasFrontArticlesIndex; const entry_index_t m_startUserEntry; const entry_index_t m_endUserEntry; @@ -106,6 +109,7 @@ namespace zim explicit FileImpl(FdInput fd); explicit FileImpl(const std::vector& fds); #endif + ~FileImpl(); time_t getMTime() const; @@ -153,9 +157,6 @@ namespace zim bool checkIntegrity(IntegrityCheck checkType); - size_t getClusterCacheMaxSize() const; - size_t getClusterCacheCurrentSize() const; - void setClusterCacheMaxSize(size_t nbClusters); size_t getDirentCacheMaxSize() const; size_t getDirentCacheCurrentSize() const; void setDirentCacheMaxSize(size_t nbDirents); @@ -165,6 +166,8 @@ namespace zim explicit FileImpl(std::shared_ptr zimFile); FileImpl(std::shared_ptr zimFile, offset_t offset, zsize_t size); + void dropCachedClusters() const; + std::unique_ptr getTitleAccessorV1(const entry_index_t idx); std::unique_ptr getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name); diff --git a/src/istreamreader.h b/src/istreamreader.h index a0a5349b2..d2c937974 100644 --- a/src/istreamreader.h +++ b/src/istreamreader.h @@ -59,6 +59,9 @@ class LIBZIM_PRIVATE_API IStreamReader // Reads a blob of the specified size from the stream virtual std::unique_ptr sub_reader(zsize_t size); + // Get the total size occuped by the reader + virtual size_t getMemorySize() const = 0; + private: // virtual methods // Reads exactly 'nbytes' bytes into the provided buffer 'buf' // (which must be at least that big). Throws an exception if diff --git a/src/lrucache.h b/src/lrucache.h index 03a3644e6..7b768cf32 100644 --- a/src/lrucache.h +++ b/src/lrucache.h @@ -43,10 +43,39 @@ #include #include #include +#include +#include namespace zim { -template +struct UnitCostEstimation { + template + static size_t cost(const value_t& value) { + return 1; + } +}; + +/** + * A lru cache where the cost of each item can be different than 1. + * + * Most lru cache is limited by the number of items stored. 
+ * This implementation may have a different "size" per item, so the current size of + * this lru is not the number of item but the sum of all items' size. + * + * This implementation used is pretty simple (dumb) and have few limitations: + * - We consider than size of a item do not change over time. Especially the size of a + * item when we put it MUST be equal to the size of the same item when we drop it. + * - Cache eviction is still a Least Recently Used (LRU), so we drop the least used item(s) util + * we have enough space. No other consideration is used to select which item to drop. + * + * This lru is parametrized by a CostEstimation type. The type must have a static method `cost` + * taking a reference to a `value_t` and returing its "cost". As already said, this method must + * always return the same cost for the same value. + * + * While cost could be any kind of value, this implemention is intended to be used only with + * `UnitCostEstimation` (classic lru) and `FutureToValueCostEstimation`. 
+ */ +template class lru_cache { public: // types typedef typename std::pair key_value_pair_t; @@ -81,9 +110,10 @@ class lru_cache { }; public: // functions - explicit lru_cache(size_t max_size) : - _max_size(max_size) { - } + explicit lru_cache(size_t max_cost) : + _max_cost(max_cost), + _current_cost(0) + {} // If 'key' is present in the cache, returns the associated value, // otherwise puts the given value into the cache (and returns it with @@ -103,6 +133,8 @@ class lru_cache { auto it = _cache_items_map.find(key); if (it != _cache_items_map.end()) { _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + decreaseCost(CostEstimation::cost(it->second->second)); + increaseCost(CostEstimation::cost(value)); it->second->second = value; } else { putMissing(key, value); @@ -120,37 +152,87 @@ class lru_cache { } bool drop(const key_t& key) { + list_iterator_t list_it; try { - auto list_it = _cache_items_map.at(key); - _cache_items_list.erase(list_it); - _cache_items_map.erase(key); - return true; + list_it = _cache_items_map.at(key); } catch (std::out_of_range& e) { return false; } + decreaseCost(CostEstimation::cost(list_it->second)); + _cache_items_list.erase(list_it); + _cache_items_map.erase(key); + return true; + } + + template + void dropAll(F f) { + std::vector keys_to_drop; + for (auto key_iter:_cache_items_map) { + key_t key = key_iter.first; + if (f(key)) { + keys_to_drop.push_back(key); + } + } + + for(auto key:keys_to_drop) { + drop(key); + } } bool exists(const key_t& key) const { return _cache_items_map.find(key) != _cache_items_map.end(); } - size_t size() const { - return _cache_items_map.size(); + size_t cost() const { + return _current_cost; } - size_t getMaxSize() const { - return _max_size; + size_t getMaxCost() const { + return _max_cost; } - void setMaxSize(size_t newSize) { - while (newSize < this->size()) { + void setMaxCost(size_t newMaxCost) { + while (newMaxCost < this->cost()) { dropLast(); } - _max_size = 
newSize; + _max_cost = newMaxCost; + } + + protected: + + void increaseCost(size_t extra_cost) { + // increaseSize is called after we have added a value to the cache to update + // the size of the current cache. + // We must ensure that we don't drop the value we just added. + // While it is technically ok to keep no value if max cache size is 0 (or memory size < of the size of one cluster) + // it will make recreate the value all the time. + // Let's be nice with our user and be tolerent to misconfiguration. + if (!extra_cost) { + // Don't try to remove an item if we have new size == 0. + // This is the case when concurent cache add a future without value. + // We will handle the real increase size when concurent cache will directly call us. + return; + } + _current_cost += extra_cost; + while (_current_cost > _max_cost && size() > 1) { + dropLast(); + } + } + + void decreaseCost(size_t costToRemove) { + if (costToRemove > _current_cost) { + std::cerr << "WARNING: We have detected inconsistant cache management, trying to remove " << costToRemove << " from a cache with size " << _current_cost << std::endl; + std::cerr << "Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use" << std::endl; + _current_cost = 0; + } else { + _current_cost -= costToRemove; + } } private: // functions void dropLast() { + auto list_it = _cache_items_list.back(); + decreaseCost(CostEstimation::cost(list_it.second)); _cache_items_map.erase(_cache_items_list.back().first); _cache_items_list.pop_back(); } @@ -159,15 +241,19 @@ class lru_cache { assert(_cache_items_map.find(key) == _cache_items_map.end()); _cache_items_list.push_front(key_value_pair_t(key, value)); _cache_items_map[key] = _cache_items_list.begin(); - if (_cache_items_map.size() > _max_size) { - dropLast(); - } + increaseCost(CostEstimation::cost(value)); + } + + size_t size() const { + return _cache_items_map.size(); } + private: // data std::list _cache_items_list; 
std::map _cache_items_map; - size_t _max_size; + size_t _max_cost; + size_t _current_cost; }; } // namespace zim diff --git a/src/rawstreamreader.h b/src/rawstreamreader.h index 504f67e63..36c30f38c 100644 --- a/src/rawstreamreader.h +++ b/src/rawstreamreader.h @@ -35,6 +35,10 @@ class RawStreamReader : public IStreamReader m_readerPos(0) {} + size_t getMemorySize() const override { + return m_reader->getMemorySize(); + } + void readImpl(char* buf, zsize_t nbytes) override { m_reader->read(buf, m_readerPos, zsize_t(nbytes)); diff --git a/src/reader.h b/src/reader.h index 8a4dc655c..5fdf7de6b 100644 --- a/src/reader.h +++ b/src/reader.h @@ -36,6 +36,7 @@ class LIBZIM_PRIVATE_API Reader { public: Reader() {}; virtual zsize_t size() const = 0; + virtual size_t getMemorySize() const = 0; virtual ~Reader() {}; void read(char* dest, offset_t offset, zsize_t size) const { diff --git a/test/archive.cpp b/test/archive.cpp index 4d7c63ff1..571190c53 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -43,6 +43,18 @@ using zim::unittests::TempFile; using zim::unittests::TestItem; using zim::unittests::IsFrontArticle; +class ZimArchive: public testing::Test { + protected: + void SetUp() override { + zim::setClusterCacheMaxSize(0); + zim::setClusterCacheMaxSize(CLUSTER_CACHE_SIZE); + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); + } + void TearDown() override { + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); + } +}; + using TestContextImpl = std::vector >; struct TestContext : TestContextImpl { TestContext(const std::initializer_list& il) @@ -80,7 +92,7 @@ emptyZimArchiveContent() return content; } -TEST(ZimArchive, openingAnInvalidZimArchiveFails) +TEST_F(ZimArchive, openingAnInvalidZimArchiveFails) { const char* const prefixes[] = { "ZIM\x04", "" }; const unsigned char bytes[] = {0x00, 0x01, 0x11, 0x30, 0xFF}; @@ -101,7 +113,7 @@ TEST(ZimArchive, openingAnInvalidZimArchiveFails) } } -TEST(ZimArchive, openingAnEmptyZimArchiveSucceeds) +TEST_F(ZimArchive, 
openingAnEmptyZimArchiveSucceeds) { const auto tmpfile = makeTempFile("empty_zim_file", emptyZimArchiveContent()); @@ -122,7 +134,7 @@ bool isNastyOffset(int offset) { return true; } -TEST(ZimArchive, nastyEmptyZimArchive) +TEST_F(ZimArchive, nastyEmptyZimArchive) { const std::string correctContent = emptyZimArchiveContent(); for ( int offset = 0; offset < 80; ++offset ) { @@ -136,7 +148,7 @@ TEST(ZimArchive, nastyEmptyZimArchive) } } -TEST(ZimArchive, wrongChecksumInEmptyZimArchive) +TEST_F(ZimArchive, wrongChecksumInEmptyZimArchive) { std::string zimfileContent = emptyZimArchiveContent(); zimfileContent[85] = '\xff'; @@ -147,7 +159,7 @@ TEST(ZimArchive, wrongChecksumInEmptyZimArchive) } -TEST(ZimArchive, openCreatedArchive) +TEST_F(ZimArchive, openCreatedArchive) { TempFile temp("zimfile"); auto tempPath = temp.path(); @@ -245,7 +257,7 @@ TEST(ZimArchive, openCreatedArchive) } #if WITH_TEST_DATA -TEST(ZimArchive, openRealZimArchive) +TEST_F(ZimArchive, openRealZimArchive) { const char* const zimfiles[] = { "small.zim", @@ -266,7 +278,7 @@ TEST(ZimArchive, openRealZimArchive) } } -TEST(ZimArchive, openSplitZimArchive) +TEST_F(ZimArchive, openSplitZimArchive) { const char* fname = "wikibooks_be_all_nopic_2017-02_splitted.zim"; @@ -286,119 +298,237 @@ struct TestCacheConfig { size_t direntLookupCacheSize; }; +struct RefEntry { + void test_is_equal(const zim::Entry& entry) { + ASSERT_EQ(path, entry.getPath()); + ASSERT_EQ(title, entry.getTitle()); + ASSERT_EQ(isRedirect, entry.isRedirect()); + if (isRedirect) { + zim::entry_index_type redirectId = redirect_or_hash; + ASSERT_EQ(redirectId, entry.getRedirectEntryIndex()); + } else { + auto hash = std::hash{}(std::string(entry.getItem().getData())); + ASSERT_EQ(redirect_or_hash, hash); + } + } -#define ASSERT_ARCHIVE_EQUIVALENT(REF_ARCHIVE, TEST_ARCHIVE) \ - ASSERT_ARCHIVE_EQUIVALENT_LIMIT(REF_ARCHIVE, TEST_ARCHIVE, REF_ARCHIVE.getEntryCount()) + std::string path; + std::string title; + bool isRedirect; + // size_t is 
either 32 or 64 bits and entry_index_type (redirect id) is always 32 bits. + size_t redirect_or_hash; +}; -#define ASSERT_ARCHIVE_EQUIVALENT_LIMIT(REF_ARCHIVE, TEST_ARCHIVE, LIMIT) \ - { \ - auto range = REF_ARCHIVE.iterEfficient(); \ - auto ref_it = range.begin(); \ - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), TEST_ARCHIVE, LIMIT) \ +struct RefArchiveContent { + RefArchiveContent(const zim::Archive& archive) { + for (auto entry:archive.iterEfficient()) { + RefEntry ref_entry = { + entry.getPath(), + entry.getTitle(), + entry.isRedirect(), + entry.isRedirect() ? entry.getRedirectEntryIndex() : std::hash{}(std::string(entry.getItem().getData())), + }; + ref_entries.push_back(ref_entry); + } } - -#define ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(REF_IT, REF_END, TEST_ARCHIVE, LIMIT) \ - for (auto i = 0U; igetPath()); \ - ASSERT_EQ(REF_IT->getPath(), test_entry.getPath()); \ - ASSERT_EQ(REF_IT->getTitle(), test_entry.getTitle()); \ - ASSERT_EQ(REF_IT->isRedirect(), test_entry.isRedirect()); \ - if (REF_IT->isRedirect()) { \ - ASSERT_EQ(REF_IT->getRedirectEntryIndex(), test_entry.getRedirectEntryIndex()); \ - } \ - auto ref_item = REF_IT->getItem(true); \ - auto test_item = test_entry.getItem(true); \ - ASSERT_EQ(ref_item.getClusterIndex(), test_item.getClusterIndex()); \ - ASSERT_EQ(ref_item.getBlobIndex(), test_item.getBlobIndex()); \ - ASSERT_EQ(ref_item.getData(), test_item.getData()); \ + void test_is_equal(const zim::Archive& archive) { + for (auto ref_entry:ref_entries) { + auto entry = archive.getEntryByPath(ref_entry.path); + ref_entry.test_is_equal(entry); + } } + std::vector ref_entries; +}; -TEST(ZimArchive, cacheDontImpactReading) + +TEST_F(ZimArchive, cacheDontImpactReading) { const TestCacheConfig cacheConfigs[] = { {0, 0, 0}, - {1, 1, 1}, - {2, 2, 2}, - {10, 10, 10}, - {1000, 2000, 1000}, - {0, 2000, 1000}, - {1000, 0, 1000}, - {1000, 2000, 0}, - {1, 2000, 1000}, - {1000, 1, 1000}, - {1000, 2000, 1}, + {1, 1<<20, 1}, + {2, 2<<20, 2}, + {10, 
10<<20, 10}, + {1000, 2000<<20, 1000}, + {0, 2000<<20, 1000}, + {1000, 0<<20, 1000}, + {1000, 2000<<20, 0}, + {1, 2000<<20, 1000}, + {1000, 1<<20, 1000}, + {1000, 2000<<20, 1}, }; for (auto& testfile: getDataFilePath("small.zim")) { - auto ref_archive = zim::Archive(testfile.path); + RefArchiveContent ref_archive(zim::Archive(testfile.path)); for (auto cacheConfig: cacheConfigs) { auto test_archive = zim::Archive(testfile.path); test_archive.setDirentCacheMaxSize(cacheConfig.direntCacheSize); test_archive.setDirentLookupCacheMaxSize(cacheConfig.direntLookupCacheSize); - test_archive.setClusterCacheMaxSize(cacheConfig.clusterCacheSize); + zim::setClusterCacheMaxSize(cacheConfig.clusterCacheSize); EXPECT_EQ(test_archive.getDirentCacheMaxSize(), cacheConfig.direntCacheSize); EXPECT_EQ(test_archive.getDirentLookupCacheMaxSize(), cacheConfig.direntLookupCacheSize); - EXPECT_EQ(test_archive.getClusterCacheMaxSize(), cacheConfig.clusterCacheSize); + EXPECT_EQ(zim::getClusterCacheMaxSize(), cacheConfig.clusterCacheSize); - ASSERT_ARCHIVE_EQUIVALENT(ref_archive, test_archive) + ref_archive.test_is_equal(test_archive); } } } +TEST_F(ZimArchive, cacheClean) { + for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) { + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache + { + auto archive = zim::Archive(testfile.path); + auto range = archive.iterEfficient(); + auto it = range.begin(); + for (auto i = 0; i<50 && it != range.end(); i++, it++) { + // Be sure to search by path to populate the dirent cache + auto entry = archive.getEntryByPath(it->getPath()); + auto item = entry.getItem(true); + auto data = item.getData(); + } + EXPECT_GT(zim::getClusterCacheCurrentSize(), 0); // Some clusters are now in cache + } + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // No clusters in cache + } +} -TEST(ZimArchive, cacheChange) +TEST_F(ZimArchive, cacheChange) { - for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) { - auto 
ref_archive = zim::Archive(testfile.path); + // We test only one variant here. + // Each variant has clusters of different sizes (especially the old "withns" which + // have a cluster compressed with an algorithm/compression level making the input stream + // have a size of 64MB), + // this makes all the following reasoning about cluster size a bit too complex. + // As the test here doesn't test that we can read all variants, we don't have to. + for (auto& testfile: getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", {"noTitleListingV0"})) { + // wikibooks has only 2 clusters. One of 492121 bytes and one of 823716 bytes. + // For a total of 1315837 bytes. + // As we try to keep one cluster in the cache, any size under the size of one + // cluster will not be respected. + // So we will define 2 limits: + // 850<<10 : size higher than one cluster size but under the size of two + // 2 << 20 : size higher than two clusters + const size_t L1_SIZE = 850 << 10; + const size_t L2_SIZE = 2 << 20; + + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); + RefArchiveContent ref_archive(zim::Archive(testfile.path)); auto archive = zim::Archive(testfile.path); archive.setDirentCacheMaxSize(30); - archive.setClusterCacheMaxSize(5); + zim::setClusterCacheMaxSize(L2_SIZE); - auto range = ref_archive.iterEfficient(); - auto ref_it = range.begin(); - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) + auto ref_it = ref_archive.ref_entries.begin(); + for (auto i = 0; i<50 && ref_it != ref_archive.ref_entries.end(); i++, ref_it++) { + auto entry = archive.getEntryByPath(ref_it->path); + ref_it->test_is_equal(entry); + } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 30); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 2); // Only 2 clusters in the file + EXPECT_LE(zim::getClusterCacheCurrentSize(), L2_SIZE); // Only 2 clusters in the file // Reduce cache size archive.setDirentCacheMaxSize(10); - archive.setClusterCacheMaxSize(1); + zim::setClusterCacheMaxSize(L1_SIZE); 
EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); // We want to test change of cache while we are iterating on the archive. // So we don't reset the ref_it to `range.begin()`. - ASSERT_ARCHIVE_EQUIVALENT_IT_LIMIT(ref_it, range.end(), archive, 50) + for (auto i = 0; i<50 && ref_it != ref_archive.ref_entries.end(); i++, ref_it++) { + auto entry = archive.getEntryByPath(ref_it->path); + ref_it->test_is_equal(entry); + } EXPECT_EQ(archive.getDirentCacheCurrentSize(), 10); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); // Clean cache // (More than testing the value, this is needed as we want to be sure the cache is actually populated later) archive.setDirentCacheMaxSize(0); - archive.setClusterCacheMaxSize(0); + zim::setClusterCacheMaxSize(0); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); // Increase the cache archive.setDirentCacheMaxSize(20); - archive.setClusterCacheMaxSize(1); + zim::setClusterCacheMaxSize(L1_SIZE); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 0); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 0); + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); - ASSERT_ARCHIVE_EQUIVALENT(ref_archive, archive) + ref_archive.test_is_equal(archive); EXPECT_EQ(archive.getDirentCacheCurrentSize(), 20); - EXPECT_EQ(archive.getClusterCacheCurrentSize(), 1); + EXPECT_LE(zim::getClusterCacheCurrentSize(), L1_SIZE); + } +} + + +TEST_F(ZimArchive, MultiZimCache) +{ + // Get a list of several zim files to open (whatever the variant) + std::vector zimPaths; + const char* const zimfiles[] = { + "wikibooks_be_all_nopic_2017-02.zim", + "wikibooks_be_all_nopic_2017-02_splitted.zim", + "wikipedia_en_climate_change_mini_2024-06.zim" + }; + + for ( const std::string fname : zimfiles ) { + for 
(auto& testfile: getDataFilePath(fname)) { + zimPaths.push_back(testfile.path); + } + } + + + const size_t SMALL_LIMIT = 5 << 20; + const size_t BIG_LIMIT = 200 << 20; + zim::setClusterCacheMaxSize(BIG_LIMIT); + + std::vector archives; + for (auto path:zimPaths) { + auto archive = zim::Archive(path); + for (auto entry:archive.iterEfficient()) { + auto item = entry.getItem(true); + auto data = item.getData(); + } + archives.push_back(archive); + } + + EXPECT_LE(zim::getClusterCacheCurrentSize(), BIG_LIMIT); + zim::setClusterCacheMaxSize(SMALL_LIMIT); + EXPECT_LE(zim::getClusterCacheCurrentSize(), SMALL_LIMIT); + + // Opening an archive should increase the cluster cache + zim::setClusterCacheMaxSize(BIG_LIMIT); + auto current_limit = zim::getClusterCacheCurrentSize(); + { + auto archive = zim::Archive(zimPaths[0]); + for (auto entry:archive.iterEfficient()) { + auto item = entry.getItem(true); + auto data = item.getData(); + } + EXPECT_GT(zim::getClusterCacheCurrentSize(), current_limit); + current_limit = zim::getClusterCacheCurrentSize(); + } + // Destroying an archive should decrease the cluster cache + EXPECT_LT(zim::getClusterCacheCurrentSize(), current_limit); + + // Be sure that decreasing the number of archives open also decreases the + // current cache size, until we reach 0. 
+ current_limit = zim::getClusterCacheCurrentSize(); + while (!archives.empty()) { + archives.pop_back(); + EXPECT_LE(zim::getClusterCacheCurrentSize(), current_limit); + current_limit = zim::getClusterCacheCurrentSize(); } + EXPECT_EQ(zim::getClusterCacheCurrentSize(), 0); } -TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) +TEST_F(ZimArchive, openDontFallbackOnNonSplitZimArchive) { const char* fname = "wikibooks_be_all_nopic_2017-02.zim"; @@ -414,7 +544,7 @@ TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) } } -TEST(ZimArchive, openNonExistantZimArchive) +TEST_F(ZimArchive, openNonExistantZimArchive) { const std::string fname = "non_existant.zim"; @@ -427,7 +557,7 @@ TEST(ZimArchive, openNonExistantZimArchive) } } -TEST(ZimArchive, openNonExistantZimSplitArchive) +TEST_F(ZimArchive, openNonExistantZimSplitArchive) { const std::string fname = "non_existant.zimaa"; @@ -440,7 +570,7 @@ TEST(ZimArchive, openNonExistantZimSplitArchive) } } -TEST(ZimArchive, randomEntry) +TEST_F(ZimArchive, randomEntry) { const char* const zimfiles[] = { "wikibooks_be_all_nopic_2017-02.zim", @@ -465,7 +595,7 @@ TEST(ZimArchive, randomEntry) } } -TEST(ZimArchive, illustration) +TEST_F(ZimArchive, illustration) { const char* const zimfiles[] = { "small.zim", @@ -510,7 +640,7 @@ struct TestDataInfo { } }; -TEST(ZimArchive, articleNumber) +TEST_F(ZimArchive, articleNumber) { TestDataInfo zimfiles[] = { // Name mediaCount, withns nons noTitleListingV0 @@ -564,7 +694,8 @@ class CapturedStderr #define EXPECT_BROKEN_ZIMFILE(ZIMPATH, EXPECTED_STDERROR_TEXT) \ CapturedStderr stderror; \ EXPECT_FALSE(zim::validate(ZIMPATH, checksToRun)); \ - EXPECT_EQ(EXPECTED_STDERROR_TEXT, std::string(stderror)) << ZIMPATH; + EXPECT_EQ(EXPECTED_STDERROR_TEXT, std::string(stderror)) << ZIMPATH; \ + ASSERT_EQ(zim::getClusterCacheCurrentSize(), 0); #define TEST_BROKEN_ZIM_NAME(ZIMNAME, EXPECTED) \ for(auto& testfile: getDataFilePath(ZIMNAME)) {EXPECT_BROKEN_ZIMFILE(testfile.path, EXPECTED)} @@ -576,7 
+707,7 @@ for(auto& testfile: getDataFilePath(ZIMNAME, CAT)) {EXPECT_BROKEN_ZIMFILE(testfi #define WITH_TITLE_IDX_CAT {"withns", "nons"} #if WITH_TEST_DATA -TEST(ZimArchive, validate) +TEST_F(ZimArchive, validate) { zim::IntegrityCheckList all; all.set(); @@ -774,7 +905,7 @@ void checkEquivalence(const zim::Archive& archive1, const zim::Archive& archive2 #endif } -TEST(ZimArchive, multipart) +TEST_F(ZimArchive, multipart) { auto nonSplittedZims = getDataFilePath("wikibooks_be_all_nopic_2017-02.zim"); auto splittedZims = getDataFilePath("wikibooks_be_all_nopic_2017-02_splitted.zim"); @@ -803,7 +934,7 @@ TEST(ZimArchive, multipart) #endif #ifndef _WIN32 -TEST(ZimArchive, openByFD) +TEST_F(ZimArchive, openByFD) { for(auto& testfile: getDataFilePath("small.zim")) { const zim::Archive archive1(testfile.path); @@ -814,7 +945,7 @@ TEST(ZimArchive, openByFD) } } -TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) +TEST_F(ZimArchive, openZIMFileEmbeddedInAnotherFile) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded"); @@ -830,7 +961,7 @@ TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile) } } -TEST(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) +TEST_F(ZimArchive, openZIMFileMultiPartEmbeddedInAnotherFile) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded.multi"); @@ -873,7 +1004,7 @@ zim::Blob readItemData(const zim::Item::DirectAccessInfo& dai, zim::size_type si return zim::Blob(data, size); } -TEST(ZimArchive, getDirectAccessInformation) +TEST_F(ZimArchive, getDirectAccessInformation) { for(auto& testfile:getDataFilePath("small.zim")) { const zim::Archive archive(testfile.path); @@ -894,7 +1025,7 @@ TEST(ZimArchive, getDirectAccessInformation) } #ifndef _WIN32 -TEST(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) +TEST_F(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) { for(auto& testfile:getDataFilePath("small.zim")) { const 
int fd = OPEN_READ_ONLY(testfile.path); @@ -915,7 +1046,7 @@ TEST(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD) } } -TEST(ZimArchive, getDirectAccessInformationFromEmbeddedArchive) +TEST_F(ZimArchive, getDirectAccessInformationFromEmbeddedArchive) { auto normalZims = getDataFilePath("small.zim"); auto embeddedZims = getDataFilePath("small.zim.embedded"); diff --git a/test/istreamreader.cpp b/test/istreamreader.cpp index a96b59156..78b2f1135 100644 --- a/test/istreamreader.cpp +++ b/test/istreamreader.cpp @@ -34,18 +34,25 @@ using namespace zim; // Implement the IStreamReader interface in the simplest way class InfiniteZeroStream : public IStreamReader { - void readImpl(char* buf, zim::zsize_t nbytes) { memset(buf, 0, nbytes.v); } + void readImpl(char* buf, zim::zsize_t nbytes) override { memset(buf, 0, nbytes.v); } + size_t getMemorySize() const override { + return 0; + } }; class InfiniteIncreasingStream: public IStreamReader { zim::offset_type current_offset = 0; - void readImpl(char* buf, zim::zsize_t nbytes) { + void readImpl(char* buf, zim::zsize_t nbytes) override { for (size_type i=0; iget_buffer(zim::offset_t(0), zim::zsize_t(N)); EXPECT_EQ(buffer.size().v, N); EXPECT_EQ(0, memcmp(buffer.data(), refbuf, N)); - + buffer = subReader->get_buffer(zim::offset_t(5), zim::zsize_t(N-5)); EXPECT_EQ(buffer.size().v, N-5); EXPECT_EQ(0, memcmp(buffer.data(), refbuf+5, N-5)); diff --git a/test/lrucache.cpp b/test/lrucache.cpp index 67dba943a..e43b914c5 100644 --- a/test/lrucache.cpp +++ b/test/lrucache.cpp @@ -38,47 +38,110 @@ const unsigned int TEST2_CACHE_CAPACITY = 50u; const unsigned int TEST2_CACHE_CAPACITY_SMALL = 10u; TEST(CacheTest, SimplePut) { - zim::lru_cache cache_lru(1); + zim::lru_cache cache_lru(1); cache_lru.put(7, 777); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(777, cache_lru.get(7)); - EXPECT_EQ(1u, cache_lru.size()); + EXPECT_EQ(1u, cache_lru.cost()); } TEST(CacheTest, OverwritingPut) { - zim::lru_cache cache_lru(1); + zim::lru_cache 
cache_lru(1); cache_lru.put(7, 777); cache_lru.put(7, 222); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(222, cache_lru.get(7)); - EXPECT_EQ(1u, cache_lru.size()); + EXPECT_EQ(1u, cache_lru.cost()); } TEST(CacheTest, MissingValue) { - zim::lru_cache cache_lru(1); + zim::lru_cache cache_lru(1); EXPECT_TRUE(cache_lru.get(7).miss()); EXPECT_FALSE(cache_lru.get(7).hit()); EXPECT_THROW(cache_lru.get(7).value(), std::range_error); } TEST(CacheTest, DropValue) { - zim::lru_cache cache_lru(3); + zim::lru_cache cache_lru(3); cache_lru.put(7, 777); cache_lru.put(8, 888); cache_lru.put(9, 999); - EXPECT_EQ(3u, cache_lru.size()); + EXPECT_EQ(3u, cache_lru.cost()); EXPECT_TRUE(cache_lru.exists(7)); EXPECT_EQ(777, cache_lru.get(7)); EXPECT_TRUE(cache_lru.drop(7)); - EXPECT_EQ(2u, cache_lru.size()); + EXPECT_EQ(2u, cache_lru.cost()); EXPECT_FALSE(cache_lru.exists(7)); EXPECT_THROW(cache_lru.get(7).value(), std::range_error); EXPECT_FALSE(cache_lru.drop(7)); } +struct IdCost { + static size_t cost(size_t value ) { + return value; + } +}; + +TEST(CacheTest, VariableCost) { + zim::lru_cache cache_lru(100); + + cache_lru.put(1, 11); + cache_lru.put(2, 22); + cache_lru.put(3, 33); + EXPECT_EQ(66u, cache_lru.cost()); + + cache_lru.put(4, 44); + EXPECT_EQ(99u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_TRUE(cache_lru.exists(2)); + EXPECT_TRUE(cache_lru.exists(3)); + EXPECT_TRUE(cache_lru.exists(4)); + + cache_lru.put(5, 55); + EXPECT_EQ(99u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); + EXPECT_TRUE(cache_lru.exists(4)); + EXPECT_TRUE(cache_lru.exists(5)); + + cache_lru.put(1, 11); + EXPECT_EQ(66u, cache_lru.cost()); + EXPECT_TRUE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); + EXPECT_FALSE(cache_lru.exists(4)); + EXPECT_TRUE(cache_lru.exists(5)); +} + +TEST(CacheTest, TooBigValue) { + zim::lru_cache cache_lru(10); + + 
cache_lru.put(1, 11); + EXPECT_EQ(11u, cache_lru.cost()); + EXPECT_TRUE(cache_lru.exists(1)); + + cache_lru.put(2, 22); + EXPECT_EQ(22u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_TRUE(cache_lru.exists(2)); + + cache_lru.put(3, 33); + EXPECT_EQ(33u, cache_lru.cost()); + EXPECT_FALSE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_TRUE(cache_lru.exists(3)); + + cache_lru.put(1, 11); + EXPECT_EQ(11u, cache_lru.cost()); + EXPECT_TRUE(cache_lru.exists(1)); + EXPECT_FALSE(cache_lru.exists(2)); + EXPECT_FALSE(cache_lru.exists(3)); +} + #define EXPECT_RANGE_MISSING_FROM_CACHE(CACHE, START, END) \ for (unsigned i = START; i < END; ++i) { \ EXPECT_FALSE(CACHE.exists(i)); \ @@ -93,7 +156,7 @@ for (unsigned i = START; i < END; ++i) { \ TEST(CacheTest1, KeepsAllValuesWithinCapacity) { - zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); + zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, i); @@ -103,37 +166,37 @@ TEST(CacheTest1, KeepsAllValuesWithinCapacity) { EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1) - size_t size = cache_lru.size(); + size_t size = cache_lru.cost(); EXPECT_EQ(TEST2_CACHE_CAPACITY, size); } TEST(CacheTest1, ChangeCacheCapacity) { - zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); + zim::lru_cache cache_lru(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, i); } - EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.size()); + EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.cost()); EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1) - cache_lru.setMaxSize(TEST2_CACHE_CAPACITY_SMALL); - EXPECT_EQ(TEST2_CACHE_CAPACITY_SMALL, cache_lru.size()); + cache_lru.setMaxCost(TEST2_CACHE_CAPACITY_SMALL); + 
EXPECT_EQ(TEST2_CACHE_CAPACITY_SMALL, cache_lru.cost()); EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY_SMALL)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY_SMALL), NUM_OF_TEST2_RECORDS, 1) - cache_lru.setMaxSize(TEST2_CACHE_CAPACITY); + cache_lru.setMaxCost(TEST2_CACHE_CAPACITY); for (int i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { cache_lru.put(i, 1000*i); } - EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.size()); + EXPECT_EQ(TEST2_CACHE_CAPACITY, cache_lru.cost()); EXPECT_RANGE_MISSING_FROM_CACHE(cache_lru, 0, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY)) EXPECT_RANGE_FULLY_IN_CACHE(cache_lru, (NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY), NUM_OF_TEST2_RECORDS, 1000) } TEST(ConcurrentCacheTest, handleException) { - zim::ConcurrentCache cache(1); + zim::ConcurrentCache cache(1); auto val = cache.getOrPut(7, []() { return 777; }); EXPECT_EQ(val, 777); EXPECT_THROW(cache.getOrPut(8, []() { throw std::runtime_error("oups"); return 0; }), std::runtime_error);