@@ -178,42 +178,49 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression*
178178 }
179179 }
180180
181- }
182-
183- // This function must return a constant size for a given cluster.
184- // This is important as we want to remove the same size that what we add when we remove
185- // the cluster from the cache.
186- // However, because of partial decompression, this size can change:
187- // - As we advance in the compression, we can create new blob readers in `m_blobReaders`
188- // - The stream itself may allocate memory.
189- // To solve this, we take the average and say a cluster's blob readers will half be created and
190- // so we assume a readers size of half the full uncompressed cluster data size.
191- // If cluster is not compressed, we never store its content (mmap is created on demand and not cached),
192- // so we use a size of 0 for the readers.
193- // It also appears that when we get the size of the stream, we reach a state where no
194- // futher allocation will be done by it. Probably because:
195- // - We already started to decompress the stream to read the offsets
196- // - Cluster data size is smaller than window size associated to compression level (?)
197- // We anyway check that and print a warning if this is not the case, hopping that user will create
198- // an issue allowing us for further analysis.
199- size_t zim::ClusterMemorySize::get_cluster_size (const Cluster& cluster) {
200- if (!cluster.m_memorySize ) {
201- auto offsets_size = sizeof (offset_t ) * cluster.m_blobOffsets .size ();
202- auto readers_size = 0 ;
203- if (cluster.isCompressed ()) {
204- readers_size = cluster.m_blobOffsets .back ().v / 2 ;
181+ // This function must return a constant size for a given cluster.
182+ // This is important as we want to remove the same size as what we add when we remove
183+ // the cluster from the cache.
184+ // However, because of partial decompression, this size can change:
185+ // - As we advance in the compression, we can create new blob readers in `m_blobReaders`
186+ // - The stream itself may allocate memory.
187+ // To solve this, we take the average and assume that half of a cluster's blob readers will be created, and
188+ // so we assume a readers size of half the full uncompressed cluster data size.
189+ // If cluster is not compressed, we never store its content (mmap is created on demand and not cached),
190+ // so we use a size of 0 for the readers.
191+ // It also appears that when we get the size of the stream, we reach a state where no
192+ // further allocation will be done by it. Probably because:
193+ // - We already started to decompress the stream to read the offsets
194+ // - Cluster data size is smaller than window size associated to compression level (?)
195+ // We anyway check that and print a warning if this is not the case, hoping that the user will create
196+ // an issue allowing us for further analysis.
197+ // Note:
198+ // - No need to protect this method from concurrent access as it will be called by the concurrent cache which will
199+ // have a lock (on lru cache) to ensure only one thread accesses it at the same time.
200+ size_t Cluster::getMemorySize () const {
201+ if (!m_memorySize) {
202+ auto offsets_size = sizeof (offset_t ) * m_blobOffsets.size ();
203+ auto readers_size = 0 ;
204+ if (isCompressed ()) {
205+ readers_size = m_blobOffsets.back ().v / 2 ;
206+ }
207+ m_streamSize = m_reader->getMemorySize ();
208+ // Compression level defines a huge window and makes the decompression stream allocate a huge memory to store it.
209+ // However, the used memory will not be greater than the content itself, even if window is bigger.
210+ // On linux (at least), the real used memory will be the actual memory used, not the one allocated.
211+ // So, let's clamp the stream size to the size of the content itself.
212+ m_memorySize = offsets_size + readers_size + std::min (m_streamSize, m_blobOffsets.back ().v );
205213 }
206- cluster. m_streamSize = cluster. m_reader ->getMemorySize ();
207- // Compression level define a huge window and make decompression stream allocate a huge memory to store it.
208- // However, the used memory will not be greater than the content itself, even if window is bigger.
209- // On linux (at least), the real used memory will be the actual memory used, not the one allocated.
210- // So, let's clamm the the stream size to the size of the content itself.
211- cluster. m_memorySize = offsets_size + readers_size + std::min (cluster. m_streamSize , cluster. m_blobOffsets . back (). v ) ;
214+ auto streamSize = m_reader->getMemorySize ();
215+ if (streamSize != m_streamSize) {
216+ std::cerr << " WARNING: stream size have changed from " << m_streamSize << " to " << streamSize << std::endl;
217+ std::cerr << " Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use " << std::endl;
218+ }
219+ return m_memorySize ;
212220 }
213- auto streamSize = cluster.m_reader ->getMemorySize ();
214- if (streamSize != cluster.m_streamSize ) {
215- std::cerr << " WARNING: stream size have changed from " << cluster.m_streamSize << " to " << streamSize << std::endl;
216- std::cerr << " Please open an issue on https://github.com/openzim/libzim/issues with this message and the zim file you use" << std::endl;
221+
222+ size_t ClusterMemorySize::get_cluster_size (const Cluster& cluster) {
223+ return cluster.getMemorySize ();
217224 }
218- return cluster. m_memorySize ;
225+
219226}
0 commit comments