|
31 | 31 |
|
32 | 32 | #include "log.h" |
33 | 33 |
|
34 | | -#include "config.h" |
35 | | - |
36 | 34 | log_define("zim.cluster") |
37 | 35 |
|
38 | 36 | #define log_debug1(e) |
@@ -190,20 +188,27 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* |
190 | 188 | // - The stream itself may allocate memory. |
191 | 189 | // To solve this, we take the average and say a cluster's blob readers will half be created and |
192 | 190 | // so we assume a readers size of half the full uncompressed cluster data size. |
| 191 | +// If cluster is not compressed, we never store its content (mmap is created on demand and not cached), |
| 192 | +// so we use a size of 0 for the readers. |
193 | 193 | // It also appears that when we get the size of the stream, we reach a state where no |
194 | 194 | // further allocation will be done by it. Probably because:
195 | | -// - We already started to decompresse the stream to read the offsets |
| 195 | +// - We already started to decompress the stream to read the offsets |
196 | 196 | // - Cluster data size is smaller than window size associated to compression level (?) |
197 | 197 | // We anyway check that and print a warning if this is not the case, hoping that users will create
198 | 198 | // an issue allowing us for further analysis. |
199 | 199 | size_t zim::ClusterMemorySize::get_cluster_size(const Cluster& cluster) { |
200 | 200 | if (!cluster.m_memorySize) { |
201 | | - auto base_struct = sizeof(Cluster); |
202 | 201 | auto offsets_size = sizeof(offset_t) * cluster.m_blobOffsets.size(); |
203 | | - auto readers_size = cluster.m_blobOffsets.back().v / 2; |
| 202 | + auto readers_size = 0; |
| 203 | + if (cluster.isCompressed()) { |
| 204 | + readers_size = cluster.m_blobOffsets.back().v / 2; |
| 205 | + } |
204 | 206 | cluster.m_streamSize = cluster.m_reader->getMemorySize(); |
205 | | - cluster.m_memorySize = base_struct + offsets_size + readers_size + cluster.m_streamSize; |
206 | | - std::cout << cluster.m_memorySize << " = base:" << base_struct <<" offsets:" << offsets_size <<" readers:" << readers_size <<" input:" << cluster.m_streamSize << std::endl; |
| 207 | +// Compression level defines a huge window and makes the decompression stream allocate a huge amount of memory to store it.
| 208 | + // However, the used memory will not be greater than the content itself, even if window is bigger. |
| 209 | + // On linux (at least), the real used memory will be the actual memory used, not the one allocated. |
| 210 | +// So, let's clamp the stream size to the size of the content itself.
| 211 | + cluster.m_memorySize = offsets_size + readers_size + std::min(cluster.m_streamSize, cluster.m_blobOffsets.back().v); |
207 | 212 | } |
208 | 213 | auto streamSize = cluster.m_reader->getMemorySize(); |
209 | 214 | if (streamSize != cluster.m_streamSize) { |
|
0 commit comments