Add explanatory comments.

groutr · groutr · commit 78cf39b9dd09 · 2024-10-22T14:37:55.000-06:00
diff --git a/fsspec/caching.py b/fsspec/caching.py
@@ -164,19 +164,35 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
         start_block = start // self.blocksize
         end_block = end // self.blocksize
         block_range = range(start_block, end_block + 1)
+        # Determine which blocks need to be fetched. This sequence is sorted by construction.
         need = (i for i in block_range if i not in self.blocks)
+        # Count the number of blocks already cached
         self.hit_count += sum(1 for i in block_range if i in self.blocks)
 
         # Consolidate needed blocks.
-        # Algorithm adapted from Python 2.x itertools documentation
+        # Algorithm adapted from Python 2.x itertools documentation.
+        # We are grouping an enumerated sequence of blocks. By comparing when the difference
+        # between an ascending range (provided by enumerate) and the needed block numbers
+        # we can detect when the block number skips values. The key computes this difference.
+        # Whenever the difference changes, we know that we have previously cached block(s),
+        # and a new group is started. In other words, this algorithm neatly groups
+        # runs of consecutive block numbers so they can be fetched together.
         for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
+            # Extract the blocks from the enumerated sequence
             _blocks = tuple(map(itemgetter(1), _blocks))
+            # Compute start of first block
             sstart = _blocks[0] * self.blocksize
+            # Compute the end of the last block. Last block may not be full size.
             send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
+
+            # Fetch bytes (could be multiple consecutive blocks)
             self.total_requested_bytes += send - sstart
             logger.debug(f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})")
             self.cache[sstart:send] = self.fetcher(sstart, send)
+
+            # Update set of cached blocks
             self.blocks.update(_blocks)
+            # Update cache statistics with number of blocks we had to cache
             self.miss_count += len(_blocks)
 
         return self.cache[start:end]