Commit 76374b0

parallel missing hash collection (#19552)
* parallel missing hash collection
* more
* breakdown
* update chia_rs
1 parent 75758c9 commit 76374b0
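
In outline (an editor's summary of the diffs below): missing-hash collection from merkle blobs now runs on worker threads via anyio.to_thread.run_sync; build_merkle_blob_queries_for_missing_hashes is slimmed down to the batched lookup, with the generation-walking fallback moved into a new build_cache_and_collect_missing_hashes; and that fallback scans old generations in windows of up to ten per iteration instead of one at a time. A sketch of just the windowing arithmetic (generation_windows is illustrative, not a function in the commit):

    def generation_windows(current: int, target: int, batch_size: int = 10) -> list[range]:
        # mirrors: range(current_generation, min(current_generation + batch_size, root.generation))
        windows = []
        while current < target:
            window = range(current, min(current + batch_size, target))
            windows.append(window)
            current = window[-1] + 1  # the diff's generations[-1] + 1
        return windows

    # three windows cover generations 0..24; the real loop also stops early
    # once delta_reader.get_missing_hashes() comes back empty
    assert generation_windows(0, 25) == [range(0, 10), range(10, 20), range(20, 25)]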

File tree

chia/data_layer/data_store.py
poetry.lock

2 files changed: +98 -58 lines changed

chia/data_layer/data_store.py

Lines changed: 97 additions & 57 deletions
@@ -7,14 +7,15 @@
 import shutil
 import sqlite3
 from collections import defaultdict
-from collections.abc import AsyncIterator, Awaitable, Collection
+from collections.abc import AsyncIterator, Awaitable, Sequence
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, replace
 from hashlib import sha256
 from pathlib import Path
 from typing import Any, BinaryIO, Callable, Optional, Union
 
 import aiosqlite
+import anyio.to_thread
 import chia_rs.datalayer
 import zstd
 from chia_rs.datalayer import (
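
A note on the import change (my reading, not stated in the commit): Collection gives way to Sequence because the new code annotates generations as Sequence[int] and indexes it with generations[-1]; Sequence guarantees positional access, Collection does not. A minimal illustration:

    from collections.abc import Sequence

    def last_generation(generations: Sequence[int]) -> int:
        # Sequence provides __getitem__, so generations[-1] type-checks;
        # Collection offers only iteration, containment, and len()
        return generations[-1]

    print(last_generation(range(5, 9)))  # 8; a range is a Sequence[int]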
@@ -211,15 +212,14 @@ async def insert_into_data_store_from_file(
 
         if len(missing_hashes) > 0:
             # TODO: consider adding transactions around this code
-            merkle_blob_queries = await self.build_merkle_blob_queries_for_missing_hashes(
-                missing_hashes, root, store_id
-            )
-
-            # TODO: consider parallel collection
-            for old_root_hash, indexes in merkle_blob_queries.items():
-                delta_reader.collect_from_merkle_blob(
-                    self.get_merkle_path(store_id=store_id, root_hash=old_root_hash), indexes=indexes
-                )
+            merkle_blob_queries = await self.build_merkle_blob_queries_for_missing_hashes(missing_hashes, store_id)
+            if len(merkle_blob_queries) > 0:
+                jobs = [
+                    (self.get_merkle_path(store_id=store_id, root_hash=old_root_hash), indexes)
+                    for old_root_hash, indexes in merkle_blob_queries.items()
+                ]
+                await anyio.to_thread.run_sync(delta_reader.collect_from_merkle_blobs, jobs)
+            await self.build_cache_and_collect_missing_hashes(root, store_id, delta_reader)
 
         merkle_blob, _ = delta_reader.create_merkle_blob(root_hash, set())
 
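
A design note on this hunk (editorial commentary): the old loop called collect_from_merkle_blob once per root hash directly on the event loop, blocking it for the duration of every read; the new code resolves all the (path, indexes) pairs up front and crosses into a worker thread exactly once through the plural collect_from_merkle_blobs. A runnable contrast of the two shapes, with read_blob and read_blobs as hypothetical stand-ins for the blocking work:

    import anyio
    import anyio.to_thread

    def read_blob(path: str) -> int:
        return len(path)  # placeholder for one blocking merkle-blob read

    def read_blobs(paths: list[str]) -> list[int]:
        # the worker thread iterates the whole jobs list: one thread hop total
        return [read_blob(path) for path in paths]

    async def main() -> None:
        paths = ["a.dat", "bb.dat", "ccc.dat"]
        blocked = [read_blob(path) for path in paths]  # old shape: runs on the event loop
        offloaded = await anyio.to_thread.run_sync(read_blobs, paths)  # new shape
        assert blocked == offloaded == [5, 6, 7]

    anyio.run(main)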
@@ -228,63 +228,103 @@ async def insert_into_data_store_from_file(
 
     async def build_merkle_blob_queries_for_missing_hashes(
         self,
-        missing_hashes: Collection[bytes32],
-        root: Root,
+        missing_hashes: set[bytes32],
         store_id: bytes32,
     ) -> defaultdict[bytes32, list[TreeIndex]]:
         queries = defaultdict[bytes32, list[TreeIndex]](list)
 
-        if missing_hashes:
-            async with self.db_wrapper.reader() as reader:
-                cursor = await reader.execute(
-                    # TODO: the INDEXED BY seems like it shouldn't be needed, figure out why it is
-                    # https://sqlite.org/lang_indexedby.html: admonished to omit all use of INDEXED BY
-                    # https://sqlite.org/queryplanner-ng.html#howtofix
-                    "SELECT MAX(generation) FROM nodes INDEXED BY nodes_generation_index WHERE store_id = ?",
-                    (store_id,),
-                )
-                row = await cursor.fetchone()
-                if row is None or row[0] is None:
-                    current_generation = 0
-                else:
-                    current_generation = row[0]
-
         batch_size = min(500, SQLITE_MAX_VARIABLE_NUMBER - 10)
 
+        async with self.db_wrapper.reader() as reader:
+            for batch in to_batches(missing_hashes, batch_size):
+                placeholders = ",".join(["?"] * len(batch.entries))
+                query = f"""
+                    SELECT hash, root_hash, idx
+                    FROM nodes
+                    WHERE store_id = ? AND hash IN ({placeholders})
+                    LIMIT {len(batch.entries)}
+                """
+
+                async with reader.execute(query, (store_id, *batch.entries)) as cursor:
+                    rows = await cursor.fetchall()
+                    for row in rows:
+                        root_hash_blob = bytes32(row["root_hash"])
+                        index = TreeIndex(row["idx"])
+                        queries[root_hash_blob].append(index)
+
+        return queries
+
+    async def build_cache_and_collect_missing_hashes(
+        self,
+        root: Root,
+        store_id: bytes32,
+        delta_reader: DeltaReader,
+    ) -> None:
+        missing_hashes = delta_reader.get_missing_hashes()
+
+        if len(missing_hashes) == 0:
+            return
+
+        async with self.db_wrapper.reader() as reader:
+            cursor = await reader.execute(
+                # TODO: the INDEXED BY seems like it shouldn't be needed, figure out why it is
+                # https://sqlite.org/lang_indexedby.html: admonished to omit all use of INDEXED BY
+                # https://sqlite.org/queryplanner-ng.html#howtofix
+                "SELECT MAX(generation) FROM nodes INDEXED BY nodes_generation_index WHERE store_id = ?",
+                (store_id,),
+            )
+            generation_row = await cursor.fetchone()
+            if generation_row is None or generation_row[0] is None:
+                current_generation = 0
+            else:
+                current_generation = generation_row[0]
+        generations: Sequence[int] = [current_generation]
+
         while missing_hashes:
-            found_hashes: set[bytes32] = set()
-            async with self.db_wrapper.reader() as reader:
-                for batch in to_batches(missing_hashes, batch_size):
-                    placeholders = ",".join(["?"] * len(batch.entries))
-                    query = f"""
-                        SELECT hash, root_hash, idx
-                        FROM nodes
-                        WHERE store_id = ? AND hash IN ({placeholders})
-                        LIMIT {len(batch.entries)}
-                    """
+            if current_generation >= root.generation:
+                raise Exception("Invalid delta file, cannot find all the required hashes")
 
-                    async with reader.execute(query, (store_id, *batch.entries)) as cursor:
-                        rows = await cursor.fetchall()
-                        for row in rows:
-                            node_hash = bytes32(row["hash"])
-                            root_hash_blob = bytes32(row["root_hash"])
-                            index = TreeIndex(row["idx"])
-                            if node_hash in found_hashes:
-                                raise Exception("Internal error: duplicate node_hash found in nodes table")
-                            queries[root_hash_blob].append(index)
-                            found_hashes.add(node_hash)
-
-            missing_hashes = [hash for hash in missing_hashes if hash not in found_hashes]
-            if missing_hashes:
-                if current_generation < root.generation:
-                    current_generation += 1
-                else:
-                    raise Exception("Invalid delta file, cannot find all the required hashes")
+            current_generation = generations[-1] + 1
 
-            await self.add_node_hashes(store_id, current_generation)
-            log.info(f"Missing hashes: added old hashes from generation {current_generation}")
+            # TODO: at least shouldn't be hard coded
+            batch_size = 10
+            generations = range(
+                current_generation,
+                min(current_generation + batch_size, root.generation),
+            )
+            jobs: list[tuple[bytes32, Path]] = []
+            generations_by_root_hash: dict[bytes32, int] = {}
+            for generation in generations:
+                generation_root = await self.get_tree_root(store_id=store_id, generation=generation)
+                if generation_root.node_hash is None:
+                    # no need to process an empty generation
+                    continue
+                path = self.get_merkle_path(store_id=store_id, root_hash=generation_root.node_hash)
+                jobs.append((generation_root.node_hash, path))
+                generations_by_root_hash[generation_root.node_hash] = generation
+
+            found = await anyio.to_thread.run_sync(
+                delta_reader.collect_and_return_from_merkle_blobs,
+                jobs,
+                missing_hashes,
+            )
+            async with self.db_wrapper.writer() as writer:
+                await writer.executemany(
+                    """
+                    INSERT
+                    OR IGNORE INTO nodes(store_id, hash, root_hash, generation, idx)
+                    VALUES (?, ?, ?, ?, ?)
+                    """,
+                    (
+                        (store_id, hash, root_hash, generations_by_root_hash[root_hash], index.raw)
+                        for root_hash, map in found
+                        for hash, index in map.items()
+                    ),
+                )
 
-        return queries
+            missing_hashes = delta_reader.get_missing_hashes()
+
+            log.info(f"Missing hashes: added old hashes from generation {current_generation}")
 
     async def read_from_file(
         self, filename: Path, store_id: bytes32
poetry.lock

Lines changed: 1 addition & 1 deletion
@@ -827,7 +827,7 @@ typing-extensions = "*"
 type = "git"
 url = "https://github.com/chia-network/chia_rs"
 reference = "long_lived/initial_datalayer"
-resolved_reference = "4fe774fe2149d550776f26c7380a6aefb74407f6"
+resolved_reference = "14cd09555b53b210ac45c720d4eb634af6d8c769"
 subdirectory = "wheel/"
 
 [[package]]
