@@ -14,6 +14,30 @@ import (
1414 "github.com/onflow/flow-go/storage/operation"
1515)
1616
17+ // ChunkDataPacks manages storage and retrieval of ChunkDataPacks, primarily serving the use case of EXECUTION NODES persisting
18+ // and indexing chunk data packs for their OWN RESULTS. Essentially, the chunk describes a batch of work to be done, and the
19+ // chunk data pack describes the result of that work. The storage of chunk data packs is segregated across different
20+ // storage components for efficiency and modularity reasons:
21+ // 0. Usually (ignoring the system chunk for a moment), the batch of work is given by the collection referenced in the chunk
22+ // data pack. For any chunk data pack being stored, we assume that the executed collection has *previously* been persisted
23+ // in [storage.Collections]. It is useful to persist the collections individually, so we can individually retrieve them.
24+ // 1. The actual chunk data pack itself is stored in a dedicated storage component `cdpStorage`. Note that for this storage
25+ // component, no atomicity is required, as we are storing chunk data packs by their collision-resistant hashes, so
26+ // different chunk data packs will be stored under different keys.
27+ // Theoretically, nodes could persist multiple different (disagreeing) chunk data packs for the same
28+ // chunk in this step. However, for efficiency, Execution Nodes only store their own chunk data packs.
29+ // 2. The index mapping from ChunkID to chunkDataPackID is stored in the protocol database for fast retrieval.
30+ // This index is intended to be populated by execution nodes when they commit to a specific result represented by the chunk
31+ // data pack. Here, we require atomicity, as an execution node should not be changing / overwriting which chunk data pack
32+ // it committed to (during normal operations).
33+ //
34+ // Since the executed collections are stored separately (step 0, above), we can just use the collection ID in the context of the
35+ // chunk data pack storage (step 1, above). Therefore, we utilize the reduced representation [storage.StoredChunkDataPack]
36+ // internally. While this removes redundant data from storage, it takes 3 look-ups to return a chunk data pack by chunk ID:
37+ //
38+ // i. a lookup for chunkID -> chunkDataPackID
39+ // ii. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
40+ // iii. a lookup for CollectionID -> Collection, then reconstruct the chunk data pack from the collection and the StoredChunkDataPack
1741type ChunkDataPacks struct {
1842 // the protocol DB is used for storing index mappings from chunk ID to chunk data pack ID
1943 protocolDB storage.DB
@@ -27,11 +51,6 @@ type ChunkDataPacks struct {
2751
2852 // cache chunkID -> chunkDataPackID
2953 chunkIDToChunkDataPackIDCache * Cache [flow.Identifier , flow.Identifier ]
30-
31- // it takes 3 look ups to return chunk data pack by chunk ID:
32- // 1. a cache lookup for chunkID -> chunkDataPackID
33- // 2. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
34- // 3. a lookup for CollectionID -> Collection, then restore the chunk data pack with the collection and the StoredChunkDataPack
3554}
3655
3756var _ storage.ChunkDataPacks = (* ChunkDataPacks )(nil )
0 commit comments