@@ -14,6 +14,30 @@ import (
1414 "github.com/onflow/flow-go/storage/operation"
1515)
1616
17+ // ChunkDataPacks manages storage and retrieval of ChunkDataPacks, primarily serving the use case of EXECUTION NODES persisting
18+ // and indexing chunk data packs for their OWN RESULTS. Essentially, the chunk describes a batch of work to be done, and the
19+ // chunk data pack describes the result of that work. The storage of chunk data packs is segregated across different
20+ // storage components for efficiency and modularity reasons:
21+ // 0. Usually (ignoring the system chunk for a moment), the batch of work is given by the collection referenced in the chunk
22+ // data pack. For any chunk data pack being stored, we assume that the executed collection has *previously* been persisted
23+ // in [storage.Collections]. It is useful to persist the collections individually, so we can individually retrieve them.
24+ // 1. The actual chunk data pack itself is stored in a dedicated storage component `cdpStorage`. Note that for this storage
25+ // component, no atomicity is required, as we are storing chunk data packs by their collision-resistant hashes, so
26+ // different chunk data packs will be stored under different keys.
27+ // Theoretically, nodes could persist multiple different (disagreeing) chunk data packs for the same
28+ // chunk in this step. However, for efficiency, Execution Nodes only store their own chunk data packs.
29+ // 2. The index mapping from ChunkID to chunkDataPackID is stored in the protocol database for fast retrieval.
30+ // This index is intended to be populated by execution nodes when they commit to a specific result represented by the chunk
31+ // data pack. Here, we require atomicity, as an execution node should not be changing / overwriting which chunk data pack
32+ // it committed to (during normal operations).
33+ //
34+ // Since the executed collections are stored separately (step 0, above), we can just use the collection ID in context of the
35+ // chunk data pack storage (step 1, above). Therefore, we utilize the reduced representation [storage.StoredChunkDataPack]
36+ // internally. While this removes redundant data from storage, it takes 3 look-ups to return a chunk data pack by chunk ID:
37+ //
38+ // i. a lookup for chunkID -> chunkDataPackID
39+ // ii. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
40+ // iii. a lookup for CollectionID -> Collection, then reconstruct the chunk data pack from the collection and the StoredChunkDataPack
1741type ChunkDataPacks struct {
1842 // the protocol DB is used for storing index mappings from chunk ID to chunk data pack ID
1943 protocolDB storage.DB
@@ -27,11 +51,6 @@ type ChunkDataPacks struct {
2751
2852 // cache chunkID -> chunkDataPackID
2953 chunkIDToChunkDataPackIDCache * Cache [flow.Identifier , flow.Identifier ]
30-
31- // it takes 3 look ups to return chunk data pack by chunk ID:
32- // 1. a cache lookup for chunkID -> chunkDataPackID
33- // 2. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
34- // 3. a lookup for CollectionID -> Collection, then restore the chunk data pack with the collection and the StoredChunkDataPack
3554}
3655
3756var _ storage.ChunkDataPacks = (* ChunkDataPacks )(nil )
@@ -76,11 +95,20 @@ func NewChunkDataPacks(collector module.CacheMetrics, db storage.DB, stored stor
7695// chunk data pack (or it will get slashed). This mapping from chunk ID to the ID of the chunk data pack that the Execution Node
7796// actually committed to is stored in the protocol database, in the following phase 2.
7897// - In the second phase, we populate the index mappings from ChunkID to one "distinguished" chunk data pack ID. This mapping
79- // is stored in the protocol database. Typically, en Execution Node uses this for indexing its own chunk data packs which it
98+ // is stored in the protocol database. Typically, an Execution Node uses this for indexing its own chunk data packs which it
8099// publicly committed to.
81- // - This function can approximately be described as an atomic operation. When it completes successfully, either both databases
82- // have been updated, or neither. However, this is an approximation only, because interim states exist, where the chunk data
83- // packs already have been stored in the chunk data pack database, but the index mappings do not yet exist.
100+ //
101+ // ATOMICITY:
102+ // [ChunkDataPacks.Store] executes phase 1 immediately, persisting the chunk data packs in their dedicated database. However,
103+ // the population of the index mappings in phase 2 is deferred to the caller, who must invoke the returned functor to perform phase 2. This
104+ // approach has the following benefits:
105+ // - Our API reflects that we are writing to two different databases here, with the chunk data pack database containing largely
106+ // specialized data subject to pruning. In contrast, the protocol database persists the commitments a node makes (subject to
107+ // slashing). The caller receives the ability to persist this commitment in the form of the returned functor. The functor
108+ // may be discarded by the caller without corrupting the state (if anything, we have just stored some additional chunk data
109+ // packs).
110+ // - The serialization and storage of the comparatively large chunk data packs is separated from the protocol database writes.
111+ // - The locking duration of the protocol database is reduced.
84112//
85113// The Store method returns:
86114// - func(lctx lockctx.Proof, rw storage.ReaderBatchWriter) error: Function for populating the index mapping from chunkID
@@ -133,7 +161,8 @@ func (ch *ChunkDataPacks) Store(cs []*flow.ChunkDataPack) (
133161 return nil
134162 }
135163
136- // Return the function that completes the storage process
164+ // Returned Functor: when invoked, will add the deferred storage operations to the provided ReaderBatchWriter
165+ // NOTE: until this functor is called, only the chunk data packs are stored by their respective IDs.
137166 return storeChunkDataPacksFunc , nil
138167}
139168
@@ -242,7 +271,7 @@ func (ch *ChunkDataPacks) ByChunkID(chunkID flow.Identifier) (*flow.ChunkDataPac
242271 return nil , fmt .Errorf ("cannot retrieve stored chunk data pack %x for chunk %x: %w" , chunkDataPackID , chunkID , err )
243272 }
244273
245- var collection * flow.Collection
274+ var collection * flow.Collection // nil by default, which only represents system chunk
246275 if schdp .CollectionID != flow .ZeroID {
247276 collection , err = ch .collections .ByID (schdp .CollectionID )
248277 if err != nil {
0 commit comments