documentation extensions and polishing

AlexHentschel · AlexHentschel · commit fe66dd98d690 · 2025-10-14T17:04:27.000-07:00
diff --git a/storage/chunk_data_packs_stored.go b/storage/chunk_data_packs_stored.go
@@ -38,7 +38,7 @@ type StoredChunkDataPack struct {
 	ChunkID           flow.Identifier
 	StartState        flow.StateCommitment
 	Proof             flow.StorageProof
-	CollectionID      flow.Identifier
+	CollectionID      flow.Identifier // flow.ZeroID for system chunks
 	ExecutionDataRoot flow.BlockExecutionDataRoot
 }
 
@@ -58,6 +58,7 @@ func NewStoredChunkDataPack(
 	}
 }
 
+// IsSystemChunk returns true if this chunk data pack is for a system chunk.
 func (s *StoredChunkDataPack) IsSystemChunk() bool {
 	return s.CollectionID == flow.ZeroID
 }
@@ -67,7 +68,6 @@ func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {
 	if c.Collection != nil {
 		collectionID = c.Collection.ID()
 	}
-
 	return NewStoredChunkDataPack(
 		c.ChunkID,
 		c.StartState,
@@ -77,7 +77,10 @@ func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {
 	)
 }
 
-func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack { // ToStoredChunkDataPack converts the given ChunkDataPacks to their reduced representation,
+// ToStoredChunkDataPacks converts the given Chunk Data Packs to their reduced representation.
+// This is useful for reducing storage consumption, by avoiding repeated storage of the full collections
+// (stored individually anyway).
+func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack {
 	scs := make([]*StoredChunkDataPack, 0, len(cs))
 	for _, c := range cs {
 		scs = append(scs, ToStoredChunkDataPack(c))
diff --git a/storage/operation/chunk_data_packs.go b/storage/operation/chunk_data_packs.go
@@ -10,8 +10,8 @@ import (
 	"github.com/onflow/flow-go/storage"
 )
 
-// IndexChunkDataPackByChunkID inserts a mapping from chunk ID to stored chunk data pack ID.
-// It requires the [storage.LockInsertOwnReceipt] lock to be held by the caller.
+// IndexChunkDataPackByChunkID inserts a mapping from chunk ID to stored chunk data pack ID. It requires
+// the [storage.LockInsertOwnReceipt] lock to be acquired by the caller and held until the write batch has been committed.
 // Returns [storage.ErrDataMismatch] if a different chunk data pack ID already exists for the given chunk ID.
 func IndexChunkDataPackByChunkID(lctx lockctx.Proof, rw storage.ReaderBatchWriter, chunkID flow.Identifier, chunkDataPackID flow.Identifier) error {
 	if !lctx.HoldsLock(storage.LockInsertOwnReceipt) {
@@ -35,7 +35,7 @@ func IndexChunkDataPackByChunkID(lctx lockctx.Proof, rw storage.ReaderBatchWrite
 }
 
 // RetrieveChunkDataPackID retrieves the stored chunk data pack ID for a given chunk ID.
-// Returns [storage.ErrNotFound] if no mapping exists for the given chunk ID.
+// Returns [storage.ErrNotFound] if no chunk data pack has been indexed as the result for the given chunk ID.
 func RetrieveChunkDataPackID(r storage.Reader, chunkID flow.Identifier, chunkDataPackID *flow.Identifier) error {
 	return RetrieveByKey(r, MakePrefix(codeIndexChunkDataPackByChunkID, chunkID), chunkDataPackID)
 }
@@ -47,14 +47,18 @@ func RemoveChunkDataPackID(w storage.Writer, chunkID flow.Identifier) error {
 }
 
 // InsertStoredChunkDataPack inserts a [storage.StoredChunkDataPack] into the database, keyed by its own ID.
-// The caller must ensure the chunkDataPackID is the same as c.ID().
+//
+// CAUTION: The caller must ensure `storeChunkDataPackID` is the same as `c.ID()`, i.e. a collision-resistant
+// hash of the chunk data pack! This method silently overrides existing data, which is safe only if for the
+// same key, we always write the same value.
+//
 // No error returns expected during normal operations.
 func InsertStoredChunkDataPack(rw storage.ReaderBatchWriter, storeChunkDataPackID flow.Identifier, c *storage.StoredChunkDataPack) error {
 	return UpsertByKey(rw.Writer(), MakePrefix(codeChunkDataPack, storeChunkDataPackID), c)
 }
 
 // RetrieveStoredChunkDataPack retrieves a chunk data pack by stored chunk data pack ID.
-// It returns [storage.ErrNotFound] if the chunk data pack is not found
+// It returns [storage.ErrNotFound] if no chunk data pack with the given ID is known.
 func RetrieveStoredChunkDataPack(r storage.Reader, storeChunkDataPackID flow.Identifier, c *storage.StoredChunkDataPack) error {
 	return RetrieveByKey(r, MakePrefix(codeChunkDataPack, storeChunkDataPackID), c)
 }
diff --git a/storage/store/chunk_data_packs.go b/storage/store/chunk_data_packs.go
@@ -14,6 +14,30 @@ import (
 	"github.com/onflow/flow-go/storage/operation"
 )
 
+// ChunkDataPacks manages storage and retrieval of ChunkDataPacks, primarily serving the use case of EXECUTION NODES persisting
+// and indexing chunk data packs for their OWN RESULTS. Essentially, the chunk describes a batch of work to be done, and the
+// chunk data pack describes the result of that work. The storage of chunk data packs is segregated across different
+// storage components for efficiency and modularity reasons:
+//  0. Usually (ignoring the system chunk for a moment), the batch of work is given by the collection referenced in the chunk
+//     data pack. For any chunk data pack being stored, we assume that the executed collection has *previously* been persisted
+//     in [storage.Collections]. It is useful to persist the collections individually, so we can individually retrieve them.
+//  1. The actual chunk data pack itself is stored in a dedicated storage component `cdpStorage`. Note that for this storage
+//     component, no atomicity is required, as we are storing chunk data packs by their collision-resistant hashes, so
+//     different chunk data packs will be stored under different keys.
+//     Theoretically, nodes could persist multiple different (disagreeing) chunk data packs for the same
+//     chunk in this step. However, for efficiency, Execution Nodes only store their own chunk data packs.
+//  2. The index mapping from ChunkID to chunkDataPackID is stored in the protocol database for fast retrieval.
+//     This index is intended to be populated by execution nodes when they commit to a specific result represented by the chunk
+//     data pack. Here, we require atomicity, as an execution node should not be changing / overwriting which chunk data pack
+//     it committed to (during normal operations).
+//
+// Since the executed collections are stored separately (step 0, above), we can just use the collection ID in context of the
+// chunk data pack storage (step 1, above). Therefore, we utilize the reduced representation [storage.StoredChunkDataPack]
+// internally. While this removes redundant data from storage, it takes 3 look-ups to return a chunk data pack by chunk ID:
+//
+//	i. a lookup for chunkID -> chunkDataPackID
+//	ii. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
+//	iii. a lookup for CollectionID -> Collection, then reconstruct the chunk data pack from the collection and the StoredChunkDataPack
 type ChunkDataPacks struct {
 	// the protocol DB is used for storing index mappings from chunk ID to chunk data pack ID
 	protocolDB storage.DB
@@ -27,11 +51,6 @@ type ChunkDataPacks struct {
 
 	// cache chunkID -> chunkDataPackID
 	chunkIDToChunkDataPackIDCache *Cache[flow.Identifier, flow.Identifier]
-
-	// it takes 3 look ups to return chunk data pack by chunk ID:
-	// 1. a cache lookup for chunkID -> chunkDataPackID
-	// 2. a lookup for chunkDataPackID -> StoredChunkDataPack (only has CollectionID, no collection data)
-	// 3. a lookup for CollectionID -> Collection, then restore the chunk data pack with the collection and the StoredChunkDataPack
 }
 
 var _ storage.ChunkDataPacks = (*ChunkDataPacks)(nil)

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ type StoredChunkDataPack struct {`
`38`	`38`	`ChunkID flow.Identifier`
`39`	`39`	`StartState flow.StateCommitment`
`40`	`40`	`Proof flow.StorageProof`
`41`		`- CollectionID flow.Identifier`
	`41`	`+ CollectionID flow.Identifier // flow.ZeroID for system chunks`
`42`	`42`	`ExecutionDataRoot flow.BlockExecutionDataRoot`
`43`	`43`	`}`
`44`	`44`
`@@ -58,6 +58,7 @@ func NewStoredChunkDataPack(`
`58`	`58`	`}`
`59`	`59`	`}`
`60`	`60`
	`61`	`+// IsSystemChunk returns true if this chunk data pack is for a system chunk.`
`61`	`62`	`func (s *StoredChunkDataPack) IsSystemChunk() bool {`
`62`	`63`	`return s.CollectionID == flow.ZeroID`
`63`	`64`	`}`
`@@ -67,7 +68,6 @@ func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {`
`67`	`68`	`if c.Collection != nil {`
`68`	`69`	`collectionID = c.Collection.ID()`
`69`	`70`	`}`
`70`		`-`
`71`	`71`	`return NewStoredChunkDataPack(`
`72`	`72`	`c.ChunkID,`
`73`	`73`	`c.StartState,`
`@@ -77,7 +77,10 @@ func ToStoredChunkDataPack(c *flow.ChunkDataPack) *StoredChunkDataPack {`
`77`	`77`	`)`
`78`	`78`	`}`
`79`	`79`
`80`		`-func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack { // ToStoredChunkDataPack converts the given ChunkDataPacks to their reduced representation,`
	`80`	`+// ToStoredChunkDataPacks converts the given Chunk Data Packs to their reduced representation.`
	`81`	`+// This is useful for reducing storage consumption, by avoiding repeated storage of the full collections`
	`82`	`+// (stored individually anyway).`
	`83`	`+func ToStoredChunkDataPacks(cs []*flow.ChunkDataPack) []*StoredChunkDataPack {`
`81`	`84`	`scs := make([]*StoredChunkDataPack, 0, len(cs))`
`82`	`85`	`for _, c := range cs {`
`83`	`86`	`scs = append(scs, ToStoredChunkDataPack(c))`