Skip to content

Commit 10f3cbe

Browse files
committed
feat(chunking): add chunking algorithm
1 parent 7ec9f3e commit 10f3cbe

File tree

1 file changed

+102
-38
lines changed

1 file changed

+102
-38
lines changed

build/bazel/remote/execution/v2/remote_execution.proto

Lines changed: 102 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ service ContentAddressableStorage {
486486
//
487487
// When blob splitting and splicing is used at the same time, the clients and
488488
// the server SHOULD agree out-of-band upon a chunking algorithm used by both
489-
// parties to benefit from each others chunk data and avoid unnecessary data
489+
// parties to benefit from each other's chunk data and avoid unnecessary data
490490
// duplication.
491491
//
492492
// Errors:
@@ -1804,7 +1804,7 @@ message BatchUpdateBlobsRequest {
18041804
bytes data = 2;
18051805

18061806
// The format of `data`. Must be `IDENTITY`/unspecified, or one of the
1807-
// compressors advertised by the
1807+
// compressors advertised by the
18081808
// [CacheCapabilities.supported_batch_compressors][build.bazel.remote.execution.v2.CacheCapabilities.supported_batch_compressors]
18091809
// field.
18101810
Compressor.Value compressor = 3;
@@ -2185,25 +2185,29 @@ message ChunkingFunction {
21852185
// about the chunking algorithm can be made.
21862186
UNKNOWN = 0;
21872187

2188-
// This is a variant of the FastCDC chunking algorithm as described in the
2189-
// 2020 paper by Wen Xia, et al.
2190-
// See https://ieeexplore.ieee.org/document/9055082 for details.
2191-
// Reference implementation could be found in the Rust library
2192-
// https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2193-
// with the gear tables available at
2194-
// https://github.com/nlfiedler/fastcdc-rs/blob/3.2.1/src/v2020/mod.rs
2188+
// The FastCDC chunking algorithm as described in the 2020 paper by
2189+
// Wen Xia, et al. See https://ieeexplore.ieee.org/document/9055082
2190+
// for details.
2191+
//
2192+
// Supported implementations:
2193+
// - Rust: https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2194+
// - Go: https://github.com/buildbuddy-io/fastcdc2020
21952195
//
2196-
// Server which supports this chunking function MUST advertise the following
2196+
// Servers which support this chunking function SHOULD advertise the following
21972197
// configuration parameters through the CacheCapabilities message:
2198-
// - normalization_level
2199-
// - min_chunk_size_bytes
22002198
// - avg_chunk_size_bytes
2201-
// - max_chunk_size_bytes
2199+
// - normalization_level
22022200
// - seed
22032201
//
2204-
// Client MUST use these advertised parameters to setup the FastCDC chunker.
2205-
// The remaining parameters, such as mask_s, mask_l can be derived from the
2206-
// average chunk size parameter.
2202+
// If these parameters are not set, the client SHOULD use the following
2203+
// defaults:
2204+
// - avg_chunk_size_bytes = 524288 (512 KiB)
2205+
// - normalization_level = 2
2206+
// - seed = 0
2207+
//
2208+
// min_chunk_size and max_chunk_size are derived from avg_chunk_size (1/4x
2209+
// and 4x respectively). The remaining parameters can be derived from the
2210+
// average chunk size and normalization level.
22072211
FASTCDC_2020 = 1;
22082212
}
22092213
}
@@ -2335,41 +2339,101 @@ message CacheCapabilities {
23352339
}
23362340

23372341
// The chunking configuration of the server.
2342+
//
2343+
// This configuration describes how the server will chunk blobs. When calling
2344+
// SpliceBlob, the client SHOULD use these configurations if able, but the
2345+
// server is not expected to verify that it can reliably reproduce the chunking.
2346+
// The server only needs to verify that the chunks concatenate together to form
2347+
// the complete blob.
2348+
//
2349+
// This is primarily the server telling the client how it intends to chunk
2350+
// blobs. The client SHOULD try to match this configuration for optimal
2351+
// deduplication, but is not required to do so.
2352+
//
2353+
// Content-defined chunking is most beneficial for larger blobs where
2354+
// deduplication opportunities are greater, so smaller blobs SHOULD be
2355+
// uploaded without chunking.
23382356
message ChunkingConfiguration {
2357+
// Parameters for the FastCDC content-defined chunking algorithm.
2358+
//
2359+
// Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.:
2360+
// https://ieeexplore.ieee.org/document/9055082
2361+
//
2362+
// The fastcdc-rs library (https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/)
2363+
// is a reference implementation that follows the paper specification.
2364+
//
2365+
// Key algorithm components from the paper:
2366+
//
2367+
// GEAR table: 256 64-bit integers for the rolling hash, computed as:
2368+
// GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
2369+
//
2370+
// MASKS table: Bit patterns for chunk boundary detection from Table II
2371+
// of the paper. Mask selection uses two masks based on avg_chunk_size
2372+
// and normalization_level:
2373+
// bits = round(log2(avg_chunk_size_bytes))
2374+
// mask_small = MASKS[bits + normalization_level] // harder to match
2375+
// mask_large = MASKS[bits - normalization_level] // easier to match
2376+
//
2377+
// The algorithm uses mask_small until avg_chunk_size bytes are processed,
2378+
// then switches to mask_large. This "normalized chunking" approach biases
2379+
// chunk sizes toward the average.
2380+
//
2381+
// For the complete MASKS table and GEAR table values, see the reference
2382+
// implementations linked above.
2383+
//
2384+
// The minimum and maximum chunk sizes are derived from the average:
2385+
// - min_chunk_size = avg_chunk_size_bytes / 4
2386+
// - max_chunk_size = avg_chunk_size_bytes * 4
2387+
//
23392388
// If any of the advertised parameters are not within the expected range,
23402389
// the client SHOULD ignore FastCDC chunking function support.
23412390
message FastCDCParams {
2391+
// The average (expected) chunk size for the FastCDC chunking algorithm.
2392+
// The value MUST be between 256 bytes and 4 MiB.
2393+
// If unset, clients SHOULD use 524288 (512 KiB) as the default.
2394+
//
2395+
// The minimum chunk size will be avg_chunk_size_bytes / 4.
2396+
// The maximum chunk size will be avg_chunk_size_bytes * 4.
2397+
uint64 avg_chunk_size_bytes = 1;
2398+
23422399
// The normalization level for the FastCDC chunking algorithm.
23432400
// The value MUST be between 0 and 3, inclusive.
2344-
uint32 normalization_level = 1;
2345-
2346-
// The minimum chunk size for the FastCDC chunking algorithm.
2347-
// The value MUST be between 256 bytes and 64 KiB.
2348-
uint64 min_chunk_size_bytes = 2;
2349-
2350-
// The average chunk size for the FastCDC chunking algorithm.
2351-
// The value MUST be between 1 KiB and 256 KiB.
2352-
uint64 avg_chunk_size_bytes = 3;
2353-
2354-
// The maximum chunk size for the FastCDC chunking algorithm.
2355-
// The value MUST be between 4 KiB and 4 MiB.
2356-
uint64 max_chunk_size_bytes = 4;
2401+
//
2402+
// Higher normalization levels produce chunks closer to the average size
2403+
// but may reduce deduplication effectiveness:
2404+
// - Level 0: No normalization, widest size distribution
2405+
// - Level 1: Fewer chunks outside desired range
2406+
// - Level 2: Most chunks match desired size (recommended)
2407+
// - Level 3: Nearly all chunks are the desired size
2408+
//
2409+
// If unset, clients SHOULD use 2 as the default.
2410+
uint32 normalization_level = 2;
23572411

2358-
// The seed for the FastCDC chunking algorithm.
2359-
uint32 seed = 5;
2412+
// The seed to XOR with the GEAR table values.
2413+
// If unset, clients SHOULD use 0 as the default.
2414+
//
2415+
// Using a consistent seed ensures deterministic chunking across
2416+
// different clients, which improves deduplication. A non-zero seed
2417+
// can also help mitigate fingerprinting attacks that infer content from
2418+
// chunk sizes.
2419+
uint64 seed = 3;
23602420
}
23612421

23622422
// A list of chunking algorithms that the server supports for splitting and
23632423
// splicing blobs.
23642424
repeated ChunkingFunction.Value supported_chunking_algorithms = 1;
23652425

2366-
// The minimum blob size that should be considered for chunking.
2367-
// Blobs smaller than this threshold SHOULD be sent as single blobs.
2368-
// If unset, clients SHOULD use max_cas_blob_size_bytes as the
2369-
// minimum blob size for chunking.
2370-
// If both this field and max_cas_blob_size_bytes are unset, clients
2371-
// MAY chunk blobs of any size.
2372-
uint64 min_blob_size_for_chunking_bytes = 2;
2426+
// The size threshold for chunking blobs.
2427+
// Blobs with size less than or equal to this threshold SHOULD be uploaded
2428+
// as single blobs without chunking. Blobs with size greater than this
2429+
// threshold SHOULD be chunked.
2430+
//
2431+
// This threshold SHOULD be greater than or equal to the maximum chunk size
2432+
// produced by the chunking function to avoid re-chunking blobs that would
2433+
// result in a single chunk.
2434+
//
2435+
// If unset, clients SHOULD use 2097152 (2 MiB) as the default.
2436+
uint64 chunking_threshold_bytes = 2;
23732437

23742438
// The parameters for the FastCDC chunking algorithm.
23752439
FastCDCParams fastcdc_params = 3;

0 commit comments

Comments
 (0)