@@ -486,7 +486,7 @@ service ContentAddressableStorage {
486486 //
487487 // When blob splitting and splicing is used at the same time, the clients and
488488 // the server SHOULD agree out-of-band upon a chunking algorithm used by both
489- // parties to benefit from each others chunk data and avoid unnecessary data
489+ // parties to benefit from each other's chunk data and avoid unnecessary data
490490 // duplication.
491491 //
492492 // Errors:
@@ -1804,7 +1804,7 @@ message BatchUpdateBlobsRequest {
18041804 bytes data = 2 ;
18051805
18061806 // The format of `data`. Must be `IDENTITY`/unspecified, or one of the
1807- // compressors advertised by the
1807+ // compressors advertised by the
18081808 // [CacheCapabilities.supported_batch_compressors][build.bazel.remote.execution.v2.CacheCapabilities.supported_batch_compressors]
18091809 // field.
18101810 Compressor.Value compressor = 3 ;
@@ -2185,25 +2185,29 @@ message ChunkingFunction {
21852185 // about the chunking algorithm can be made.
21862186 UNKNOWN = 0 ;
21872187
2188- // This is a variant of the FastCDC chunking algorithm as described in the
2189- // 2020 paper by Wen Xia, et al.
2190- // See https://ieeexplore.ieee.org/document/9055082 for details.
2191- // Reference implementation could be found in the Rust library
2192- // https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2193- // with the gear tables available at
2194- // https://github.com/nlfiedler/fastcdc-rs/blob/3.2.1/src/v2020/mod.rs
2188+ // The FastCDC chunking algorithm as described in the 2020 paper by
2189+ // Wen Xia, et al. See https://ieeexplore.ieee.org/document/9055082
2190+ // for details.
2191+ //
2192+ // Supported implementations:
2193+ // - Rust: https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2194+ // - Go: https://github.com/buildbuddy-io/fastcdc2020
21952195 //
2196- // Server which supports this chunking function MUST advertise the following
2196+ // Servers which support this chunking function SHOULD advertise the following
21972197 // configuration parameters through the CacheCapabilities message:
2198- // - normalization_level
2199- // - min_chunk_size_bytes
22002198 // - avg_chunk_size_bytes
2201- // - max_chunk_size_bytes
2199+ // - normalization_level
22022200 // - seed
22032201 //
2204- // Client MUST use these advertised parameters to setup the FastCDC chunker.
2205- // The remaining parameters, such as mask_s, mask_l can be derived from the
2206- // average chunk size parameter.
2202+ // If these parameters are not set, the client SHOULD use the following
2203+ // defaults:
2204+ // - avg_chunk_size_bytes = 524288 (512 KiB)
2205+ // - normalization_level = 2
2205+ // - seed = 0
2207+ //
2208+ // The minimum and maximum chunk sizes are derived from avg_chunk_size_bytes
2209+ // (avg / 4 and avg * 4 respectively). The remaining parameters can be derived
2210+ // from the average chunk size and normalization level.
22072211 FASTCDC_2020 = 1 ;
22082212 }
22092213}
@@ -2335,41 +2339,101 @@ message CacheCapabilities {
23352339}
23362340
23372341// The chunking configuration of the server.
2342+ //
2343+ // This configuration describes how the server will chunk blobs. When calling
2344+ // SpliceBlob, the client SHOULD use these configurations if able, but the
2345+ // server is not expected to verify that it can reliably reproduce the chunking.
2346+ // The server only needs to verify that the chunks concatenate together to form
2347+ // the complete blob.
2348+ //
2349+ // This is primarily the server telling the client how it intends to chunk
2350+ // blobs. The client SHOULD try to match this configuration for optimal
2351+ // deduplication, but is not required to do so.
2352+ //
2353+ // Content-defined chunking is most beneficial for larger blobs where
2354+ // deduplication opportunities are greater, so smaller blobs SHOULD be
2355+ // uploaded without chunking.
23382356message ChunkingConfiguration {
2357+ // Parameters for the FastCDC content-defined chunking algorithm.
2358+ //
2359+ // Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.:
2360+ // https://ieeexplore.ieee.org/document/9055082
2361+ //
2362+ // The fastcdc-rs library (https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/)
2363+ // is a reference implementation that follows the paper specification.
2364+ //
2365+ // Key algorithm components from the paper:
2366+ //
2367+ // GEAR table: 256 64-bit integers for the rolling hash, computed as:
2368+ // GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
2369+ //
2370+ // MASKS table: Bit patterns for chunk boundary detection from Table II
2371+ // of the paper. Mask selection uses two masks based on avg_chunk_size
2372+ // and normalization_level:
2373+ // bits = log2(avg_chunk_size)
2374+ // mask_small = MASKS[bits + normalization_level] // harder to match
2375+ // mask_large = MASKS[bits - normalization_level] // easier to match
2376+ //
2377+ // The algorithm uses mask_small until avg_chunk_size bytes are processed,
2378+ // then switches to mask_large. This "normalized chunking" approach biases
2379+ // chunk sizes toward the average.
2380+ //
2381+ // For the complete MASKS table and GEAR table values, see the fastcdc-rs
2382+ // reference implementation linked above.
2383+ //
2384+ // The minimum and maximum chunk sizes are derived from the average:
2385+ // - min_chunk_size = avg_chunk_size_bytes / 4
2386+ // - max_chunk_size = avg_chunk_size_bytes * 4
2387+ //
23392388 // If any of the advertised parameters are not within the expected range,
23402389 // the client SHOULD ignore FastCDC chunking function support.
23412390 message FastCDCParams {
2391+ // The average (expected) chunk size for the FastCDC chunking algorithm.
2392+ // The value MUST be between 256 bytes and 4 MiB.
2393+ // If unset, clients SHOULD use 524288 (512 KiB) as the default.
2394+ //
2395+ // The minimum chunk size will be avg_chunk_size_bytes / 4.
2396+ // The maximum chunk size will be avg_chunk_size_bytes * 4.
2397+ uint64 avg_chunk_size_bytes = 1 ;
2398+
23422399 // The normalization level for the FastCDC chunking algorithm.
23432400 // The value MUST be between 0 and 3.
2344- uint32 normalization_level = 1 ;
2345-
2346- // The minimum chunk size for the FastCDC chunking algorithm.
2347- // The value MUST be between 256 bytes and 64 KiB.
2348- uint64 min_chunk_size_bytes = 2 ;
2349-
2350- // The average chunk size for the FastCDC chunking algorithm.
2351- // The value MUST be between 1 KiB and 256 KiB.
2352- uint64 avg_chunk_size_bytes = 3 ;
2353-
2354- // The maximum chunk size for the FastCDC chunking algorithm.
2355- // The value MUST be between 4 KiB and 4 MiB.
2356- uint64 max_chunk_size_bytes = 4 ;
2401+ //
2402+ // Higher normalization levels produce chunks closer to the average size
2403+ // but may reduce deduplication effectiveness:
2404+ // - Level 0: No normalization, widest size distribution
2405+ // - Level 1: Fewer chunks outside desired range
2406+ // - Level 2: Most chunks match desired size (recommended)
2407+ // - Level 3: Nearly all chunks are the desired size
2408+ //
2409+ // If unset, clients SHOULD use 2 as the default.
2410+ uint32 normalization_level = 2 ;
23572411
2358- // The seed for the FastCDC chunking algorithm.
2359- uint32 seed = 5 ;
2412+ // The seed to XOR with the GEAR table values.
2413+ // If unset, clients SHOULD use 0 as the default.
2414+ //
2415+ // Using a consistent seed ensures deterministic chunking across
2416+ // different clients, which improves deduplication. A non-zero seed
2417+ // can also prevent fingerprinting attacks that infer content from
2418+ // chunk sizes.
2419+ uint64 seed = 3 ;
23602420 }
23612421
23622422 // A list of chunking algorithms that the server supports for splitting and
23632423 // splicing blobs.
23642424 repeated ChunkingFunction.Value supported_chunking_algorithms = 1 ;
23652425
2366- // The minimum blob size that should be considered for chunking.
2367- // Blobs smaller than this threshold SHOULD be sent as single blobs.
2368- // If unset, clients SHOULD use max_cas_blob_size_bytes as the
2369- // minimum blob size for chunking.
2370- // If both this field and max_cas_blob_size_bytes are unset, clients
2371- // MAY chunk blobs of any size.
2372- uint64 min_blob_size_for_chunking_bytes = 2 ;
2426+ // The size threshold for chunking blobs.
2427+ // Blobs with size less than or equal to this threshold SHOULD be uploaded
2428+ // as single blobs without chunking. Blobs with size greater than this
2429+ // threshold SHOULD be chunked.
2430+ //
2431+ // This threshold SHOULD be greater than or equal to the maximum chunk size
2432+ // produced by the chunking function to avoid re-chunking blobs that would
2433+ // result in a single chunk.
2434+ //
2435+ // If unset, clients SHOULD use 2097152 (2 MiB) as the default.
2436+ uint64 chunking_threshold_bytes = 2 ;
23732437
23742438 // The parameters for the FastCDC chunking algorithm.
23752439 FastCDCParams fastcdc_params = 3 ;
0 commit comments