@@ -2193,17 +2193,19 @@ message ChunkingFunction {
21932193 // with the gear tables available at
21942194 // https://github.com/nlfiedler/fastcdc-rs/blob/3.2.1/src/v2020/mod.rs
21952195 //
2196- // Server which supports this chunking function MUST advertise the following
2196+ // A server that supports this chunking function SHOULD advertise the following
21972197 // configuration parameters through the CacheCapabilities message:
2198- // - normalization_level
2199- // - min_chunk_size_bytes
22002198 // - avg_chunk_size_bytes
2201- // - max_chunk_size_bytes
2199+ // - normalization_level
22022200 // - seed
22032201 //
2204- // Client MUST use these advertised parameters to setup the FastCDC chunker.
2205- // The remaining parameters, such as mask_s, mask_l can be derived from the
2206- // average chunk size parameter.
2202+ // If these parameters are not set, the client SHOULD use the following
2203+ // defaults: avg_chunk_size_bytes = 524288 (512 KiB), normalization_level = 2,
2204+ // seed = 0.
2205+ //
2206+ // min_chunk_size and max_chunk_size are derived from avg_chunk_size_bytes
2207+ // (avg / 4 and avg * 4 respectively). The remaining parameters can be derived
2208+ // from the average chunk size and normalization level.
22072209 FASTCDC_2020 = 1 ;
22082210 }
22092211}
@@ -2335,40 +2337,88 @@ message CacheCapabilities {
23352337}
23362338
23372339// The chunking configuration of the server.
2340+ //
2341+ // This configuration describes how the server will chunk blobs. When calling
2342+ // SpliceBlob, the client SHOULD use this configuration if able, but the
2343+ // server is not expected to verify that the client's chunking matches it.
2344+ // The server only needs to verify that the chunks concatenate together to form
2345+ // the complete blob.
2346+ //
2347+ // This is primarily the server telling the client how it intends to chunk
2348+ // blobs. The client SHOULD try to match this configuration for optimal
2349+ // deduplication, but is not required to do so.
2350+ //
2351+ // Content-defined chunking is most beneficial for larger blobs where
2352+ // deduplication opportunities are greater, so smaller blobs SHOULD be
2353+ // uploaded without chunking.
23382354message ChunkingConfiguration {
2355+ // Parameters for the FastCDC content-defined chunking algorithm.
2356+ //
2357+ // Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.:
2358+ // https://ieeexplore.ieee.org/document/9055082
2359+ //
2360+ // The fastcdc-rs library (https://docs.rs/fastcdc/latest/fastcdc/v2020/)
2361+ // is a reference implementation that follows the paper specification.
2362+ //
2363+ // Key algorithm components from the paper:
2364+ //
2365+ // GEAR table: 256 64-bit integers for the rolling hash, computed as:
2366+ // GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
2367+ //
2368+ // MASKS table: Bit patterns for chunk boundary detection, derived from
2369+ // the C reference implementation. The mask selection based on average
2370+ // chunk size and normalization level should match the paper.
2371+ //
2372+ // The minimum and maximum chunk sizes are derived from the average:
2373+ // - min_chunk_size = avg_chunk_size_bytes / 4
2374+ // - max_chunk_size = avg_chunk_size_bytes * 4
2375+ //
23392376 // If any of the advertised parameters are not within the expected range,
23402377 // the client SHOULD ignore FastCDC chunking function support.
23412378 message FastCDCParams {
2379+ // The average (expected) chunk size for the FastCDC chunking algorithm.
2380+ // The value MUST be between 1 KiB and 1 MiB.
2381+ // If unset, clients SHOULD use 524288 (512 KiB) as the default.
2382+ //
2383+ // The minimum chunk size will be avg_chunk_size_bytes / 4.
2384+ // The maximum chunk size will be avg_chunk_size_bytes * 4.
2385+ uint64 avg_chunk_size_bytes = 1 ;
2386+
23422387 // The normalization level for the FastCDC chunking algorithm.
23432388 // The value MUST be between 0 and 3.
2344- uint32 normalization_level = 1 ;
2345-
2346- // The minimum chunk size for the FastCDC chunking algorithm.
2347- // The value MUST be between 256 bytes and 64 KiB.
2348- uint64 min_chunk_size_bytes = 2 ;
2349-
2350- // The average chunk size for the FastCDC chunking algorithm.
2351- // The value MUST be between 1 KiB and 256 KiB.
2352- uint64 avg_chunk_size_bytes = 3 ;
2353-
2354- // The maximum chunk size for the FastCDC chunking algorithm.
2355- // The value MUST be between 4 KiB and 4 MiB.
2356- uint64 max_chunk_size_bytes = 4 ;
2389+ //
2390+ // Higher normalization levels produce chunks closer to the average size
2391+ // but may reduce deduplication effectiveness:
2392+ // - Level 0: No normalization, widest size distribution
2393+ // - Level 1: Fewer chunks outside desired range
2394+ // - Level 2: Most chunks match desired size (recommended)
2395+ // - Level 3: Nearly all chunks are the desired size
2396+ //
2397+ // If unset, clients SHOULD use 2 as the default.
2398+ uint32 normalization_level = 2 ;
23572399
2358- // The seed for the FastCDC chunking algorithm.
2359- uint32 seed = 5 ;
2400+ // The seed for the FastCDC mask generation.
2401+ // If unset, clients SHOULD use 0 as the default.
2402+ //
2403+ // Using a consistent seed ensures deterministic chunking across
2404+ // different clients, which improves deduplication.
2405+ uint32 seed = 3 ;
23602406 }
23612407
23622408 // A list of chunking algorithms that the server supports for splitting and
23632409 // splicing blobs.
23642410 repeated ChunkingFunction.Value supported_chunking_algorithms = 1 ;
23652411
23662412 // The minimum blob size that should be considered for chunking.
2367- // Blobs smaller than this threshold SHOULD be sent as single blobs.
2368- // If unset, clients SHOULD use max_cas_blob_size_bytes as the
2369- // minimum blob size for chunking.
2370- // If both this field and max_cas_blob_size_bytes are unset, clients
2371- // MAY chunk blobs of any size.
2413+ // Blobs with size less than or equal to this threshold SHOULD be uploaded
2414+ // as single blobs without chunking. Blobs with size greater than this
2415+ // threshold SHOULD be chunked.
2416+ //
2417+ // This threshold SHOULD be greater than or equal to the maximum chunk size
2418+ // produced by the chunking function, so that blobs which would yield only a
2419+ // single chunk are not chunked needlessly.
2420+ //
2421+ // If unset, clients SHOULD use 2097152 (2 MiB) as the default.
23722422 uint64 min_blob_size_for_chunking_bytes = 2 ;
23732423
23742424 // The parameters for the FastCDC chunking algorithm.
0 commit comments