Skip to content

Commit 1a4b1e4

Browse files
xingbowang and meta-codesync[bot]
authored and committed
Add include_blob_files option to GetApproximateSizes (#14501)
**Summary:** Add a new boolean flag `include_blob_files` (default: `false`) to `SizeApproximationOptions` and a corresponding `INCLUDE_BLOB_FILES` enum value to `SizeApproximationFlags`. When set to `true`, the returned size includes an approximation of blob file data in the queried key range.

**Algorithm:** The blob file size contribution is prorated using the SST size ratio:

```
blob_size_in_range ≈ total_blob_size * (sst_size_in_range / total_sst_size)
```

The blob-to-SST ratio (`total_blob_size / total_sst_size`) is computed once before the per-range loop, so iterating levels and blob files only happens once per `GetApproximateSizes` call regardless of how many ranges are queried. The per-range SST size (`ApproximateSize`) is computed once and shared between `include_files` and `include_blob_files`.

**Limitations:**
- Assumes blob data is distributed proportionally to SST data across the key space. May be inaccurate if blob value sizes vary significantly across different key ranges (e.g., one range has large blobs while another has small ones).
- If there are no SST files (all data in memtables), the blob size contribution will be 0 even if blob files exist on disk.
**Changes:**
- `include/rocksdb/options.h`: New `include_blob_files` field in `SizeApproximationOptions`; updated doc comments for `include_memtables`/`include_files`
- `include/rocksdb/db.h`: New `INCLUDE_BLOB_FILES` in `SizeApproximationFlags` enum, updated flags-to-options mapping
- `include/rocksdb/c.h`: New `rocksdb_size_approximation_flags_include_blob_files` C API enum value
- `java/`: Added `INCLUDE_BLOB_FILES` to `SizeApproximationFlag.java` and JNI flag mapping in `rocksjni.cc`
- `db/db_impl/db_impl.cc`: Blob-to-SST ratio computed once before loop, SST range size computed once per range and shared
- `db_stress_tool/db_stress_test_base.cc`: Randomized `include_blob_files` in stress test

Pull Request resolved: #14501

Test Plan:
- New `DBBlobBasicTest.GetApproximateSizesIncludingBlobFiles` — verifies:
  - Size with blobs > without (full range)
  - Non-overlapping range returns 0
  - Partial range returns proportionally less than full range
  - `SizeApproximationFlags` API works
  - Multi-range query: two sub-ranges sum approximately to the full-range result
- Stress test now exercises the new option randomly

Reviewed By: hx235

Differential Revision: D97984211

Pulled By: xingbowang

fbshipit-source-id: e9127eac3308687fd4f0b17a771fd61fba6a8380
1 parent 47de3a3 commit 1a4b1e4

File tree

8 files changed

+207
-16
lines changed

8 files changed

+207
-16
lines changed

db/blob/db_blob_basic_test.cc

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2459,6 +2459,138 @@ TEST_F(DBBlobWithTimestampTest, IterateBlobs) {
24592459
}
24602460
}
24612461

2462+
TEST_F(DBBlobBasicTest, GetApproximateSizesIncludingBlobFiles) {
2463+
Options options = GetDefaultOptions();
2464+
options.enable_blob_files = true;
2465+
options.min_blob_size = 0;
2466+
2467+
Reopen(options);
2468+
2469+
// Write some key-value pairs with blob values and flush to create blob files.
2470+
constexpr int kNumKeys = 1000;
2471+
constexpr int kValueSize = 1024;
2472+
2473+
Random rnd(301);
2474+
for (int i = 0; i < kNumKeys; ++i) {
2475+
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
2476+
}
2477+
ASSERT_OK(Flush());
2478+
2479+
// Verify blob files exist.
2480+
std::vector<std::string> files;
2481+
ASSERT_OK(env_->GetChildren(dbname_, &files));
2482+
bool has_blob_files = false;
2483+
for (const auto& f : files) {
2484+
if (f.size() > 5 && f.substr(f.size() - 5) == ".blob") {
2485+
has_blob_files = true;
2486+
break;
2487+
}
2488+
}
2489+
ASSERT_TRUE(has_blob_files);
2490+
2491+
// Query the full range - all keys are covered.
2492+
std::string start = Key(0);
2493+
std::string end = Key(kNumKeys);
2494+
Range r(start, end);
2495+
2496+
// Without include_blob_files (default behavior): should not include blob
2497+
// file sizes.
2498+
uint64_t size_without_blobs = 0;
2499+
{
2500+
SizeApproximationOptions size_approx_options;
2501+
size_approx_options.include_files = true;
2502+
size_approx_options.include_blob_files = false;
2503+
ASSERT_OK(db_->GetApproximateSizes(size_approx_options,
2504+
db_->DefaultColumnFamily(), &r, 1,
2505+
&size_without_blobs));
2506+
ASSERT_GT(size_without_blobs, 0);
2507+
}
2508+
2509+
// With include_blob_files: should be strictly larger.
2510+
{
2511+
SizeApproximationOptions size_approx_options;
2512+
size_approx_options.include_files = true;
2513+
size_approx_options.include_blob_files = true;
2514+
uint64_t size_with_blobs = 0;
2515+
ASSERT_OK(db_->GetApproximateSizes(size_approx_options,
2516+
db_->DefaultColumnFamily(), &r, 1,
2517+
&size_with_blobs));
2518+
ASSERT_GT(size_with_blobs, size_without_blobs);
2519+
}
2520+
2521+
// Range that doesn't overlap any data should return 0.
2522+
{
2523+
std::string no_start = Key(kNumKeys + 100);
2524+
std::string no_end = Key(kNumKeys + 200);
2525+
Range no_r(no_start, no_end);
2526+
SizeApproximationOptions size_approx_options;
2527+
size_approx_options.include_files = true;
2528+
size_approx_options.include_blob_files = true;
2529+
uint64_t no_size = 0;
2530+
ASSERT_OK(db_->GetApproximateSizes(
2531+
size_approx_options, db_->DefaultColumnFamily(), &no_r, 1, &no_size));
2532+
ASSERT_EQ(no_size, 0);
2533+
}
2534+
2535+
// Partial range should return proportionally less blob size than full range.
2536+
{
2537+
SizeApproximationOptions size_approx_options;
2538+
size_approx_options.include_files = true;
2539+
size_approx_options.include_blob_files = true;
2540+
2541+
uint64_t full_size = 0;
2542+
ASSERT_OK(db_->GetApproximateSizes(
2543+
size_approx_options, db_->DefaultColumnFamily(), &r, 1, &full_size));
2544+
2545+
// Query roughly the first half of keys.
2546+
std::string half_end = Key(kNumKeys / 2);
2547+
Range half_r(start, half_end);
2548+
uint64_t half_size = 0;
2549+
ASSERT_OK(db_->GetApproximateSizes(size_approx_options,
2550+
db_->DefaultColumnFamily(), &half_r, 1,
2551+
&half_size));
2552+
ASSERT_GT(half_size, 0);
2553+
ASSERT_LT(half_size, full_size);
2554+
}
2555+
2556+
// Via SizeApproximationFlags API.
2557+
{
2558+
uint64_t size_flags = 0;
2559+
ASSERT_OK(db_->GetApproximateSizes(
2560+
db_->DefaultColumnFamily(), &r, 1, &size_flags,
2561+
DB::SizeApproximationFlags::INCLUDE_FILES |
2562+
DB::SizeApproximationFlags::INCLUDE_BLOB_FILES));
2563+
ASSERT_GT(size_flags, size_without_blobs);
2564+
}
2565+
2566+
// Multi-range query: two non-overlapping sub-ranges should sum to
2567+
// approximately the full-range result.
2568+
{
2569+
SizeApproximationOptions size_approx_options;
2570+
size_approx_options.include_files = true;
2571+
size_approx_options.include_blob_files = true;
2572+
2573+
std::string mid = Key(kNumKeys / 2);
2574+
std::string r1_start = Key(0);
2575+
std::string r1_end = mid;
2576+
std::string r2_start = mid;
2577+
std::string r2_end = Key(kNumKeys);
2578+
Range ranges[2] = {Range(r1_start, r1_end), Range(r2_start, r2_end)};
2579+
uint64_t sizes[2] = {0, 0};
2580+
ASSERT_OK(db_->GetApproximateSizes(
2581+
size_approx_options, db_->DefaultColumnFamily(), ranges, 2, sizes));
2582+
// Each sub-range should return a positive size.
2583+
ASSERT_GT(sizes[0], 0);
2584+
ASSERT_GT(sizes[1], 0);
2585+
// Sum of sub-ranges should be close to the full-range result.
2586+
uint64_t full_size = 0;
2587+
ASSERT_OK(db_->GetApproximateSizes(
2588+
size_approx_options, db_->DefaultColumnFamily(), &r, 1, &full_size));
2589+
ASSERT_NEAR(static_cast<double>(sizes[0] + sizes[1]),
2590+
static_cast<double>(full_size), full_size * 0.1);
2591+
}
2592+
}
2593+
24622594
} // namespace ROCKSDB_NAMESPACE
24632595

24642596
int main(int argc, char** argv) {

db/db_impl/db_impl.cc

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4952,7 +4952,8 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
49524952
Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
49534953
ColumnFamilyHandle* column_family,
49544954
const Range* range, int n, uint64_t* sizes) {
4955-
if (!options.include_memtables && !options.include_files) {
4955+
if (!options.include_memtables && !options.include_files &&
4956+
!options.include_blob_files) {
49564957
return Status::InvalidArgument("Invalid options");
49574958
}
49584959

@@ -4968,6 +4969,28 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
49684969

49694970
// TODO: plumb Env::IOActivity, Env::IOPriority
49704971
const ReadOptions read_options;
4972+
4973+
// Pre-compute blob-to-SST ratio once (invariant across ranges for the same
4974+
// Version). This avoids iterating all levels and blob files per range.
4975+
double blob_to_sst_ratio = 0.0;
4976+
if (options.include_blob_files) {
4977+
const auto* vstorage = v->storage_info();
4978+
uint64_t total_sst_size = 0;
4979+
for (int level = 0; level < vstorage->num_non_empty_levels(); ++level) {
4980+
total_sst_size += vstorage->NumLevelBytes(level);
4981+
}
4982+
if (total_sst_size > 0) {
4983+
uint64_t total_blob_size = 0;
4984+
const auto& blob_files = vstorage->GetBlobFiles();
4985+
for (const auto& blob_file_meta : blob_files) {
4986+
assert(blob_file_meta);
4987+
total_blob_size += blob_file_meta->GetBlobFileSize();
4988+
}
4989+
blob_to_sst_ratio = static_cast<double>(total_blob_size) /
4990+
static_cast<double>(total_sst_size);
4991+
}
4992+
}
4993+
49714994
for (int i = 0; i < n; i++) {
49724995
// Add timestamp if needed
49734996
std::string start_with_ts, limit_with_ts;
@@ -4979,16 +5002,26 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
49795002
InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
49805003
InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek);
49815004
sizes[i] = 0;
4982-
if (options.include_files) {
4983-
sizes[i] += versions_->ApproximateSize(
5005+
// Compute SST size in range (needed for both include_files and
5006+
// include_blob_files, since blob size is prorated by SST ratio).
5007+
uint64_t sst_size_in_range = 0;
5008+
if (options.include_files || options.include_blob_files) {
5009+
sst_size_in_range = versions_->ApproximateSize(
49845010
options, read_options, v, k1.Encode(), k2.Encode(),
49855011
/*start_level=*/0,
49865012
/*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
49875013
}
5014+
if (options.include_files) {
5015+
sizes[i] += sst_size_in_range;
5016+
}
49885017
if (options.include_memtables) {
49895018
sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
49905019
sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
49915020
}
5021+
if (options.include_blob_files) {
5022+
sizes[i] += static_cast<uint64_t>(static_cast<double>(sst_size_in_range) *
5023+
blob_to_sst_ratio);
5024+
}
49925025
}
49935026

49945027
ReturnAndCleanupSuperVersion(cfd, sv);

db_stress_tool/db_stress_test_base.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2790,7 +2790,8 @@ Status StressTest::TestApproximateSize(
27902790
// Call GetApproximateSizes
27912791
SizeApproximationOptions sao;
27922792
sao.include_memtables = thread->rand.OneIn(2);
2793-
if (sao.include_memtables) {
2793+
sao.include_blob_files = thread->rand.OneIn(2);
2794+
if (sao.include_memtables || sao.include_blob_files) {
27942795
sao.include_files = thread->rand.OneIn(2);
27952796
}
27962797
if (thread->rand.OneIn(2)) {

include/rocksdb/c.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,7 @@ enum {
732732
rocksdb_size_approximation_flags_none = 0,
733733
rocksdb_size_approximation_flags_include_memtable = 1 << 0,
734734
rocksdb_size_approximation_flags_include_files = 1 << 1,
735+
rocksdb_size_approximation_flags_include_blob_files = 1 << 2,
735736
};
736737

737738
extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf_with_flags(
@@ -2729,12 +2730,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary(
27292730
/* SliceTransform */
27302731

27312732
extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
2732-
rocksdb_slicetransform_create(
2733-
void* state, void (*destructor)(void*),
2734-
char* (*transform)(void*, const char* key, size_t length,
2735-
size_t* dst_length),
2736-
unsigned char (*in_domain)(void*, const char* key, size_t length),
2737-
const char* (*name)(void*));
2733+
rocksdb_slicetransform_create(void* state, void (*destructor)(void*),
2734+
char* (*transform)(void*, const char* key,
2735+
size_t length,
2736+
size_t* dst_length),
2737+
unsigned char (*in_domain)(void*, const char* key,
2738+
size_t length),
2739+
const char* (*name)(void*));
27382740
extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
27392741
rocksdb_slicetransform_create_fixed_prefix(size_t);
27402742
extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*

include/rocksdb/db.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1459,7 +1459,8 @@ class DB {
14591459
enum class SizeApproximationFlags : uint8_t {
14601460
NONE = 0,
14611461
INCLUDE_MEMTABLES = 1 << 0,
1462-
INCLUDE_FILES = 1 << 1
1462+
INCLUDE_FILES = 1 << 1,
1463+
INCLUDE_BLOB_FILES = 1 << 2
14631464
};
14641465

14651466
// For each i in [0,n-1], store in "sizes[i]", the approximate
@@ -2260,6 +2261,7 @@ inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
22602261
using enum SizeApproximationFlags; // Require C++20 support
22612262
options.include_memtables = ((include_flags & INCLUDE_MEMTABLES) != NONE);
22622263
options.include_files = ((include_flags & INCLUDE_FILES) != NONE);
2264+
options.include_blob_files = ((include_flags & INCLUDE_BLOB_FILES) != NONE);
22632265
return GetApproximateSizes(options, column_family, ranges, n, sizes);
22642266
}
22652267

include/rocksdb/options.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2861,11 +2861,26 @@ struct ImportColumnFamilyOptions {
28612861
// Options used with DB::GetApproximateSizes()
28622862
struct SizeApproximationOptions {
28632863
// Defines whether the returned size should include the recently written
2864-
// data in the memtables. If set to false, include_files must be true.
2864+
// data in the memtables. If set to false, at least one of include_files or
2865+
// include_blob_files must be true.
28652866
bool include_memtables = false;
28662867
// Defines whether the returned size should include data serialized to disk.
2867-
// If set to false, include_memtables must be true.
2868+
// If set to false, at least one of include_memtables or include_blob_files
2869+
// must be true.
28682870
bool include_files = true;
2871+
// Defines whether the returned size should include an approximation of
2872+
// blob file data in the key range. When enabled, the total blob file size
2873+
// is prorated by the ratio of SST data in the range to the total SST data:
2874+
//
2875+
// blob_size_in_range ≈ total_blob_size * (sst_in_range / total_sst)
2876+
//
2877+
// Limitations of this approximation:
2878+
// - Assumes blob data is distributed proportionally to SST data, which
2879+
// may not hold if blob value sizes vary significantly across keys.
2880+
// - If there are no SST files (all data in memtables), the blob size
2881+
// contribution will be 0 even if blob files exist on disk.
2882+
// Default: false (for backward compatibility).
2883+
bool include_blob_files = false;
28692884
// When approximating the files total size that is used to store a keys range
28702885
// using DB::GetApproximateSizes, allow approximation with an error margin of
28712886
// up to total_files_size * files_size_error_margin. This allows to take some

java/rocksjni/rocksjni.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2542,6 +2542,11 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes(
25422542
(include_flags |
25432543
ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_FILES);
25442544
}
2545+
if (jinclude_flags & 4) {
2546+
include_flags =
2547+
(include_flags |
2548+
ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_BLOB_FILES);
2549+
}
25452550

25462551
db->GetApproximateSizes(cf_handle, ranges.get(),
25472552
static_cast<int>(range_count), sizes.get(),

java/src/main/java/org/rocksdb/SizeApproximationFlag.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
* or file stats approximation or both.
1111
*/
1212
public enum SizeApproximationFlag {
13-
NONE((byte)0x0),
14-
INCLUDE_MEMTABLES((byte)0x1),
15-
INCLUDE_FILES((byte)0x2);
13+
NONE((byte) 0x0),
14+
INCLUDE_MEMTABLES((byte) 0x1),
15+
INCLUDE_FILES((byte) 0x2),
16+
INCLUDE_BLOB_FILES((byte) 0x4);
1617

1718
private final byte value;
1819

0 commit comments

Comments
 (0)