Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions db/blob/db_blob_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2459,6 +2459,138 @@
}
}

TEST_F(DBBlobBasicTest, GetApproximateSizesIncludingBlobFiles) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = 0;

Reopen(options);

// Write some key-value pairs with blob values and flush to create blob files.
constexpr int kNumKeys = 1000;
constexpr int kValueSize = 1024;

Random rnd(301);
for (int i = 0; i < kNumKeys; ++i) {
ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
}
ASSERT_OK(Flush());

// Confirm that the flush left at least one "*.blob" file in the DB directory.
std::vector<std::string> children;
ASSERT_OK(env_->GetChildren(dbname_, &children));
constexpr char kBlobSuffix[] = ".blob";
constexpr size_t kBlobSuffixLen = sizeof(kBlobSuffix) - 1;
bool found_blob_file = false;
for (const std::string& name : children) {
  // Require a non-empty stem in front of the suffix.
  if (name.size() <= kBlobSuffixLen) {
    continue;
  }
  if (name.compare(name.size() - kBlobSuffixLen, kBlobSuffixLen,
                   kBlobSuffix) == 0) {
    found_blob_file = true;
    break;
  }
}
ASSERT_TRUE(found_blob_file);

// Query the full range - all keys are covered.
std::string start = Key(0);
std::string end = Key(kNumKeys);
Range r(start, end);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that both the stress test and the unit test only cover a single range; it may be worth tweaking either of them to cover at least two ranges.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — added a multi-range test case that queries two non-overlapping sub-ranges and verifies both return positive sizes, and that their sum is approximately equal to the full-range result (within 10%).


// Baseline: include_files on, include_blob_files off (its default value).
// Blob file sizes are expected to be excluded, yet the result must be
// non-zero because data was flushed above.
uint64_t size_without_blobs = 0;
{
  SizeApproximationOptions files_only_opts;
  files_only_opts.include_blob_files = false;
  files_only_opts.include_files = true;
  ASSERT_OK(db_->GetApproximateSizes(files_only_opts,
                                     db_->DefaultColumnFamily(), &r, 1,
                                     &size_without_blobs));
  ASSERT_GT(size_without_blobs, 0);
}

// Turning include_blob_files on must yield a strictly larger estimate than
// the files-only baseline for the same range.
{
  uint64_t size_with_blobs = 0;
  SizeApproximationOptions blob_opts;
  blob_opts.include_blob_files = true;
  blob_opts.include_files = true;
  ASSERT_OK(db_->GetApproximateSizes(blob_opts, db_->DefaultColumnFamily(),
                                     &r, 1, &size_with_blobs));
  ASSERT_GT(size_with_blobs, size_without_blobs);
}

// A range lying entirely past the written keys must report a size of 0, even
// with blob files included.
{
  SizeApproximationOptions blob_opts;
  blob_opts.include_blob_files = true;
  blob_opts.include_files = true;

  std::string beyond_start = Key(kNumKeys + 100);
  std::string beyond_end = Key(kNumKeys + 200);
  Range beyond_range(beyond_start, beyond_end);

  uint64_t beyond_size = 0;
  ASSERT_OK(db_->GetApproximateSizes(blob_opts, db_->DefaultColumnFamily(),
                                     &beyond_range, 1, &beyond_size));
  ASSERT_EQ(beyond_size, 0);
}

// A range covering roughly the first half of the keys must report a non-zero
// size that is strictly smaller than the full-range size.
{
  SizeApproximationOptions blob_opts;
  blob_opts.include_blob_files = true;
  blob_opts.include_files = true;

  // Reference value: the whole key range.
  uint64_t whole_size = 0;
  ASSERT_OK(db_->GetApproximateSizes(blob_opts, db_->DefaultColumnFamily(),
                                     &r, 1, &whole_size));

  // [Key(0), Key(kNumKeys / 2)).
  std::string midpoint = Key(kNumKeys / 2);
  Range first_half(start, midpoint);
  uint64_t first_half_size = 0;
  ASSERT_OK(db_->GetApproximateSizes(blob_opts, db_->DefaultColumnFamily(),
                                     &first_half, 1, &first_half_size));
  ASSERT_GT(first_half_size, 0);
  ASSERT_LT(first_half_size, whole_size);
}

// Same comparison through the SizeApproximationFlags overload: requesting
// files plus blob files must exceed the files-only baseline.
{
  const auto files_and_blobs =
      DB::SizeApproximationFlags::INCLUDE_FILES |
      DB::SizeApproximationFlags::INCLUDE_BLOB_FILES;
  uint64_t flag_api_size = 0;
  ASSERT_OK(db_->GetApproximateSizes(db_->DefaultColumnFamily(), &r, 1,
                                     &flag_api_size, files_and_blobs));
  ASSERT_GT(flag_api_size, size_without_blobs);
}

// Multi-range query: two adjacent, non-overlapping sub-ranges passed in a
// single call should each be non-empty and should sum to approximately the
// full-range result.
{
  SizeApproximationOptions size_approx_options;
  size_approx_options.include_files = true;
  size_approx_options.include_blob_files = true;

  // The split point is shared by both sub-ranges, so it is referenced directly
  // instead of being copied into per-range strings.
  // NOTE(review): Range presumably keeps non-owning views of these strings, so
  // they are kept alive as named locals for the lifetime of `ranges` - confirm.
  const std::string first_key = Key(0);
  const std::string mid = Key(kNumKeys / 2);
  const std::string last_key = Key(kNumKeys);
  Range ranges[2] = {Range(first_key, mid), Range(mid, last_key)};
  uint64_t sizes[2] = {0, 0};
  ASSERT_OK(db_->GetApproximateSizes(
      size_approx_options, db_->DefaultColumnFamily(), ranges, 2, sizes));

  // Each sub-range should return a positive size.
  ASSERT_GT(sizes[0], 0);
  ASSERT_GT(sizes[1], 0);

  // Sum of sub-ranges should be within 10% of the full-range result.
  uint64_t full_size = 0;
  ASSERT_OK(db_->GetApproximateSizes(
      size_approx_options, db_->DefaultColumnFamily(), &r, 1, &full_size));
  ASSERT_NEAR(static_cast<double>(sizes[0] + sizes[1]),
              static_cast<double>(full_size),
              static_cast<double>(full_size) * 0.1);
}
}

} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
Expand Down
39 changes: 36 additions & 3 deletions db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4952,7 +4952,8 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes) {
if (!options.include_memtables && !options.include_files) {
if (!options.include_memtables && !options.include_files &&
!options.include_blob_files) {
return Status::InvalidArgument("Invalid options");
}

Expand All @@ -4968,6 +4969,28 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,

// TODO: plumb Env::IOActivity, Env::IOPriority
const ReadOptions read_options;

// Pre-compute blob-to-SST ratio once (invariant across ranges for the same
// Version). This avoids iterating all levels and blob files per range.
double blob_to_sst_ratio = 0.0;
if (options.include_blob_files) {
const auto* vstorage = v->storage_info();
uint64_t total_sst_size = 0;
for (int level = 0; level < vstorage->num_non_empty_levels(); ++level) {
total_sst_size += vstorage->NumLevelBytes(level);
}
if (total_sst_size > 0) {
uint64_t total_blob_size = 0;
const auto& blob_files = vstorage->GetBlobFiles();
for (const auto& blob_file_meta : blob_files) {
assert(blob_file_meta);
total_blob_size += blob_file_meta->GetBlobFileSize();
}
blob_to_sst_ratio = static_cast<double>(total_blob_size) /
static_cast<double>(total_sst_size);
}
}

for (int i = 0; i < n; i++) {
// Add timestamp if needed
std::string start_with_ts, limit_with_ts;
Expand All @@ -4979,16 +5002,26 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek);
sizes[i] = 0;
if (options.include_files) {
sizes[i] += versions_->ApproximateSize(
// Compute SST size in range (needed for both include_files and
// include_blob_files, since blob size is prorated by SST ratio).
uint64_t sst_size_in_range = 0;
if (options.include_files || options.include_blob_files) {
sst_size_in_range = versions_->ApproximateSize(
options, read_options, v, k1.Encode(), k2.Encode(),
/*start_level=*/0,
/*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
}
if (options.include_files) {
sizes[i] += sst_size_in_range;
}
if (options.include_memtables) {
sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
}
if (options.include_blob_files) {
sizes[i] += static_cast<uint64_t>(static_cast<double>(sst_size_in_range) *
blob_to_sst_ratio);
}
}

ReturnAndCleanupSuperVersion(cfd, sv);
Expand Down
3 changes: 2 additions & 1 deletion db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2781,7 +2781,8 @@ Status StressTest::TestApproximateSize(
// Call GetApproximateSizes
SizeApproximationOptions sao;
sao.include_memtables = thread->rand.OneIn(2);
if (sao.include_memtables) {
sao.include_blob_files = thread->rand.OneIn(2);
if (sao.include_memtables || sao.include_blob_files) {
sao.include_files = thread->rand.OneIn(2);
}
if (thread->rand.OneIn(2)) {
Expand Down
14 changes: 8 additions & 6 deletions include/rocksdb/c.h
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ enum {
rocksdb_size_approximation_flags_none = 0,
rocksdb_size_approximation_flags_include_memtable = 1 << 0,
rocksdb_size_approximation_flags_include_files = 1 << 1,
rocksdb_size_approximation_flags_include_blob_files = 1 << 2,
};

extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf_with_flags(
Expand Down Expand Up @@ -2729,12 +2730,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary(
/* SliceTransform */

extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
rocksdb_slicetransform_create(
void* state, void (*destructor)(void*),
char* (*transform)(void*, const char* key, size_t length,
size_t* dst_length),
unsigned char (*in_domain)(void*, const char* key, size_t length),
const char* (*name)(void*));
rocksdb_slicetransform_create(void* state, void (*destructor)(void*),
char* (*transform)(void*, const char* key,
size_t length,
size_t* dst_length),
unsigned char (*in_domain)(void*, const char* key,
size_t length),
const char* (*name)(void*));
extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
rocksdb_slicetransform_create_fixed_prefix(size_t);
extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
Expand Down
4 changes: 3 additions & 1 deletion include/rocksdb/db.h
Original file line number Diff line number Diff line change
Expand Up @@ -1459,7 +1459,8 @@ class DB {
enum class SizeApproximationFlags : uint8_t {
NONE = 0,
INCLUDE_MEMTABLES = 1 << 0,
INCLUDE_FILES = 1 << 1
INCLUDE_FILES = 1 << 1,
INCLUDE_BLOB_FILES = 1 << 2
};

// For each i in [0,n-1], store in "sizes[i]", the approximate
Expand Down Expand Up @@ -2260,6 +2261,7 @@ inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
using enum SizeApproximationFlags; // Require C++20 support
options.include_memtables = ((include_flags & INCLUDE_MEMTABLES) != NONE);
options.include_files = ((include_flags & INCLUDE_FILES) != NONE);
options.include_blob_files = ((include_flags & INCLUDE_BLOB_FILES) != NONE);
return GetApproximateSizes(options, column_family, ranges, n, sizes);
}

Expand Down
19 changes: 17 additions & 2 deletions include/rocksdb/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -2854,11 +2854,26 @@ struct ImportColumnFamilyOptions {
// Options used with DB::GetApproximateSizes()
struct SizeApproximationOptions {
// Defines whether the returned size should include the recently written
// data in the memtables. If set to false, include_files must be true.
// data in the memtables. If set to false, at least one of include_files or
// include_blob_files must be true.
bool include_memtables = false;
// Defines whether the returned size should include data serialized to disk.
// If set to false, include_memtables must be true.
// If set to false, at least one of include_memtables or include_blob_files
// must be true.
bool include_files = true;
// Defines whether the returned size should include an approximation of
// blob file data in the key range. When enabled, the total blob file size
// is prorated by the ratio of SST data in the range to the total SST data:
//
// blob_size_in_range ≈ total_blob_size * (sst_in_range / total_sst)
//
// Limitations of this approximation:
// - Assumes blob data is distributed proportionally to SST data, which
// may not hold if blob value sizes vary significantly across keys.
// - If there are no SST files (all data in memtables), the blob size
// contribution will be 0 even if blob files exist on disk.
// Default: false (for backward compatibility).
bool include_blob_files = false;
// When approximating the files total size that is used to store a keys range
// using DB::GetApproximateSizes, allow approximation with an error margin of
// up to total_files_size * files_size_error_margin. This allows to take some
Expand Down
5 changes: 5 additions & 0 deletions java/rocksjni/rocksjni.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2542,6 +2542,11 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes(
(include_flags |
ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_FILES);
}
if (jinclude_flags & 4) {
include_flags =
(include_flags |
ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_BLOB_FILES);
}

db->GetApproximateSizes(cf_handle, ranges.get(),
static_cast<int>(range_count), sizes.get(),
Expand Down
7 changes: 4 additions & 3 deletions java/src/main/java/org/rocksdb/SizeApproximationFlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
* or file stats approximation or both.
*/
public enum SizeApproximationFlag {
NONE((byte)0x0),
INCLUDE_MEMTABLES((byte)0x1),
INCLUDE_FILES((byte)0x2);
NONE((byte) 0x0),
INCLUDE_MEMTABLES((byte) 0x1),
INCLUDE_FILES((byte) 0x2),
INCLUDE_BLOB_FILES((byte) 0x4);

private final byte value;

Expand Down
Loading