Skip to content

Commit c160ca1

Browse files
RuoxinXu authored and MongoDB Bot committed
SERVER-99348 Collect a deterministic sample for testing purpose of Sampling CE (#31831)
GitOrigin-RevId: 405e3fe
1 parent d9c2ded commit c160ca1

File tree

4 files changed

+86
-15
lines changed

4 files changed

+86
-15
lines changed

jstests/noPassthroughWithMongod/query/cbr/cbr_sampling.js

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,11 +38,15 @@ function assertCollscanUsesSampling(query) {
3838
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", plan);
3939
}
4040

41-
function assertAllPlansUseSampling(query) {
41+
function assertAllPlansUseSampling(query, ce) {
4242
const explain = coll.find(query).explain();
4343
[getWinningPlanFromExplain(explain), ...getRejectedPlans(explain)].forEach(plan => {
4444
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", plan);
45-
assert.gt(plan.cardinalityEstimate, 0, plan);
45+
if (ce === undefined) {
46+
assert.gt(plan.cardinalityEstimate, 0, plan);
47+
} else {
48+
assert.close(plan.cardinalityEstimate, ce, plan);
49+
}
4650
assert.gt(plan.inputStage.cardinalityEstimate, 0, plan);
4751
assert.gt(plan.inputStage.numKeysEstimate, 0, plan);
4852
});
@@ -73,7 +77,28 @@ try {
7377
assertAllPlansUseSampling({b: {$lt: 100}});
7478
assertAllPlansUseSampling({a: {$lt: 100}, b: {$lt: 500}});
7579
assertAllPlansUseSampling({a: {$lt: 100}, b: {$lt: 500}, c: {$exists: true}});
80+
81+
// Test the sequential scan sampling method that generates the sample by scanning the first N
82+
// documents of the collection. The sample generated by this method is repeatable as long as
83+
// the collection was populated in the same way.
84+
assert.commandWorked(
85+
db.adminCommand({setParameter: 1, internalQuerySamplingBySequentialScan: true}));
86+
// Run the same query multiple times to ensure the sample is repeatable.
87+
assertAllPlansUseSampling({a: {$lt: 100}}, 260.41666);
88+
assertAllPlansUseSampling({a: {$lt: 100}}, 260.41666);
89+
assertAllPlansUseSampling({a: {$lt: 100}}, 260.41666);
90+
91+
// Require a sample larger than the collection and test that a full scan of the collection was
92+
// done to collect the sample.
93+
assert.commandWorked(db.adminCommand({setParameter: 1, samplingMarginOfError: 1.0}));
94+
// Since the sample was actually generated from all the documents in the collection, the 'ce' of
95+
// this predicate should include all documents, which is 1000.
96+
assertAllPlansUseSampling({a: {$lt: 1000}}, 1000);
97+
assertAllPlansUseSampling({a: {$lt: 1000}}, 1000);
7698
} finally {
7799
// Ensure that query knob doesn't leak into other testcases in the suite.
78100
assert.commandWorked(db.adminCommand({setParameter: 1, planRankerMode: "multiPlanning"}));
101+
assert.commandWorked(
102+
db.adminCommand({setParameter: 1, internalQuerySamplingBySequentialScan: false}));
103+
assert.commandWorked(db.adminCommand({setParameter: 1, samplingMarginOfError: 5.0}));
79104
}

src/mongo/db/query/ce/sampling_estimator_impl.cpp

Lines changed: 42 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -160,19 +160,9 @@ std::vector<BSONObj> SamplingEstimatorImpl::getIndexKeys(const IndexBounds& boun
160160
nullptr /*collator*/,
161161
boost::none);
162162

163-
// This function converts an index key to a BSONObj in order to compare with the IndexBounds.
164-
auto keyStringToBson = [](const key_string::Value& keyString) {
165-
BSONObjBuilder bob;
166-
auto keyStringObj = key_string::toBson(keyString, Ordering::make(BSONObj()));
167-
for (auto&& keyStringElem : keyStringObj) {
168-
bob.append(keyStringElem);
169-
}
170-
return bob.obj();
171-
};
172-
173163
std::vector<BSONObj> indexKeys;
174164
for (auto&& keyString : keyStrings) {
175-
indexKeys.push_back(keyStringToBson(keyString));
165+
indexKeys.push_back(key_string::toBson(keyString, Ordering::make(BSONObj())));
176166
}
177167
return indexKeys;
178168
}
@@ -303,10 +293,12 @@ SamplingEstimatorImpl::generateChunkSamplingPlan(PlanYieldPolicy* sbeYieldPolicy
303293
0 /* nodeId */);
304294

305295

296+
sbe::value::SlotVector outerProjectsSlots;
297+
sbe::value::SlotVector outerCorrelatedSlots{*outerRid};
306298
auto loopJoinStage = sbe::makeS<sbe::LoopJoinStage>(std::move(outerStage),
307299
std::move(innerStage),
308-
sbe::value::SlotVector{},
309-
sbe::value::SlotVector{*outerRid},
300+
outerProjectsSlots,
301+
outerCorrelatedSlots,
310302
nullptr /* predicate */,
311303
0 /* _nodeId */);
312304

@@ -414,6 +406,38 @@ void SamplingEstimatorImpl::generateChunkSample() {
414406
return;
415407
}
416408

409+
void SamplingEstimatorImpl::generateSampleBySeqScanningForTesting() {
410+
// Create a CanonicalQuery for the sampling plan.
411+
auto cq = makeCanonicalQuery(_collections.getMainCollection()->ns(), _opCtx, _sampleSize);
412+
auto sbeYieldPolicy = PlanYieldPolicySBE::make(
413+
_opCtx, PlanYieldPolicy::YieldPolicy::YIELD_AUTO, _collections, cq->nss());
414+
415+
auto staticData = std::make_unique<stage_builder::PlanStageStaticData>();
416+
sbe::value::SlotIdGenerator ids;
417+
staticData->resultSlot = ids.generate();
418+
const CollectionPtr& collection = _collections.getMainCollection();
419+
// Scan the first '_sampleSize' documents sequentially from the start of the target collection
420+
// in order to generate a repeatable sample.
421+
auto stage = makeScanStage(
422+
collection, staticData->resultSlot, boost::none, boost::none, false, sbeYieldPolicy.get());
423+
stage = sbe::makeS<sbe::LimitSkipStage>(
424+
std::move(stage),
425+
sbe::makeE<sbe::EConstant>(sbe::value::TypeTags::NumberInt64,
426+
sbe::value::bitcastFrom<int64_t>(_sampleSize)),
427+
nullptr /* skip */,
428+
0 /* nodeId */);
429+
430+
stage_builder::PlanStageData data{
431+
stage_builder::Environment{std::make_unique<sbe::RuntimeEnvironment>()},
432+
std::move(staticData)};
433+
auto plan =
434+
std::make_pair<std::unique_ptr<sbe::PlanStage>, mongo::stage_builder::PlanStageData>(
435+
std::move(stage), std::move(data));
436+
executeSamplingQueryAndSample(plan, std::move(cq), std::move(sbeYieldPolicy));
437+
438+
return;
439+
}
440+
417441
CardinalityEstimate SamplingEstimatorImpl::estimateCardinality(const MatchExpression* expr) const {
418442
size_t cnt = 0;
419443
for (const auto& doc : _sample) {
@@ -527,6 +551,11 @@ SamplingEstimatorImpl::SamplingEstimatorImpl(OperationContext* opCtx,
527551
_sampleSize(sampleSize),
528552
_numChunks(numChunks),
529553
_collectionCard(collectionCard) {
554+
if (internalQuerySamplingBySequentialScan.load()) {
555+
// This is only used for testing purposes when a repeatable sample is needed.
556+
generateSampleBySeqScanningForTesting();
557+
return;
558+
}
530559

531560
if (sampleSize >= collectionCard.cardinality().v()) {
532561
// If the required sample is larger than the collection, the sample is generated from all

src/mongo/db/query/ce/sampling_estimator_impl.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -225,6 +225,13 @@ class SamplingEstimatorImpl : public SamplingEstimator {
225225
std::unique_ptr<CanonicalQuery> cq,
226226
std::unique_ptr<PlanYieldPolicySBE> sbeYieldPolicy);
227227

228+
/**
229+
* Generates a sample by sequentially scanning documents from the start of the target
230+
* collection. The sample is generated from the first '_sampleSize' documents of the collection.
231+
* This sampling method is only used for testing purposes where a repeatable sample is needed.
232+
*/
233+
void generateSampleBySeqScanningForTesting();
234+
228235
/*
229236
* The SamplingEstimator calculates the size of a sample based on the confidence level and
230237
* margin of error required.

src/mongo/db/query/query_knobs.idl

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1511,6 +1511,16 @@ server_parameters:
15111511
gt: 0
15121512
redact: false
15131513

1514+
internalQuerySamplingBySequentialScan:
1515+
description: "Indicate sampling CE to generate a repeatable sample by sequentially scanning
1516+
documents from the start of the target collection. This sampling method is useful for testing
1517+
purposes and should not be used in production."
1518+
set_at: [ startup, runtime ]
1519+
cpp_varname: "internalQuerySamplingBySequentialScan"
1520+
cpp_vartype: AtomicWord<bool>
1521+
default: false
1522+
redact: false
1523+
15141524
# Note for adding additional query knobs:
15151525
#
15161526
# When adding a new query knob, you should consider whether or not you need to add an 'on_update'

0 commit comments

Comments
 (0)