Skip to content

Commit 8bb53a0

Browse files
author
Dianna Hohensee
committed
Revert "SERVER-36956 SnapshotTooOld errors will always increase the snapshot history window size"
This reverts commit 8899b34.
1 parent 8cb0251 commit 8bb53a0

23 files changed

+209
-204
lines changed

jstests/noPassthrough/snapshotWindow_serverParameters.js

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@
1717
false /*hasUpperBound*/,
1818
"unused" /*upperOutOfBounds*/);
1919

20+
// Valid parameter values are in the range [0, 100].
21+
testNumericServerParameter("cachePressureThreshold",
22+
true /*isStartupParameter*/,
23+
true /*isRuntimeParameter*/,
24+
50 /*defaultValue*/,
25+
70 /*nonDefaultValidValue*/,
26+
true /*hasLowerBound*/,
27+
-1 /*lowerOutOfBounds*/,
28+
true /*hasUpperBound*/,
29+
101 /*upperOutOfBounds*/);
30+
2031
// Valid parameter values are in the range (0, 1).
2132
testNumericServerParameter("snapshotWindowMultiplicativeDecrease",
2233
true /*isStartupParameter*/,
@@ -32,18 +43,18 @@
3243
testNumericServerParameter("snapshotWindowAdditiveIncreaseSeconds",
3344
true /*isStartupParameter*/,
3445
true /*isRuntimeParameter*/,
35-
1 /*defaultValue*/,
46+
2 /*defaultValue*/,
3647
10 /*nonDefaultValidValue*/,
3748
true /*hasLowerBound*/,
3849
0 /*lowerOutOfBounds*/,
3950
false /*hasUpperBound*/,
4051
"unused" /*upperOutOfBounds*/);
4152

4253
// Valid parameter values are in the range [1, infinity).
43-
testNumericServerParameter("decreaseHistoryIfNotNeededPeriodSeconds",
54+
testNumericServerParameter("checkCachePressurePeriodSeconds",
4455
true /*isStartupParameter*/,
4556
true /*isRuntimeParameter*/,
46-
15 /*defaultValue*/,
57+
5 /*defaultValue*/,
4758
8 /*nonDefaultValidValue*/,
4859
true /*hasLowerBound*/,
4960
0 /*lowerOutOfBounds*/,
@@ -60,4 +71,15 @@
6071
0 /*lowerOutOfBounds*/,
6172
false /*hasUpperBound*/,
6273
"unused" /*upperOutOfBounds*/);
74+
75+
// Valid parameter values are in the range [1, infinity).
76+
testNumericServerParameter("minMillisBetweenSnapshotWindowDec",
77+
true /*isStartupParameter*/,
78+
true /*isRuntimeParameter*/,
79+
500 /*defaultValue*/,
80+
2 * 1000 /*nonDefaultValidValue*/,
81+
true /*hasLowerBound*/,
82+
0 /*lowerOutOfBounds*/,
83+
false /*hasUpperBound*/,
84+
"unused" /*upperOutOfBounds*/);
6385
})();

src/mongo/db/db.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ ExitCode _initAndListen(int listenPort) {
611611
// release periodically in order to avoid storage cache pressure build up.
612612
if (storageEngine->supportsReadConcernSnapshot()) {
613613
PeriodicThreadToAbortExpiredTransactions::get(serviceContext)->start();
614-
PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::get(serviceContext)->start();
614+
PeriodicThreadToDecreaseSnapshotHistoryCachePressure::get(serviceContext)->start();
615615
}
616616

617617
// Set up the logical session cache
@@ -929,7 +929,7 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
929929
if (auto storageEngine = serviceContext->getStorageEngine()) {
930930
if (storageEngine->supportsReadConcernSnapshot()) {
931931
PeriodicThreadToAbortExpiredTransactions::get(serviceContext)->stop();
932-
PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::get(serviceContext)->stop();
932+
PeriodicThreadToDecreaseSnapshotHistoryCachePressure::get(serviceContext)->stop();
933933
}
934934

935935
ServiceContext::UniqueOperationContext uniqueOpCtx;

src/mongo/db/periodic_runner_job_decrease_snapshot_cache_pressure.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,26 +43,26 @@
4343

4444
namespace mongo {
4545

46-
auto PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::get(ServiceContext* serviceContext)
47-
-> PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded& {
46+
auto PeriodicThreadToDecreaseSnapshotHistoryCachePressure::get(ServiceContext* serviceContext)
47+
-> PeriodicThreadToDecreaseSnapshotHistoryCachePressure& {
4848
auto& jobContainer = _serviceDecoration(serviceContext);
4949
jobContainer._init(serviceContext);
5050
return jobContainer;
5151
}
5252

53-
auto PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::operator-> () const noexcept
53+
auto PeriodicThreadToDecreaseSnapshotHistoryCachePressure::operator-> () const noexcept
5454
-> PeriodicJobAnchor* {
5555
stdx::lock_guard lk(_mutex);
5656
return _anchor.get();
5757
}
5858

59-
auto PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::operator*() const noexcept
59+
auto PeriodicThreadToDecreaseSnapshotHistoryCachePressure::operator*() const noexcept
6060
-> PeriodicJobAnchor& {
6161
stdx::lock_guard lk(_mutex);
6262
return *_anchor;
6363
}
6464

65-
void PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::_init(ServiceContext* serviceContext) {
65+
void PeriodicThreadToDecreaseSnapshotHistoryCachePressure::_init(ServiceContext* serviceContext) {
6666
stdx::lock_guard lk(_mutex);
6767
if (_anchor) {
6868
return;
@@ -72,7 +72,7 @@ void PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::_init(ServiceContext* s
7272
invariant(periodicRunner);
7373

7474
PeriodicRunner::PeriodicJob job(
75-
"startPeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded",
75+
"startPeriodicThreadToDecreaseSnapshotHistoryCachePressure",
7676
[](Client* client) {
7777
try {
7878
// The opCtx destructor handles unsetting itself from the Client.
@@ -88,18 +88,17 @@ void PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded::_init(ServiceContext* s
8888
}
8989
}
9090
},
91-
Seconds(snapshotWindowParams.decreaseHistoryIfNotNeededPeriodSeconds.load()));
91+
Seconds(snapshotWindowParams.checkCachePressurePeriodSeconds.load()));
9292

9393
_anchor = std::make_shared<PeriodicJobAnchor>(periodicRunner->makeJob(std::move(job)));
9494

95-
SnapshotWindowParams::observeDecreaseHistoryIfNotNeededPeriodSeconds.addObserver([anchor =
96-
_anchor](
95+
SnapshotWindowParams::observeCheckCachePressurePeriodSeconds.addObserver([anchor = _anchor](
9796
const auto& secs) {
9897
try {
9998
anchor->setPeriod(Seconds(secs));
10099
} catch (const DBException& ex) {
101-
log() << "Failed to update the period of the thread which decreases data history "
102-
"target window size if there have been no new SnapshotTooOld errors."
100+
log() << "Failed to update the period of the thread which decreases data history cache "
101+
"target size if there is cache pressure."
103102
<< ex.toStatus();
104103
}
105104
});

src/mongo/db/periodic_runner_job_decrease_snapshot_cache_pressure.h

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,17 @@
3838
namespace mongo {
3939

4040
/**
41-
* Periodically checks whether there has been any storage engine cache pressure and SnapshotTooOld
42-
* errors to determine whether the maintained snapshot history window target setting should be
43-
* decreased. If there has been cache pressure and no new SnapshotTooOld errors in the last period,
44-
* then the target window size will be decreased. Maintaining too much snapshot and write history can
45-
* slow down the system. Runs once every decreaseHistoryIfNotNeededPeriodSeconds.
41+
* Periodically checks for storage engine cache pressure to determine whether the maintained
42+
* snapshot history window target setting should be decreased. Maintaining too much snapshot and
43+
* write history can slow down the system. Runs once every checkCachePressurePeriodSeconds.
4644
*
4745
* This function should only ever be called once, during mongod server startup (db.cpp).
4846
* The PeriodicRunner will handle shutting down the job on shutdown, no extra handling necessary.
4947
*/
50-
class PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded {
48+
class PeriodicThreadToDecreaseSnapshotHistoryCachePressure {
5149
public:
52-
static PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded& get(ServiceContext* serviceContext);
50+
static PeriodicThreadToDecreaseSnapshotHistoryCachePressure& get(
51+
ServiceContext* serviceContext);
5352

5453
PeriodicJobAnchor* operator->() const noexcept;
5554
PeriodicJobAnchor& operator*() const noexcept;
@@ -58,7 +57,7 @@ class PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded {
5857
void _init(ServiceContext* serviceContext);
5958

6059
inline static const auto _serviceDecoration =
61-
ServiceContext::declareDecoration<PeriodicThreadToDecreaseSnapshotHistoryIfNotNeeded>();
60+
ServiceContext::declareDecoration<PeriodicThreadToDecreaseSnapshotHistoryCachePressure>();
6261

6362
mutable stdx::mutex _mutex;
6463
std::shared_ptr<PeriodicJobAnchor> _anchor;

src/mongo/db/service_entry_point_common.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,6 @@ void execCommandDatabase(OperationContext* opCtx,
860860
// of successful future PIT atClusterTime requests.
861861
auto engine = opCtx->getServiceContext()->getStorageEngine();
862862
if (engine && engine->supportsReadConcernSnapshot()) {
863-
SnapshotWindowUtil::incrementSnapshotTooOldErrorCount();
864863
SnapshotWindowUtil::increaseTargetSnapshotWindowSize(opCtx);
865864
}
866865
} else {

src/mongo/db/snapshot_window_options.h

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -64,42 +64,49 @@ struct SnapshotWindowParams {
6464
AtomicWord<int> targetSnapshotHistoryWindowInSeconds{
6565
maxTargetSnapshotHistoryWindowInSeconds.load()};
6666

67+
// cachePressureThreshold (startup & runtime server parameter, range [0, 100]).
68+
//
69+
// Dictates what percentage of cache in use is considered too high. This setting helps preempt
70+
// storage cache pressure immobilizing the system. Attempts to increase
71+
// targetSnapshotHistoryWindowInSeconds will be ignored when the cache pressure reaches this
72+
// threshold. Additionally, a periodic task will decrease targetSnapshotHistoryWindowInSeconds
73+
// when cache pressure exceeds the threshold.
74+
AtomicWord<int> cachePressureThreshold{50};
75+
6776
// snapshotWindowMultiplicativeDecrease (startup & runtime server parameter, range (0,1)).
6877
//
69-
// Controls by what multiplier the target snapshot history window size setting is decreased
70-
// when there is cache pressure.
78+
// Controls by what multiplier the target snapshot history window setting is decreased when
79+
// cache pressure becomes too high, per the cachePressureThreshold setting.
7180
AtomicDouble snapshotWindowMultiplicativeDecrease{0.75};
7281

7382
// snapshotWindowAdditiveIncreaseSeconds (startup & runtime server parameter, range 1+).
7483
//
75-
// Controls by how much the target snapshot history window size setting is increased when we
76-
// need to service older snapshots for global point-in-time reads.
77-
AtomicWord<int> snapshotWindowAdditiveIncreaseSeconds{1};
84+
// Controls by how much the target snapshot history window setting is increased when cache
85+
// pressure is OK, per cachePressureThreshold, and we need to service older snapshots for global
86+
// point-in-time reads.
87+
AtomicWord<int> snapshotWindowAdditiveIncreaseSeconds{2};
7888

7989
// minMillisBetweenSnapshotWindowInc (startup & runtime server parameter, range 1+).
90+
// minMillisBetweenSnapshotWindowDec (startup & runtime server parameter, range 1+).
8091
//
81-
// Controls how often attempting to increase the target snapshot window will have an effect.
82-
// Multiple callers within minMillisBetweenSnapshotWindowInc will have the same effect as one.
83-
// This protects the system because it takes time for the target snapshot window to affect the
84-
// actual storage engine snapshot window. The stable timestamp must move forward for the window
85-
// between it and oldest timestamp to grow or shrink.
92+
// Controls how often attempting to increase/decrease the target snapshot window will have an
93+
// effect. Multiple callers within minMillisBetweenSnapshotWindowInc will have the same effect
94+
// as one. This protects the system because it takes time for the target snapshot window to
95+
// affect the actual storage engine snapshot window. The stable timestamp must move forward for
96+
// the window between it and oldest timestamp to grow or shrink.
8697
AtomicWord<int> minMillisBetweenSnapshotWindowInc{500};
98+
AtomicWord<int> minMillisBetweenSnapshotWindowDec{500};
8799

88-
// decreaseHistoryIfNotNeededPeriodSeconds (startup & runtime server paramter, range 1+)
100+
// checkCachePressurePeriodSeconds (startup & runtime server parameter, range 1+)
89101
//
90102
// Controls the period of the task that checks for cache pressure and decreases
91-
// targetSnapshotHistoryWindowInSeconds if there has been pressure and no new SnapshotTooOld
92-
// errors.
93-
//
94-
// This should not run very frequently. It is preferable to increase the window size, and cache
95-
// pressure, rather than failing PIT reads.
96-
AtomicWord<int> decreaseHistoryIfNotNeededPeriodSeconds{15};
97-
98-
static inline MutableObserverRegistry<decltype(
99-
decreaseHistoryIfNotNeededPeriodSeconds)::WordType>
100-
observeDecreaseHistoryIfNotNeededPeriodSeconds;
103+
// targetSnapshotHistoryWindowInSeconds if the pressure is above cachePressureThreshold. The
104+
// target window size setting must not be decreased too fast because time must be allowed for
105+
// the storage engine to attempt to act on the new setting.
106+
AtomicWord<int> checkCachePressurePeriodSeconds{5};
101107

102-
AtomicWord<long long> snapshotTooOldErrorCount{0};
108+
static inline MutableObserverRegistry<decltype(checkCachePressurePeriodSeconds)::WordType>
109+
observeCheckCachePressurePeriodSeconds;
103110
};
104111

105112
extern SnapshotWindowParams snapshotWindowParams;

src/mongo/db/snapshot_window_options.idl

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ server_parameters:
4141
cpp_varname: "snapshotWindowParams.maxTargetSnapshotHistoryWindowInSeconds"
4242
validator: { gte: 0 }
4343

44+
cachePressureThreshold:
45+
description: "Cache pressure threshold"
46+
set_at: [ startup, runtime ]
47+
cpp_varname: "snapshotWindowParams.cachePressureThreshold"
48+
validator:
49+
gte: 0
50+
lte: 100
51+
4452
snapshotWindowMultiplicativeDecrease:
4553
description: "Snapshot window multiplicative decrease"
4654
set_at: [ startup, runtime ]
@@ -61,9 +69,15 @@ server_parameters:
6169
cpp_varname: "snapshotWindowParams.minMillisBetweenSnapshotWindowInc"
6270
validator: { gte: 1 }
6371

64-
decreaseHistoryIfNotNeededPeriodSeconds:
72+
minMillisBetweenSnapshotWindowDec:
73+
description: "Minimum duration between snapshot window decrement, in milliseconds"
74+
set_at: [ startup, runtime ]
75+
cpp_varname: "snapshotWindowParams.minMillisBetweenSnapshotWindowDec"
76+
validator: { gte: 1 }
77+
78+
checkCachePressurePeriodSeconds:
6579
description: "Check cache pressure period, in seconds"
6680
set_at: [ startup, runtime ]
67-
cpp_varname: "snapshotWindowParams.decreaseHistoryIfNotNeededPeriodSeconds"
81+
cpp_varname: "snapshotWindowParams.checkCachePressurePeriodSeconds"
6882
validator: { gte: 1 }
69-
on_update: std::ref(SnapshotWindowParams::observeDecreaseHistoryIfNotNeededPeriodSeconds)
83+
on_update: std::ref(SnapshotWindowParams::observeCheckCachePressurePeriodSeconds)

src/mongo/db/snapshot_window_util.cpp

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,25 @@ namespace SnapshotWindowUtil {
5252
// Adds concurrency control to increaseTargetSnapshotWindowSize() and
5353
// decreaseTargetSnapshotWindowSize(). They should not run concurrently with themselves or one
5454
// another, since they act on and modify the same storage parameters. Further guards the static
55-
// variable "_snapshotWindowLastIncreasedAt" used in increaseTargetSnapshotWindowSize().
55+
// variables "_snapshotWindowLastDecreasedAt" and "_snapshotWindowLastIncreasedAt" used in
56+
// increaseTargetSnapshotWindowSize() and _decreaseTargetSnapshotWindowSize().
5657
stdx::mutex snapshotWindowMutex;
5758

5859
namespace {
5960

60-
void _decreaseTargetSnapshotWindowSize(WithLock, OperationContext* opCtx) {
61+
void _decreaseTargetSnapshotWindowSize(WithLock lock, OperationContext* opCtx) {
62+
// Tracks the last time that the snapshot window was decreased so that it does not go down so
63+
// fast that the system does not have time to react and reduce snapshot availability.
64+
static Date_t _snapshotWindowLastDecreasedAt{Date_t::min()};
65+
66+
if (_snapshotWindowLastDecreasedAt >
67+
(Date_t::now() -
68+
Milliseconds(snapshotWindowParams.minMillisBetweenSnapshotWindowDec.load()))) {
69+
// We have already decreased the window size in the last minMillisBetweenSnapshotWindowDec
70+
// milliseconds.
71+
return;
72+
}
73+
6174
snapshotWindowParams.targetSnapshotHistoryWindowInSeconds.store(
6275
snapshotWindowParams.targetSnapshotHistoryWindowInSeconds.load() *
6376
snapshotWindowParams.snapshotWindowMultiplicativeDecrease.load());
@@ -67,6 +80,8 @@ void _decreaseTargetSnapshotWindowSize(WithLock, OperationContext* opCtx) {
6780
StorageEngine* engine = opCtx->getServiceContext()->getStorageEngine();
6881
invariant(engine);
6982
engine->setOldestTimestampFromStable();
83+
84+
_snapshotWindowLastDecreasedAt = Date_t::now();
7085
}
7186

7287
} // namespace
@@ -90,6 +105,23 @@ void increaseTargetSnapshotWindowSize(OperationContext* opCtx) {
90105
return;
91106
}
92107

108+
// If the cache pressure is already too high, we will not put more pressure on it by increasing
109+
// the window size.
110+
StorageEngine* engine = opCtx->getServiceContext()->getStorageEngine();
111+
if (engine && engine->isCacheUnderPressure(opCtx)) {
112+
warning() << "Attempted to increase the time window of available snapshots for "
113+
"point-in-time operations (readConcern level 'snapshot' or transactions), but "
114+
"the storage engine cache pressure, per the cachePressureThreshold setting of "
115+
"'"
116+
<< snapshotWindowParams.cachePressureThreshold.load()
117+
<< "', is too high to allow it to increase. If this happens frequently, consider "
118+
"either increasing the cache pressure threshold or increasing the memory "
119+
"available to the storage engine cache, in order to improve the success rate "
120+
"or speed of point-in-time requests.";
121+
_decreaseTargetSnapshotWindowSize(lock, opCtx);
122+
return;
123+
}
124+
93125
if (snapshotWindowParams.targetSnapshotHistoryWindowInSeconds.load() ==
94126
snapshotWindowParams.maxTargetSnapshotHistoryWindowInSeconds.load()) {
95127
warning() << "Attempted to increase the time window of available snapshots for "
@@ -119,28 +151,10 @@ void decreaseTargetSnapshotWindowSize(OperationContext* opCtx) {
119151
stdx::unique_lock<stdx::mutex> lock(snapshotWindowMutex);
120152

121153
StorageEngine* engine = opCtx->getServiceContext()->getStorageEngine();
122-
if (engine) {
123-
static auto lastInsertsCount = 0;
124-
static auto lastSnapshotErrorCount = 0;
125-
126-
auto currentInsertsCount = engine->getCacheOverflowTableInsertCount(opCtx);
127-
auto currentSnapshotErrorCount = snapshotWindowParams.snapshotTooOldErrorCount.load();
128-
129-
// Only decrease the snapshot window size if there were writes to the cache overflow table
130-
// and there has been no new SnapshotTooOld errors in the same time period.
131-
if (currentInsertsCount > lastInsertsCount &&
132-
currentSnapshotErrorCount == lastSnapshotErrorCount) {
133-
_decreaseTargetSnapshotWindowSize(lock, opCtx);
134-
}
135-
136-
lastInsertsCount = currentInsertsCount;
137-
lastSnapshotErrorCount = currentSnapshotErrorCount;
154+
if (engine && engine->isCacheUnderPressure(opCtx)) {
155+
_decreaseTargetSnapshotWindowSize(lock, opCtx);
138156
}
139157
}
140158

141-
void incrementSnapshotTooOldErrorCount() {
142-
snapshotWindowParams.snapshotTooOldErrorCount.addAndFetch(1);
143-
}
144-
145159
} // namespace SnapshotWindowUtil
146160
} // namespace mongo

0 commit comments

Comments
 (0)