Skip to content

Commit 915b78f

Browse files
ruby-oujo authored and
Evergreen Agent committed
Import wiredtiger: 15e567db3a1adbc9b520a479a9d99bd448a1bd9c from branch mongodb-master
ref: 13392cc987..15e567db3a for: 7.2.0-rc0 WT-10716 MVP implementation for automatic prefetching
1 parent 295c41f commit 915b78f

File tree

26 files changed

+1373
-826
lines changed

26 files changed

+1373
-826
lines changed

src/third_party/wiredtiger/bench/wtperf/track.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,15 +111,13 @@ uint64_t
111111
sum_scan_ops(WTPERF *wtperf)
112112
{
113113
CONFIG_OPTS *opts;
114-
uint64_t total;
115114

116115
opts = wtperf->opts;
117116

118-
if (opts->scan_interval > 0)
119-
total = wtperf->scanthreads->scan.ops;
120-
else
121-
total = 0;
122-
return (total);
117+
if (opts->scan_interval == 0)
118+
return (0);
119+
120+
return (wtperf->scanthreads->scan.ops);
123121
}
124122

125123
/*

src/third_party/wiredtiger/dist/api_data.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1218,6 +1218,10 @@ def __ge__(self, other):
12181218
permit sharing between processes (will automatically start an RPC server for primary
12191219
processes and use RPC for secondary processes). <b>Not yet supported in WiredTiger</b>''',
12201220
type='boolean'),
1221+
Config('prefetch', 'false', r'''
1222+
Enable automatic detection of scans by applications, and attempt to pre-fetch future
1223+
content into the cache''',
1224+
type='boolean'),
12211225
Config('readonly', 'false', r'''
12221226
open connection in read-only mode. The database must exist. All methods that may
12231227
modify a database are disabled. See @ref readonly for more information''',

src/third_party/wiredtiger/dist/filelist

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ src/btree/bt_import.c
3232
src/btree/bt_misc.c
3333
src/btree/bt_ovfl.c
3434
src/btree/bt_page.c
35+
src/btree/bt_prefetch.c
3536
src/btree/bt_random.c
3637
src/btree/bt_read.c
3738
src/btree/bt_ret.c
@@ -80,6 +81,7 @@ src/conn/conn_dhandle.c
8081
src/conn/conn_handle.c
8182
src/conn/conn_log.c
8283
src/conn/conn_open.c
84+
src/conn/conn_prefetch.c
8385
src/conn/conn_reconfig.c
8486
src/conn/conn_stat.c
8587
src/conn/conn_sweep.c
@@ -199,6 +201,7 @@ src/schema/schema_worker.c
199201
src/session/session_api.c
200202
src/session/session_compact.c
201203
src/session/session_dhandle.c
204+
src/session/session_prefetch.c
202205
src/rollback_to_stable/rts_api.c
203206
src/rollback_to_stable/rts_btree.c
204207
src/rollback_to_stable/rts_btree_walk.c

src/third_party/wiredtiger/dist/s_string.ok

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ PID
282282
POS
283283
POSIX
284284
PPC
285+
PREFETCH
285286
PRId
286287
PRIu
287288
PRNG
@@ -1083,6 +1084,7 @@ pre
10831084
pread
10841085
prealloc
10851086
precomp
1087+
prefetch
10861088
preload
10871089
prepend
10881090
prepended

src/third_party/wiredtiger/dist/stat_data.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,13 @@ def __init__(self, name, desc, flags=''):
236236
BlockCacheStat('block_cache_misses', 'number of misses'),
237237
BlockCacheStat('block_cache_not_evicted_overhead', 'number of blocks not evicted due to overhead'),
238238

239+
BlockCacheStat('block_prefetch_attempts', 'pre-fetch triggered by page read'),
240+
BlockCacheStat('block_prefetch_disk_one', 'pre-fetch not triggered after single disk read'),
241+
BlockCacheStat('block_prefetch_pages_queued', 'pre-fetch pages queued'),
242+
BlockCacheStat('block_prefetch_pages_read', 'pre-fetch pages read in background'),
243+
BlockCacheStat('block_prefetch_skipped', 'pre-fetch not triggered by page read'),
244+
BlockCacheStat('block_prefetch_pages_fail', 'pre-fetch page not on disk when reading'),
245+
239246
##########################################
240247
# Block manager statistics
241248
##########################################
@@ -270,6 +277,7 @@ def __init__(self, name, desc, flags=''):
270277
CacheStat('cache_eviction_app', 'pages evicted by application threads'),
271278
CacheStat('cache_eviction_app_dirty', 'modified pages evicted by application threads'),
272279
CacheStat('cache_eviction_clear_ordinary', 'pages removed from the ordinary queue to be queued for urgent eviction'),
280+
CacheStat('cache_eviction_consider_prefetch', 'pages considered for eviction that were brought in by pre-fetch', 'no_clear,no_scale'),
273281
CacheStat('cache_eviction_empty_score', 'eviction empty score', 'no_clear,no_scale'),
274282
CacheStat('cache_eviction_fail', 'pages selected for eviction unable to be evicted'),
275283
CacheStat('cache_eviction_fail_active_children_on_an_internal_page', 'pages selected for eviction unable to be evicted because of active children on an internal page'),
@@ -940,6 +948,7 @@ def __init__(self, name, desc, flags=''):
940948
CacheStat('cache_hs_write_squash', 'history store table writes requiring squashed modifies'),
941949
CacheStat('cache_inmem_split', 'in-memory page splits'),
942950
CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
951+
CacheStat('cache_pages_prefetch', 'pages requested from the cache due to pre-fetch'),
943952
CacheStat('cache_pages_requested', 'pages requested from the cache'),
944953
CacheStat('cache_read', 'pages read into cache'),
945954
CacheStat('cache_read_deleted', 'pages read into cache after truncate'),

src/third_party/wiredtiger/import.data

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"vendor": "wiredtiger",
33
"github": "wiredtiger/wiredtiger.git",
44
"branch": "mongodb-master",
5-
"commit": "13392cc987b86ff0d131ea20255f019f54f42521"
5+
"commit": "15e567db3a1adbc9b520a479a9d99bd448a1bd9c"
66
}

src/third_party/wiredtiger/src/block_cache/block_io.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ __wt_blkcache_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr,
7070
blkcache_found = found = false;
7171
skip_cache_put = (blkcache->type == WT_BLKCACHE_UNCONFIGURED);
7272

73+
WT_ASSERT_ALWAYS(session, session->dhandle != NULL, "The block cache requires a dhandle");
7374
/*
7475
* If anticipating a compressed or encrypted block, start with a scratch buffer and convert into
7576
* the caller's buffer. Else, start with the caller's buffer.
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*-
2+
* Copyright (c) 2014-present MongoDB, Inc.
3+
* Copyright (c) 2008-2014 WiredTiger, Inc.
4+
* All rights reserved.
5+
*
6+
* See the file LICENSE for redistribution information.
7+
*/
8+
9+
#include "wt_internal.h"
10+
11+
/*
12+
* __wt_btree_prefetch --
13+
* Pre-load a set of pages into the cache. This session holds a hazard pointer on the ref passed
14+
* in, so there must be a valid page and a valid parent page (though that parent could change if
15+
* a split happens).
16+
*/
17+
int
18+
__wt_btree_prefetch(WT_SESSION_IMPL *session, WT_REF *ref)
19+
{
20+
WT_CONNECTION_IMPL *conn;
21+
WT_DECL_RET;
22+
WT_REF *next_ref;
23+
uint64_t block_preload;
24+
25+
conn = S2C(session);
26+
block_preload = 0;
27+
28+
/*
29+
* FIXME-WT-11759 Consider whether we should have these asserts here or swallow up the errors
30+
* instead.
31+
*/
32+
WT_ASSERT_ALWAYS(session, F_ISSET(ref, WT_REF_FLAG_LEAF),
33+
"Pre-fetch starts with a leaf page and reviews the parent");
34+
35+
WT_ASSERT_ALWAYS(session, __wt_session_gen(session, WT_GEN_SPLIT) != 0,
36+
"Pre-fetch requires a split generation to traverse internal page(s)");
37+
38+
session->prefetch_prev_ref = ref;
39+
/* Load and decompress a set of pages into the block cache. */
40+
WT_INTL_FOREACH_BEGIN (session, ref->home, next_ref) {
41+
/* Don't let the pre-fetch queue get overwhelmed. */
42+
if (conn->prefetch_queue_count > WT_MAX_PREFETCH_QUEUE ||
43+
block_preload > WT_PREFETCH_QUEUE_PER_TRIGGER)
44+
break;
45+
46+
/*
47+
* Skip queuing pages that are already in cache or are internal. They aren't the pages we
48+
* are looking for. This pretty much assumes that all children of an internal page remain in
49+
* cache during the scan. If a previous pre-fetch of this internal page read a page in, then
50+
* that page was evicted and now a future page wants to be pre-fetched, this algorithm needs
51+
* a tweak. It would need to remember which child was last queued and start again from
52+
* there, rather than this approximation which assumes recently pre-fetched pages are still
53+
* in cache.
54+
*/
55+
if (next_ref->state == WT_REF_DISK && F_ISSET(next_ref, WT_REF_FLAG_LEAF)) {
56+
ret = __wt_conn_prefetch_queue_push(session, next_ref);
57+
if (ret == 0)
58+
++block_preload;
59+
else if (ret != EBUSY)
60+
WT_RET(ret);
61+
}
62+
}
63+
WT_INTL_FOREACH_END;
64+
65+
WT_STAT_CONN_INCRV(session, block_prefetch_pages_queued, block_preload);
66+
return (0);
67+
}
68+
69+
/*
70+
* __wt_prefetch_page_in --
71+
* Does the heavy lifting of reading a page into the cache. Immediately releases the page since
72+
* reading it in is the useful side effect here. Must be called while holding a dhandle.
73+
*/
74+
int
75+
__wt_prefetch_page_in(WT_SESSION_IMPL *session, WT_PREFETCH *pf)
76+
{
77+
WT_ADDR_COPY addr;
78+
79+
if (pf->ref->home != pf->first_home)
80+
__wt_verbose(
81+
session, WT_VERB_PREFETCH, "The home changed while queued for pre-fetch %s", "");
82+
83+
/*
84+
* FIXME-WT-11759 Consider whether we should have these asserts here or swallow up the errors
85+
* instead.
86+
*/
87+
WT_ASSERT_ALWAYS(session, pf->dhandle != NULL, "Pre-fetch needs to save a valid dhandle");
88+
WT_ASSERT_ALWAYS(
89+
session, !F_ISSET(pf->ref, WT_REF_FLAG_INTERNAL), "Pre-fetch should only see leaf pages");
90+
91+
if (pf->ref->state != WT_REF_DISK) {
92+
WT_STAT_CONN_INCR(session, block_prefetch_pages_fail);
93+
return (0);
94+
}
95+
96+
WT_STAT_CONN_INCR(session, block_prefetch_pages_read);
97+
98+
if (__wt_ref_addr_copy(session, pf->ref, &addr)) {
99+
WT_RET(__wt_page_in(session, pf->ref, WT_READ_PREFETCH));
100+
WT_RET(__wt_page_release(session, pf->ref, 0));
101+
} else
102+
return (WT_ERROR);
103+
104+
return (0);
105+
}

src/third_party/wiredtiger/src/btree/bt_read.c

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
194194
page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
195195
if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
196196
FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS);
197+
if (LF_ISSET(WT_READ_PREFETCH))
198+
FLD_SET(page_flags, WT_PAGE_PREFETCH);
197199
WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &notused, &prepare));
198200
tmp.mem = NULL;
199201
if (prepare)
@@ -282,6 +284,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
282284
if (!LF_ISSET(WT_READ_CACHE))
283285
WT_STAT_CONN_DATA_INCR(session, cache_pages_requested);
284286

287+
if (LF_ISSET(WT_READ_PREFETCH))
288+
WT_STAT_CONN_INCR(session, cache_pages_prefetch);
289+
285290
/*
286291
* If configured, free stashed memory more aggressively to encourage finding bugs in generation
287292
* tracking code.
@@ -321,11 +326,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
321326
* If configured to not trash the cache, leave the page generation unset, we'll set it
322327
* before returning to the oldest read generation, so the page is forcibly evicted as
323328
* soon as possible. We don't do that set here because we don't want to evict the page
324-
* before we "acquire" it.
329+
* before we "acquire" it. Also avoid queuing a pre-fetch page for forced eviction
330+
* before it has a chance of being used. Otherwise the work we've just done is wasted.
325331
*/
326332
wont_need = LF_ISSET(WT_READ_WONT_NEED) ||
327333
F_ISSET(session, WT_SESSION_READ_WONT_NEED) ||
328-
F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP);
334+
(!LF_ISSET(WT_READ_PREFETCH) && F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP));
329335
continue;
330336
case WT_REF_LOCKED:
331337
if (LF_ISSET(WT_READ_NO_WAIT))
@@ -434,6 +440,25 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
434440
}
435441

436442
skip_evict:
443+
page = ref->page;
444+
/*
445+
* Keep track of whether a session is reading leaf pages into the cache. This allows for
446+
* the session to decide whether pre-fetch would be helpful. It might not work if a
447+
* session has multiple cursors on different tables open, since the operations on
448+
* different tables get in the way of the heuristic. That isn't super likely - this is
449+
* to catch traversals through a btree, not complex multi-table user transactions.
450+
*/
451+
if (!LF_ISSET(WT_READ_PREFETCH) && F_ISSET(ref, WT_REF_FLAG_LEAF)) {
452+
/*
453+
* If the page was read by this retrieval or was pulled into the cache via the
454+
* pre-fetch mechanism, count that as a page read directly from disk.
455+
*/
456+
if (F_ISSET_ATOMIC_16(page, WT_PAGE_PREFETCH) ||
457+
page->read_gen == WT_READGEN_NOTSET)
458+
++session->prefetch_disk_read_count;
459+
else
460+
session->prefetch_disk_read_count = 0;
461+
}
437462
/*
438463
* If we read the page and are configured to not trash the cache, and no other thread
439464
* has already used the page, set the read generation so the page is evicted soon.
@@ -442,7 +467,6 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
442467
* generation and the page isn't already flagged for forced eviction, update the page
443468
* read generation.
444469
*/
445-
page = ref->page;
446470
if (page->read_gen == WT_READGEN_NOTSET) {
447471
if (wont_need)
448472
page->read_gen = WT_READGEN_WONT_NEED;

src/third_party/wiredtiger/src/btree/bt_walk.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,10 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp
402402
couple = NULL;
403403
*refp = ref;
404404
WT_ASSERT(session, ref != ref_orig);
405+
406+
if (__wt_session_prefetch_check(session, ref))
407+
WT_ERR(__wt_btree_prefetch(session, ref));
408+
405409
goto done;
406410
}
407411

@@ -470,6 +474,9 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp
470474
/* Success, so "couple" has been released. */
471475
couple = NULL;
472476

477+
if (__wt_session_prefetch_check(session, ref))
478+
WT_ERR(__wt_btree_prefetch(session, ref));
479+
473480
/* Return leaf pages to our caller. */
474481
if (F_ISSET(ref, WT_REF_FLAG_LEAF)) {
475482
*refp = ref;

0 commit comments

Comments
 (0)