Skip to content

Commit 038b8d1

Browse files
committed
libceph: optionally use bounce buffer on recv path in crc mode
Both msgr1 and msgr2 in crc mode are zero copy in the sense that message data is read from the socket directly into the destination buffer. We assume that the destination buffer is stable (i.e. remains unchanged while it is being read into) though. Otherwise, CRC errors ensue:

    libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706
    libceph: osd1 (1)192.168.122.1:6843 bad crc/signature

    libceph: bad data crc, calculated 57958023, expected 1805382778
    libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc

Introduce rxbounce option to enable use of a bounce buffer when receiving message data. In particular this is needed if a mapped image is a Windows VM disk, passed to QEMU. Windows has a system-wide "dummy" page that may be mapped into the destination buffer (potentially more than once into the same buffer) by the Windows Memory Manager in an effort to generate a single large I/O [1][2]. QEMU makes a point of preserving overlap relationships when cloning I/O vectors, so krbd gets exposed to this behaviour.

[1] "What Is Really in That MDL?" https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85)
[2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/

URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317
Signed-off-by: Ilya Dryomov <[email protected]>
Reviewed-by: Jeff Layton <[email protected]>
1 parent 2ea8871 commit 038b8d1

File tree

6 files changed

+105
-20
lines changed

6 files changed

+105
-20
lines changed

include/linux/ceph/libceph.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */
3636
#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */
3737
#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */
38+
#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */
3839

3940
#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
4041

include/linux/ceph/messenger.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,7 @@ struct ceph_connection {
461461
struct ceph_msg *out_msg; /* sending message (== tail of
462462
out_sent) */
463463

464+
struct page *bounce_page;
464465
u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
465466

466467
struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */

net/ceph/ceph_common.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ enum {
246246
Opt_cephx_sign_messages,
247247
Opt_tcp_nodelay,
248248
Opt_abort_on_full,
249+
Opt_rxbounce,
249250
};
250251

251252
enum {
@@ -295,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
295296
fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout),
296297
fsparam_enum ("read_from_replica", Opt_read_from_replica,
297298
ceph_param_read_from_replica),
299+
fsparam_flag ("rxbounce", Opt_rxbounce),
298300
fsparam_enum ("ms_mode", Opt_ms_mode,
299301
ceph_param_ms_mode),
300302
fsparam_string ("secret", Opt_secret),
@@ -584,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
584586
case Opt_abort_on_full:
585587
opt->flags |= CEPH_OPT_ABORT_ON_FULL;
586588
break;
589+
case Opt_rxbounce:
590+
opt->flags |= CEPH_OPT_RXBOUNCE;
591+
break;
587592

588593
default:
589594
BUG();
@@ -660,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
660665
seq_puts(m, "notcp_nodelay,");
661666
if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL))
662667
seq_puts(m, "abort_on_full,");
668+
if (opt->flags & CEPH_OPT_RXBOUNCE)
669+
seq_puts(m, "rxbounce,");
663670

664671
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
665672
seq_printf(m, "mount_timeout=%d,",

net/ceph/messenger.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
515515
ceph_msg_put(con->out_msg);
516516
con->out_msg = NULL;
517517
}
518+
if (con->bounce_page) {
519+
__free_page(con->bounce_page);
520+
con->bounce_page = NULL;
521+
}
518522

519523
if (ceph_msgr2(from_msgr(con->msgr)))
520524
ceph_con_v2_reset_protocol(con);

net/ceph/messenger_v1.c

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -992,18 +992,14 @@ static int read_partial_message_section(struct ceph_connection *con,
992992

993993
static int read_partial_msg_data(struct ceph_connection *con)
994994
{
995-
struct ceph_msg *msg = con->in_msg;
996-
struct ceph_msg_data_cursor *cursor = &msg->cursor;
995+
struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
997996
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
998997
struct page *page;
999998
size_t page_offset;
1000999
size_t length;
10011000
u32 crc = 0;
10021001
int ret;
10031002

1004-
if (!msg->num_data_items)
1005-
return -EIO;
1006-
10071003
if (do_datacrc)
10081004
crc = con->in_data_crc;
10091005
while (cursor->total_resid) {
@@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con)
10311027
return 1; /* must return > 0 to indicate success */
10321028
}
10331029

1030+
/*
 * Receive the data portion of the incoming message through a bounce page.
 *
 * Instead of receiving straight into the destination pages, each chunk is
 * received into con->bounce_page, folded into the running data CRC from
 * that private copy, and only then copied into the destination page taken
 * from the message's data cursor.  The CRC therefore never depends on the
 * contents of the destination buffer.
 *
 * Returns 1 once cursor->total_resid has dropped to zero, -ENOMEM if the
 * bounce page cannot be allocated, or the value returned by
 * ceph_tcp_recvpage() when that is <= 0.  The running CRC is stored back
 * in con->in_data_crc on both the <= 0 and the success paths.
 */
static int read_partial_msg_data_bounce(struct ceph_connection *con)
1031+
{
1032+
struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
1033+
struct page *page;
1034+
size_t off, len;
1035+
u32 crc;
1036+
int ret;
1037+
1038+
/* The bounce page is allocated on first use and kept on the connection. */
if (unlikely(!con->bounce_page)) {
1039+
con->bounce_page = alloc_page(GFP_NOIO);
1040+
if (!con->bounce_page) {
1041+
pr_err("failed to allocate bounce page\n");
1042+
return -ENOMEM;
1043+
}
1044+
}
1045+
1046+
/* Continue from the CRC accumulated by earlier calls for this message. */
crc = con->in_data_crc;
1047+
while (cursor->total_resid) {
1048+
/*
 * Nothing left in the current piece but data remains overall:
 * advance by zero bytes (presumably steps the cursor to the next
 * data item -- confirm against ceph_msg_data_advance()).
 */
if (!cursor->resid) {
1049+
ceph_msg_data_advance(cursor, 0);
1050+
continue;
1051+
}
1052+
1053+
/*
 * page/off/len describe the destination; the receive itself goes
 * to offset 0 of the bounce page, for at most len bytes.
 */
page = ceph_msg_data_next(cursor, &off, &len, NULL);
1054+
ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len);
1055+
if (ret <= 0) {
1056+
/* Save progress so a later call can resume the CRC. */
con->in_data_crc = crc;
1057+
return ret;
1058+
}
1059+
1060+
/*
 * Only ret bytes were received (may be fewer than len): checksum
 * them from the bounce page, then copy them to the destination.
 */
crc = crc32c(crc, page_address(con->bounce_page), ret);
1061+
memcpy_to_page(page, off, page_address(con->bounce_page), ret);
1062+
1063+
ceph_msg_data_advance(cursor, ret);
1064+
}
1065+
con->in_data_crc = crc;
1066+
1067+
return 1; /* must return > 0 to indicate success */
1068+
}
1069+
10341070
/*
10351071
* read (part of) a message.
10361072
*/
@@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con)
11411177

11421178
/* (page) data */
11431179
if (data_len) {
1144-
ret = read_partial_msg_data(con);
1180+
if (!m->num_data_items)
1181+
return -EIO;
1182+
1183+
if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
1184+
ret = read_partial_msg_data_bounce(con);
1185+
else
1186+
ret = read_partial_msg_data(con);
11451187
if (ret <= 0)
11461188
return ret;
11471189
}

net/ceph/messenger_v2.c

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,7 +1753,7 @@ static int prepare_read_control_remainder(struct ceph_connection *con)
17531753
return 0;
17541754
}
17551755

1756-
static void prepare_read_data(struct ceph_connection *con)
1756+
static int prepare_read_data(struct ceph_connection *con)
17571757
{
17581758
struct bio_vec bv;
17591759

@@ -1762,23 +1762,55 @@ static void prepare_read_data(struct ceph_connection *con)
17621762
data_len(con->in_msg));
17631763

17641764
get_bvec_at(&con->v2.in_cursor, &bv);
1765-
set_in_bvec(con, &bv);
1765+
if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1766+
if (unlikely(!con->bounce_page)) {
1767+
con->bounce_page = alloc_page(GFP_NOIO);
1768+
if (!con->bounce_page) {
1769+
pr_err("failed to allocate bounce page\n");
1770+
return -ENOMEM;
1771+
}
1772+
}
1773+
1774+
bv.bv_page = con->bounce_page;
1775+
bv.bv_offset = 0;
1776+
set_in_bvec(con, &bv);
1777+
} else {
1778+
set_in_bvec(con, &bv);
1779+
}
17661780
con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
1781+
return 0;
17671782
}
17681783

17691784
static void prepare_read_data_cont(struct ceph_connection *con)
17701785
{
17711786
struct bio_vec bv;
17721787

1773-
con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
1774-
con->v2.in_bvec.bv_page,
1775-
con->v2.in_bvec.bv_offset,
1776-
con->v2.in_bvec.bv_len);
1788+
if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1789+
con->in_data_crc = crc32c(con->in_data_crc,
1790+
page_address(con->bounce_page),
1791+
con->v2.in_bvec.bv_len);
1792+
1793+
get_bvec_at(&con->v2.in_cursor, &bv);
1794+
memcpy_to_page(bv.bv_page, bv.bv_offset,
1795+
page_address(con->bounce_page),
1796+
con->v2.in_bvec.bv_len);
1797+
} else {
1798+
con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
1799+
con->v2.in_bvec.bv_page,
1800+
con->v2.in_bvec.bv_offset,
1801+
con->v2.in_bvec.bv_len);
1802+
}
17771803

17781804
ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
17791805
if (con->v2.in_cursor.total_resid) {
17801806
get_bvec_at(&con->v2.in_cursor, &bv);
1781-
set_in_bvec(con, &bv);
1807+
if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
1808+
bv.bv_page = con->bounce_page;
1809+
bv.bv_offset = 0;
1810+
set_in_bvec(con, &bv);
1811+
} else {
1812+
set_in_bvec(con, &bv);
1813+
}
17821814
WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
17831815
return;
17841816
}
@@ -1791,14 +1823,13 @@ static void prepare_read_data_cont(struct ceph_connection *con)
17911823
con->v2.in_state = IN_S_HANDLE_EPILOGUE;
17921824
}
17931825

1794-
static void prepare_read_tail_plain(struct ceph_connection *con)
1826+
static int prepare_read_tail_plain(struct ceph_connection *con)
17951827
{
17961828
struct ceph_msg *msg = con->in_msg;
17971829

17981830
if (!front_len(msg) && !middle_len(msg)) {
17991831
WARN_ON(!data_len(msg));
1800-
prepare_read_data(con);
1801-
return;
1832+
return prepare_read_data(con);
18021833
}
18031834

18041835
reset_in_kvecs(con);
@@ -1817,6 +1848,7 @@ static void prepare_read_tail_plain(struct ceph_connection *con)
18171848
add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
18181849
con->v2.in_state = IN_S_HANDLE_EPILOGUE;
18191850
}
1851+
return 0;
18201852
}
18211853

18221854
static void prepare_read_enc_page(struct ceph_connection *con)
@@ -2699,8 +2731,7 @@ static int __handle_control(struct ceph_connection *con, void *p)
26992731
if (con_secure(con))
27002732
return prepare_read_tail_secure(con);
27012733

2702-
prepare_read_tail_plain(con);
2703-
return 0;
2734+
return prepare_read_tail_plain(con);
27042735
}
27052736

27062737
static int handle_preamble(struct ceph_connection *con)
@@ -2856,8 +2887,7 @@ static int populate_in_iter(struct ceph_connection *con)
28562887
ret = handle_control_remainder(con);
28572888
break;
28582889
case IN_S_PREPARE_READ_DATA:
2859-
prepare_read_data(con);
2860-
ret = 0;
2890+
ret = prepare_read_data(con);
28612891
break;
28622892
case IN_S_PREPARE_READ_DATA_CONT:
28632893
prepare_read_data_cont(con);

0 commit comments

Comments
 (0)