Skip to content

Commit f4f359e

Browse files
committed
client: restrict bufferlist to total write size
In linux, systems calls like write() anyways don't allow writes > 2GiB, the total write size passed to the Client::_write is clamped to INT_MAX but bufferlist is not handled. This edge case is patched here. bug that arises due to buffer list beyond INT_MAX stalls async i/o due to: unknown file: Failure C++ exception with description "End of buffer [buffer:2]" thrown in the test body. 2024-05-28T16:17:06.854+0530 7f9a5d24c9c0 2 client.4311 unmount 2024-05-28T16:17:06.854+0530 7f9a5d24c9c0 2 client.4311 unmounting which results in disconnected inode and cap leaks: 2024-05-28T16:17:11.855+0530 7f9a5d24c9c0 1 client.4311 dump_inode: DISCONNECTED inode 0x10000000001 #0x10000000001 ref 3 0x10000000001.head(faked_ino=0 nref=3 ll_ref=0 cap_refs={4=0,1024=1,4096=1,8192=2} open={3=0} mode=100666 size=0/4294967296 nlink=1 btime=2024-05-28T16:17:03.387546+0530 mtime=2024-05-28T16:17:03.387546+0530 ctime=2024-05-28T16:17:03.387546+0530 change_attr=0 caps=pAsxLsXsxFsxcrwb(0=pAsxLsXsxFsxcrwb) objectset[0x10000000001 ts 0/0 objects 1 dirty_or_tx 0] 0x7f9a2c009530) 2024-05-28T16:17:11.855+0530 7f9a5d24c9c0 2 client.4311 cache still has 0+1 items, waiting (for caps to release?) This commit changes the way Client::_write accepts data. So, now instead of accepting ptr, iovec array and iovcnt, the helper now accepts a bufferlist which should be contructed by the caller itself. The reason behind this change is simple - to declutter the API. For more context checkout this conversation ceph#58564 (comment) Fixes: https://tracker.ceph.com/issues/66245 Signed-off-by: Dhairya Parmar <[email protected]>
1 parent 28b15bd commit f4f359e

File tree

2 files changed

+31
-24
lines changed

2 files changed

+31
-24
lines changed

src/client/Client.cc

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11305,7 +11305,9 @@ int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
1130511305
#endif
1130611306
/* We can't return bytes written larger than INT_MAX, clamp size to that */
1130711307
size = std::min(size, (loff_t)INT_MAX);
11308-
int r = _write(fh, offset, size, buf, NULL, false);
11308+
bufferlist bl;
11309+
bl.append(buf, size);
11310+
int r = _write(fh, offset, size, std::move(bl));
1130911311
ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
1131011312
return r;
1131111313
}
@@ -11330,7 +11332,7 @@ int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
1133011332
if(iovcnt < 0) {
1133111333
return -CEPHFS_EINVAL;
1133211334
}
11333-
loff_t totallen = 0;
11335+
size_t totallen = 0;
1133411336
for (int i = 0; i < iovcnt; i++) {
1133511337
totallen += iov[i].iov_len;
1133611338
}
@@ -11340,12 +11342,29 @@ int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
1134011342
* 32-bit signed integers. Clamp the I/O sizes in those functions so that
1134111343
* we don't do I/Os larger than the values we can return.
1134211344
*/
11345+
bufferlist data;
1134311346
if (clamp_to_int) {
11344-
totallen = std::min(totallen, (loff_t)INT_MAX);
11347+
totallen = std::min(totallen, (size_t)INT_MAX);
11348+
size_t total_appended = 0;
11349+
for (int i = 0; i < iovcnt; i++) {
11350+
if (iov[i].iov_len > 0) {
11351+
if (total_appended + iov[i].iov_len >= totallen) {
11352+
data.append((const char *)iov[i].iov_base, totallen - total_appended);
11353+
break;
11354+
} else {
11355+
data.append((const char *)iov[i].iov_base, iov[i].iov_len);
11356+
total_appended += iov[i].iov_len;
11357+
}
11358+
}
11359+
}
11360+
} else {
11361+
for (int i = 0; i < iovcnt; i++) {
11362+
data.append((const char *)iov[i].iov_base, iov[i].iov_len);
11363+
}
1134511364
}
1134611365

1134711366
if (write) {
11348-
int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt, onfinish, do_fsync, syncdataonly);
11367+
int64_t w = _write(fh, offset, totallen, std::move(data), onfinish, do_fsync, syncdataonly);
1134911368
ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
1135011369
return w;
1135111370
} else {
@@ -11529,9 +11548,8 @@ bool Client::C_Write_Finisher::try_complete()
1152911548
return false;
1153011549
}
1153111550

11532-
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
11533-
const struct iovec *iov, int iovcnt, Context *onfinish,
11534-
bool do_fsync, bool syncdataonly)
11551+
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, bufferlist bl,
11552+
Context *onfinish, bool do_fsync, bool syncdataonly)
1153511553
{
1153611554
ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
1153711555

@@ -11601,19 +11619,6 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
1160111619
ceph_assert(in->inline_version > 0);
1160211620
}
1160311621

11604-
// copy into fresh buffer (since our write may be resub, async)
11605-
bufferlist bl;
11606-
if (buf) {
11607-
if (size > 0)
11608-
bl.append(buf, size);
11609-
} else if (iov){
11610-
for (int i = 0; i < iovcnt; i++) {
11611-
if (iov[i].iov_len > 0) {
11612-
bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
11613-
}
11614-
}
11615-
}
11616-
1161711622
int want, have;
1161811623
if (f->mode & CEPH_FILE_MODE_LAZY)
1161911624
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
@@ -16066,7 +16071,9 @@ int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
1606616071
tout(cct) << off << std::endl;
1606716072
tout(cct) << len << std::endl;
1606816073

16069-
int r = _write(fh, off, len, data, NULL, 0);
16074+
bufferlist bl;
16075+
bl.append(data, len);
16076+
int r = _write(fh, off, len, std::move(bl));
1607016077
ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
1607116078
<< dendl;
1607216079
return r;

src/client/Client.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1715,9 +1715,9 @@ class Client : public Dispatcher, public md_config_obs_t {
17151715
void do_readahead(Fh *f, Inode *in, uint64_t off, uint64_t len);
17161716
int64_t _write_success(Fh *fh, utime_t start, uint64_t fpos,
17171717
int64_t offset, uint64_t size, Inode *in);
1718-
int64_t _write(Fh *fh, int64_t offset, uint64_t size, const char *buf,
1719-
const struct iovec *iov, int iovcnt, Context *onfinish = nullptr,
1720-
bool do_fsync = false, bool syncdataonly = false);
1718+
int64_t _write(Fh *fh, int64_t offset, uint64_t size, bufferlist bl,
1719+
Context *onfinish = nullptr, bool do_fsync = false,
1720+
bool syncdataonly = false);
17211721
int64_t _preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
17221722
int iovcnt, int64_t offset,
17231723
bool write, bool clamp_to_int,

0 commit comments

Comments
 (0)