Skip to content

Commit 5c64737

Browse files
lxbsz authored and idryomov committed
ceph: add truncate size handling support for fscrypt
This transfers the encrypted last block contents to the MDS along with the truncate request, but only when the new size is smaller and not aligned to the fscrypt block size. When the last block is located in a file hole, the truncate request will contain only the header. The MDS may fail the truncate if another client or process has already updated the RADOS object that contains the last block; in that case it returns -EAGAIN and the kclient needs to retry. The RMW takes around 50ms, so let it retry 20 times for now. Signed-off-by: Xiubo Li <[email protected]> Reviewed-by: Jeff Layton <[email protected]> Reviewed-and-tested-by: Luís Henriques <[email protected]> Reviewed-by: Milind Changire <[email protected]> Signed-off-by: Ilya Dryomov <[email protected]>
1 parent d4d5188 commit 5c64737

File tree

4 files changed

+234
-12
lines changed

4 files changed

+234
-12
lines changed

fs/ceph/caps.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2960,10 +2960,9 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
29602960
* due to a small max_size, make sure we check_max_size (and possibly
29612961
* ask the mds) so we don't get hung up indefinitely.
29622962
*/
2963-
int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
2963+
int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
2964+
int want, loff_t endoff, int *got)
29642965
{
2965-
struct ceph_file_info *fi = filp->private_data;
2966-
struct inode *inode = file_inode(filp);
29672966
struct ceph_inode_info *ci = ceph_inode(inode);
29682967
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
29692968
int ret, _got, flags;
@@ -2972,7 +2971,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
29722971
if (ret < 0)
29732972
return ret;
29742973

2975-
if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2974+
if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
29762975
fi->filp_gen != READ_ONCE(fsc->filp_gen))
29772976
return -EBADF;
29782977

@@ -3025,7 +3024,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
30253024
continue;
30263025
}
30273026

3028-
if ((fi->fmode & CEPH_FILE_MODE_WR) &&
3027+
if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
30293028
fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
30303029
if (ret >= 0 && _got)
30313030
ceph_put_cap_refs(ci, _got);
@@ -3088,6 +3087,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
30883087
return 0;
30893088
}
30903089

3090+
/*
 * File-based entry point for taking cap references.
 *
 * Looks up the inode behind @filp and the ceph_file_info stored in
 * filp->private_data, then hands off to __ceph_get_caps() with the
 * remaining arguments passed through unchanged.  The return value is
 * whatever __ceph_get_caps() returns.
 */
int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
		  int *got)
{
	return __ceph_get_caps(file_inode(filp), filp->private_data,
			       need, want, endoff, got);
}
3098+
30913099
/*
30923100
* Take cap refs. Caller must already know we hold at least one ref
30933101
* on the caps in question or we don't know this is safe.

fs/ceph/crypto.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,27 @@ struct ceph_fname {
2626
bool no_copy;
2727
};
2828

29+
/*
30+
* Header for the crypted file when truncating the size, this
31+
* will be sent to MDS, and the MDS will update the encrypted
32+
* last block and then truncate the size.
33+
*/
34+
struct ceph_fscrypt_truncate_size_header {
35+
__u8 ver;
36+
__u8 compat;
37+
38+
/*
39+
* It will be sizeof(assert_ver + file_offset + block_size)
40+
* if the last block is empty when it's located in a file
41+
* hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE.
42+
*/
43+
__le32 data_len;
44+
45+
__le64 change_attr;
46+
__le64 file_offset;
47+
__le32 block_size;
48+
} __packed;
49+
2950
struct ceph_fscrypt_auth {
3051
__le32 cfa_version;
3152
__le32 cfa_blob_len;

fs/ceph/inode.c

Lines changed: 193 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
596596
ci->i_truncate_seq = 0;
597597
ci->i_truncate_size = 0;
598598
ci->i_truncate_pending = 0;
599+
ci->i_truncate_pagecache_size = 0;
599600

600601
ci->i_max_size = 0;
601602
ci->i_reported_size = 0;
@@ -767,6 +768,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
767768
dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
768769
truncate_size);
769770
ci->i_truncate_size = truncate_size;
771+
if (IS_ENCRYPTED(inode))
772+
ci->i_truncate_pagecache_size = size;
773+
else
774+
ci->i_truncate_pagecache_size = truncate_size;
770775
}
771776
return queue_trunc;
772777
}
@@ -2147,7 +2152,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
21472152
/* there should be no reader or writer */
21482153
WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
21492154

2150-
to = ci->i_truncate_size;
2155+
to = ci->i_truncate_pagecache_size;
21512156
wrbuffer_refs = ci->i_wrbuffer_ref;
21522157
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
21532158
ci->i_truncate_pending, to);
@@ -2157,7 +2162,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
21572162
truncate_pagecache(inode, to);
21582163

21592164
spin_lock(&ci->i_ceph_lock);
2160-
if (to == ci->i_truncate_size) {
2165+
if (to == ci->i_truncate_pagecache_size) {
21612166
ci->i_truncate_pending = 0;
21622167
finish = 1;
21632168
}
@@ -2241,6 +2246,144 @@ static const struct inode_operations ceph_encrypted_symlink_iops = {
22412246
.listxattr = ceph_listxattr,
22422247
};
22432248

2249+
/*
2250+
* Transfer the encrypted last block to the MDS and the MDS
2251+
* will help update it when truncating a smaller size.
2252+
*
2253+
* We don't support a PAGE_SIZE that is smaller than the
2254+
* CEPH_FSCRYPT_BLOCK_SIZE.
2255+
*/
2256+
static int fill_fscrypt_truncate(struct inode *inode,
2257+
struct ceph_mds_request *req,
2258+
struct iattr *attr)
2259+
{
2260+
struct ceph_inode_info *ci = ceph_inode(inode);
2261+
int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
2262+
loff_t pos, orig_pos = round_down(attr->ia_size,
2263+
CEPH_FSCRYPT_BLOCK_SIZE);
2264+
u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
2265+
struct ceph_pagelist *pagelist = NULL;
2266+
struct kvec iov = {0};
2267+
struct iov_iter iter;
2268+
struct page *page = NULL;
2269+
struct ceph_fscrypt_truncate_size_header header;
2270+
int retry_op = 0;
2271+
int len = CEPH_FSCRYPT_BLOCK_SIZE;
2272+
loff_t i_size = i_size_read(inode);
2273+
int got, ret, issued;
2274+
u64 objver;
2275+
2276+
ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
2277+
if (ret < 0)
2278+
return ret;
2279+
2280+
issued = __ceph_caps_issued(ci, NULL);
2281+
2282+
dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
2283+
i_size, attr->ia_size, ceph_cap_string(got),
2284+
ceph_cap_string(issued));
2285+
2286+
/* Try to writeback the dirty pagecaches */
2287+
if (issued & (CEPH_CAP_FILE_BUFFER)) {
2288+
loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
2289+
2290+
ret = filemap_write_and_wait_range(inode->i_mapping,
2291+
orig_pos, lend);
2292+
if (ret < 0)
2293+
goto out;
2294+
}
2295+
2296+
page = __page_cache_alloc(GFP_KERNEL);
2297+
if (page == NULL) {
2298+
ret = -ENOMEM;
2299+
goto out;
2300+
}
2301+
2302+
pagelist = ceph_pagelist_alloc(GFP_KERNEL);
2303+
if (!pagelist) {
2304+
ret = -ENOMEM;
2305+
goto out;
2306+
}
2307+
2308+
iov.iov_base = kmap_local_page(page);
2309+
iov.iov_len = len;
2310+
iov_iter_kvec(&iter, READ, &iov, 1, len);
2311+
2312+
pos = orig_pos;
2313+
ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
2314+
if (ret < 0)
2315+
goto out;
2316+
2317+
/* Insert the header first */
2318+
header.ver = 1;
2319+
header.compat = 1;
2320+
header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
2321+
2322+
/*
2323+
* Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
2324+
* because in MDS it may need this to do the truncate.
2325+
*/
2326+
header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
2327+
2328+
/*
2329+
* If we hit a hole here, we should just skip filling
2330+
* the fscrypt for the request, because once the fscrypt
2331+
* is enabled, the file will be split into many blocks
2332+
* with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
2333+
* has a hole, the hole size should be multiple of block
2334+
* size.
2335+
*
2336+
* If the Rados object doesn't exist, it will be set to 0.
2337+
*/
2338+
if (!objver) {
2339+
dout("%s hit hole, ppos %lld < size %lld\n", __func__,
2340+
pos, i_size);
2341+
2342+
header.data_len = cpu_to_le32(8 + 8 + 4);
2343+
header.file_offset = 0;
2344+
ret = 0;
2345+
} else {
2346+
header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
2347+
header.file_offset = cpu_to_le64(orig_pos);
2348+
2349+
/* truncate and zero out the extra contents for the last block */
2350+
memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
2351+
2352+
/* encrypt the last block */
2353+
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
2354+
CEPH_FSCRYPT_BLOCK_SIZE,
2355+
0, block,
2356+
GFP_KERNEL);
2357+
if (ret)
2358+
goto out;
2359+
}
2360+
2361+
/* Insert the header */
2362+
ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
2363+
if (ret)
2364+
goto out;
2365+
2366+
if (header.block_size) {
2367+
/* Append the last block contents to pagelist */
2368+
ret = ceph_pagelist_append(pagelist, iov.iov_base,
2369+
CEPH_FSCRYPT_BLOCK_SIZE);
2370+
if (ret)
2371+
goto out;
2372+
}
2373+
req->r_pagelist = pagelist;
2374+
out:
2375+
dout("%s %p size dropping cap refs on %s\n", __func__,
2376+
inode, ceph_cap_string(got));
2377+
ceph_put_cap_refs(ci, got);
2378+
if (iov.iov_base)
2379+
kunmap_local(iov.iov_base);
2380+
if (page)
2381+
__free_pages(page, 0);
2382+
if (ret && pagelist)
2383+
ceph_pagelist_release(pagelist);
2384+
return ret;
2385+
}
2386+
22442387
int __ceph_setattr(struct inode *inode, struct iattr *attr,
22452388
struct ceph_iattr *cia)
22462389
{
@@ -2249,13 +2392,17 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
22492392
struct ceph_mds_request *req;
22502393
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
22512394
struct ceph_cap_flush *prealloc_cf;
2395+
loff_t isize = i_size_read(inode);
22522396
int issued;
22532397
int release = 0, dirtied = 0;
22542398
int mask = 0;
22552399
int err = 0;
22562400
int inode_dirty_flags = 0;
22572401
bool lock_snap_rwsem = false;
2402+
bool fill_fscrypt;
2403+
int truncate_retry = 20; /* The RMW will take around 50ms */
22582404

2405+
retry:
22592406
prealloc_cf = ceph_alloc_cap_flush();
22602407
if (!prealloc_cf)
22612408
return -ENOMEM;
@@ -2267,6 +2414,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
22672414
return PTR_ERR(req);
22682415
}
22692416

2417+
fill_fscrypt = false;
22702418
spin_lock(&ci->i_ceph_lock);
22712419
issued = __ceph_caps_issued(ci, NULL);
22722420

@@ -2388,10 +2536,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
23882536
}
23892537
}
23902538
if (ia_valid & ATTR_SIZE) {
2391-
loff_t isize = i_size_read(inode);
2392-
23932539
dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
2394-
if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
2540+
/*
2541+
* The RMW is needed only when the new size is smaller and not
2542+
* aligned to CEPH_FSCRYPT_BLOCK_SIZE.
2543+
*/
2544+
if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
2545+
(attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
2546+
mask |= CEPH_SETATTR_SIZE;
2547+
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2548+
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2549+
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
2550+
mask |= CEPH_SETATTR_FSCRYPT_FILE;
2551+
req->r_args.setattr.size =
2552+
cpu_to_le64(round_up(attr->ia_size,
2553+
CEPH_FSCRYPT_BLOCK_SIZE));
2554+
req->r_args.setattr.old_size =
2555+
cpu_to_le64(round_up(isize,
2556+
CEPH_FSCRYPT_BLOCK_SIZE));
2557+
req->r_fscrypt_file = attr->ia_size;
2558+
fill_fscrypt = true;
2559+
} else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
23952560
if (attr->ia_size > isize) {
23962561
i_size_write(inode, attr->ia_size);
23972562
inode->i_blocks = calc_inode_blocks(attr->ia_size);
@@ -2414,7 +2579,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24142579
cpu_to_le64(round_up(isize,
24152580
CEPH_FSCRYPT_BLOCK_SIZE));
24162581
req->r_fscrypt_file = attr->ia_size;
2417-
/* FIXME: client must zero out any partial blocks! */
24182582
} else {
24192583
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
24202584
req->r_args.setattr.old_size = cpu_to_le64(isize);
@@ -2481,8 +2645,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24812645

24822646
release &= issued;
24832647
spin_unlock(&ci->i_ceph_lock);
2484-
if (lock_snap_rwsem)
2648+
if (lock_snap_rwsem) {
24852649
up_read(&mdsc->snap_rwsem);
2650+
lock_snap_rwsem = false;
2651+
}
24862652

24872653
if (inode_dirty_flags)
24882654
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -2494,7 +2660,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24942660
req->r_args.setattr.mask = cpu_to_le32(mask);
24952661
req->r_num_caps = 1;
24962662
req->r_stamp = attr->ia_ctime;
2663+
if (fill_fscrypt) {
2664+
err = fill_fscrypt_truncate(inode, req, attr);
2665+
if (err)
2666+
goto out;
2667+
}
2668+
2669+
/*
2670+
* The truncate request will return -EAGAIN when the
2671+
* last block has been updated just before the MDS
2672+
* successfully gets the xlock for the FILE lock. To
2673+
* avoid corrupting the file contents we need to retry
2674+
* it.
2675+
*/
24972676
err = ceph_mdsc_do_request(mdsc, NULL, req);
2677+
if (err == -EAGAIN && truncate_retry--) {
2678+
dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
2679+
inode, err, ceph_cap_string(dirtied), mask);
2680+
ceph_mdsc_put_request(req);
2681+
ceph_free_cap_flush(prealloc_cf);
2682+
goto retry;
2683+
}
24982684
}
24992685
out:
25002686
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,

fs/ceph/super.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,11 @@ struct ceph_inode_info {
424424
u32 i_truncate_seq; /* last truncate to smaller size */
425425
u64 i_truncate_size; /* and the size we last truncated down to */
426426
int i_truncate_pending; /* still need to call vmtruncate */
427+
/*
428+
* For the non-fscrypt case this equals i_truncate_size; otherwise
429+
* it equals fscrypt_file_size.
430+
*/
431+
u64 i_truncate_pagecache_size;
427432

428433
u64 i_max_size; /* max file size authorized by mds */
429434
u64 i_reported_size; /* (max_)size reported to or requested of mds */
@@ -1265,6 +1270,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
12651270
struct inode *dir,
12661271
int mds, int drop, int unless);
12671272

1273+
extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi,
1274+
int need, int want, loff_t endoff, int *got);
12681275
extern int ceph_get_caps(struct file *filp, int need, int want,
12691276
loff_t endoff, int *got);
12701277
extern int ceph_try_get_caps(struct inode *inode,

0 commit comments

Comments
 (0)