@@ -596,6 +596,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_truncate_seq = 0;
 	ci->i_truncate_size = 0;
 	ci->i_truncate_pending = 0;
+	ci->i_truncate_pagecache_size = 0;
 
 	ci->i_max_size = 0;
 	ci->i_reported_size = 0;
@@ -767,6 +768,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
 		     truncate_size);
 		ci->i_truncate_size = truncate_size;
+		if (IS_ENCRYPTED(inode))
+			ci->i_truncate_pagecache_size = size;
+		else
+			ci->i_truncate_pagecache_size = truncate_size;
 	}
 	return queue_trunc;
 }
@@ -2147,7 +2152,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 	/* there should be no reader or writer */
 	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
 
-	to = ci->i_truncate_size;
+	to = ci->i_truncate_pagecache_size;
 	wrbuffer_refs = ci->i_wrbuffer_ref;
 	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
 	     ci->i_truncate_pending, to);
@@ -2157,7 +2162,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 	truncate_pagecache(inode, to);
 
 	spin_lock(&ci->i_ceph_lock);
-	if (to == ci->i_truncate_size) {
+	if (to == ci->i_truncate_pagecache_size) {
 		ci->i_truncate_pending = 0;
 		finish = 1;
 	}
@@ -2241,6 +2246,144 @@ static const struct inode_operations ceph_encrypted_symlink_iops = {
 	.listxattr = ceph_listxattr,
 };
 
+/*
+ * Transfer the encrypted last block to the MDS, and the MDS
+ * will help update it when truncating to a smaller size.
+ *
+ * We don't support a PAGE_SIZE that is smaller than the
+ * CEPH_FSCRYPT_BLOCK_SIZE.
+ */
+static int fill_fscrypt_truncate(struct inode *inode,
+				 struct ceph_mds_request *req,
+				 struct iattr *attr)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+	loff_t pos, orig_pos = round_down(attr->ia_size,
+					  CEPH_FSCRYPT_BLOCK_SIZE);
+	u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+	struct ceph_pagelist *pagelist = NULL;
+	struct kvec iov = {0};
+	struct iov_iter iter;
+	struct page *page = NULL;
+	struct ceph_fscrypt_truncate_size_header header;
+	int retry_op = 0;
+	int len = CEPH_FSCRYPT_BLOCK_SIZE;
+	loff_t i_size = i_size_read(inode);
+	int got, ret, issued;
+	u64 objver;
+
+	ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+	if (ret < 0)
+		return ret;
+
+	issued = __ceph_caps_issued(ci, NULL);
+
+	dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
+	     i_size, attr->ia_size, ceph_cap_string(got),
+	     ceph_cap_string(issued));
+
+	/* Try to writeback the dirty pagecaches */
+	if (issued & (CEPH_CAP_FILE_BUFFER)) {
+		loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+
+		ret = filemap_write_and_wait_range(inode->i_mapping,
+						   orig_pos, lend);
+		if (ret < 0)
+			goto out;
+	}
+
+	page = __page_cache_alloc(GFP_KERNEL);
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+	if (!pagelist) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	iov.iov_base = kmap_local_page(page);
+	iov.iov_len = len;
+	iov_iter_kvec(&iter, READ, &iov, 1, len);
+
+	pos = orig_pos;
+	ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+	if (ret < 0)
+		goto out;
+
+	/* Insert the header first */
+	header.ver = 1;
+	header.compat = 1;
+	header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+
+	/*
+	 * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+	 * because the MDS may need it to do the truncate.
+	 */
+	header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+
+	/*
+	 * If we hit a hole here, we should just skip filling
+	 * the fscrypt payload for the request, because once
+	 * fscrypt is enabled the file is split into blocks of
+	 * CEPH_FSCRYPT_BLOCK_SIZE, so any hole must be a
+	 * multiple of the block size.
+	 *
+	 * If the RADOS object doesn't exist, objver will be
+	 * set to 0.
+	 */
+	if (!objver) {
+		dout("%s hit hole, ppos %lld < size %lld\n", __func__,
+		     pos, i_size);
+
+		header.data_len = cpu_to_le32(8 + 8 + 4);
+		header.file_offset = 0;
+		ret = 0;
+	} else {
+		header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+		header.file_offset = cpu_to_le64(orig_pos);
+
+		/* truncate and zero out the extra contents for the last block */
+		memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+
+		/* encrypt the last block */
+		ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+							 CEPH_FSCRYPT_BLOCK_SIZE,
+							 0, block,
+							 GFP_KERNEL);
+		if (ret)
+			goto out;
+	}
+
+	/* Insert the header */
+	ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+	if (ret)
+		goto out;
+
+	if (header.block_size) {
+		/* Append the last block contents to pagelist */
+		ret = ceph_pagelist_append(pagelist, iov.iov_base,
+					   CEPH_FSCRYPT_BLOCK_SIZE);
+		if (ret)
+			goto out;
+	}
+	req->r_pagelist = pagelist;
+out:
+	dout("%s %p size dropping cap refs on %s\n", __func__,
+	     inode, ceph_cap_string(got));
+	ceph_put_cap_refs(ci, got);
+	if (iov.iov_base)
+		kunmap_local(iov.iov_base);
+	if (page)
+		__free_pages(page, 0);
+	if (ret && pagelist)
+		ceph_pagelist_release(pagelist);
+	return ret;
+}
+
 int __ceph_setattr(struct inode *inode, struct iattr *attr,
 		   struct ceph_iattr *cia)
 {
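For reference, here is a standalone user-space sketch of the last-block arithmetic that fill_fscrypt_truncate() performs in the hunk above: it locates the fscrypt block that the new size falls into and the offset inside it that must be zeroed and re-encrypted. The 4096-byte block size, the macro names, and the sample size are assumptions for illustration only; they are not taken from the patch.

/* Sketch of the boff / orig_pos / block computation, assuming 4K fscrypt blocks. */
#include <stdio.h>
#include <stdint.h>

#define FSCRYPT_BLOCK_SIZE  4096ULL   /* assumed stand-in for CEPH_FSCRYPT_BLOCK_SIZE */
#define FSCRYPT_BLOCK_SHIFT 12        /* log2(4096) */

int main(void)
{
	uint64_t ia_size  = 10000;                          /* requested (unaligned) new size */
	uint64_t boff     = ia_size % FSCRYPT_BLOCK_SIZE;   /* offset inside the last block: 1808 */
	uint64_t orig_pos = ia_size - boff;                 /* round_down(): start of last block: 8192 */
	uint64_t block    = orig_pos >> FSCRYPT_BLOCK_SHIFT;/* logical block number: 2 */

	/* The tail of this block past boff is zeroed, re-encrypted and sent to the MDS. */
	printf("boff=%llu orig_pos=%llu block=%llu\n",
	       (unsigned long long)boff, (unsigned long long)orig_pos,
	       (unsigned long long)block);
	return 0;
}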
@@ -2249,13 +2392,17 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_cap_flush *prealloc_cf;
+	loff_t isize = i_size_read(inode);
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
 	int err = 0;
 	int inode_dirty_flags = 0;
 	bool lock_snap_rwsem = false;
+	bool fill_fscrypt;
+	int truncate_retry = 20; /* The RMW will take around 50ms */
 
+retry:
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
 		return -ENOMEM;
@@ -2267,6 +2414,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 		return PTR_ERR(req);
 	}
 
+	fill_fscrypt = false;
 	spin_lock(&ci->i_ceph_lock);
 	issued = __ceph_caps_issued(ci, NULL);
 
@@ -2388,10 +2536,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 		}
 	}
 	if (ia_valid & ATTR_SIZE) {
-		loff_t isize = i_size_read(inode);
-
 		dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
-		if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+		/*
+		 * Only when the new size is smaller and not aligned to
+		 * CEPH_FSCRYPT_BLOCK_SIZE is the RMW needed.
+		 */
+		if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+		    (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+			set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+			mask |= CEPH_SETATTR_FSCRYPT_FILE;
+			req->r_args.setattr.size =
+				cpu_to_le64(round_up(attr->ia_size,
+						     CEPH_FSCRYPT_BLOCK_SIZE));
+			req->r_args.setattr.old_size =
+				cpu_to_le64(round_up(isize,
+						     CEPH_FSCRYPT_BLOCK_SIZE));
+			req->r_fscrypt_file = attr->ia_size;
+			fill_fscrypt = true;
+		} else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
 			if (attr->ia_size > isize) {
 				i_size_write(inode, attr->ia_size);
 				inode->i_blocks = calc_inode_blocks(attr->ia_size);
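A quick illustration of the hunk above: when an encrypted inode shrinks to an unaligned size, the sizes reported to the MDS in the setattr arguments are rounded up to the fscrypt block boundary, while the real requested size travels separately in r_fscrypt_file. The 4K block size and the sample numbers below are assumed for illustration, not taken from the patch.

/* Rounded sizes sent to the MDS vs. the real size, assuming 4K fscrypt blocks. */
#include <stdio.h>
#include <stdint.h>

#define BLOCK 4096ULL  /* assumed stand-in for CEPH_FSCRYPT_BLOCK_SIZE */

static uint64_t round_up_blk(uint64_t v)
{
	return (v + BLOCK - 1) / BLOCK * BLOCK;
}

int main(void)
{
	uint64_t isize = 20000, ia_size = 10000;  /* shrink 20000 -> 10000 bytes */

	/* What the MDS sees: block-aligned sizes... */
	printf("setattr.size     = %llu\n", (unsigned long long)round_up_blk(ia_size)); /* 12288 */
	printf("setattr.old_size = %llu\n", (unsigned long long)round_up_blk(isize));   /* 20480 */
	/* ...while the real requested size is carried in r_fscrypt_file. */
	printf("r_fscrypt_file   = %llu\n", (unsigned long long)ia_size);               /* 10000 */
	return 0;
}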
@@ -2414,7 +2579,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 					cpu_to_le64(round_up(isize,
 							     CEPH_FSCRYPT_BLOCK_SIZE));
 				req->r_fscrypt_file = attr->ia_size;
-				/* FIXME: client must zero out any partial blocks! */
 			} else {
 				req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
 				req->r_args.setattr.old_size = cpu_to_le64(isize);
@@ -2481,8 +2645,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 
 	release &= issued;
 	spin_unlock(&ci->i_ceph_lock);
-	if (lock_snap_rwsem)
+	if (lock_snap_rwsem) {
 		up_read(&mdsc->snap_rwsem);
+		lock_snap_rwsem = false;
+	}
 
 	if (inode_dirty_flags)
 		__mark_inode_dirty(inode, inode_dirty_flags);
@@ -2494,7 +2660,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
 		req->r_args.setattr.mask = cpu_to_le32(mask);
 		req->r_num_caps = 1;
 		req->r_stamp = attr->ia_ctime;
+		if (fill_fscrypt) {
+			err = fill_fscrypt_truncate(inode, req, attr);
+			if (err)
+				goto out;
+		}
+
+		/*
+		 * The truncate request will return -EAGAIN when the
+		 * last block has been updated just before the MDS
+		 * successfully gets the xlock for the FILE lock. To
+		 * avoid corrupting the file contents we need to retry
+		 * it.
+		 */
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		if (err == -EAGAIN && truncate_retry--) {
+			dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
+			     inode, err, ceph_cap_string(dirtied), mask);
+			ceph_mdsc_put_request(req);
+			ceph_free_cap_flush(prealloc_cf);
+			goto retry;
+		}
 	}
 out:
 	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
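Finally, a minimal user-space sketch of the bounded retry pattern the last hunk adds around ceph_mdsc_do_request(): on -EAGAIN the whole setattr is retried a limited number of times, releasing the per-attempt request and cap-flush allocations before starting over. Everything below (do_request(), the simulated race, the printed summary) is hypothetical and only mirrors the control flow of the patch.

/* Bounded retry on -EAGAIN, mirroring the goto-retry loop in __ceph_setattr(). */
#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in for the MDS request; pretend the first two tries race */
static int do_request(int attempt)
{
	return attempt < 2 ? -EAGAIN : 0;
}

int main(void)
{
	int retries = 20;  /* same bound the patch uses for truncate_retry */
	int attempt = 0;
	int err;

retry:
	err = do_request(attempt++);
	if (err == -EAGAIN && retries--) {
		/* release per-attempt resources here, then start the whole operation over */
		goto retry;
	}
	printf("finished after %d attempt(s), err=%d\n", attempt, err);
	return 0;
}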