Skip to content

Commit 922a763

Browse files
committed
Merge tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs
Pull zonefs updates from Damien Le Moal: "Add an 'explicit-open' mount option to automatically issue a REQ_OP_ZONE_OPEN command to the device whenever a sequential zone file is opened for writing for the first time. This avoids 'insufficient zone resources' errors for write operations on some drives with limited zone resources or on ZNS drives with a limited number of active zones. From Johannes."
* tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
  - zonefs: document the explicit-open mount option
  - zonefs: open/close zone on file open/close
  - zonefs: provide no-lock zonefs_io_error variant
  - zonefs: introduce helper for zone management
2 parents 7cf726a + 48bfd5c commit 922a763

File tree

3 files changed

+233
-13
lines changed

3 files changed

+233
-13
lines changed

Documentation/filesystems/zonefs.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,21 @@ discover the amount of data that has been written to the zone. In the case of a
326326
read-only zone discovered at run-time, as indicated in the previous section.
327327
The size of the zone file is left unchanged from its last updated value.
328328

329+
A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on
330+
the number of zones that can be active, that is, zones that are in the
331+
implicit open, explicit open or closed conditions. This potential limitation
332+
translates into a risk for applications to see write IO errors due to this
333+
limit being exceeded if the zone of a file is not already active when a write
334+
request is issued by the user.
335+
336+
To avoid these potential errors, the "explicit-open" mount option forces zones
337+
to be made active using an open zone command when a file is opened for writing
338+
for the first time. If the zone open command succeeds, the application is then
339+
guaranteed that write requests can be processed. Conversely, the
340+
"explicit-open" mount option will result in a zone close command being issued
341+
to the device on the last close() of a zone file if the zone is neither full nor
342+
empty.
343+
329344
Zonefs User Space Tools
330345
=======================
331346

fs/zonefs/super.c

Lines changed: 208 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,39 @@
2424

2525
#include "zonefs.h"
2626

27+
static inline int zonefs_zone_mgmt(struct inode *inode,
28+
enum req_opf op)
29+
{
30+
struct zonefs_inode_info *zi = ZONEFS_I(inode);
31+
int ret;
32+
33+
lockdep_assert_held(&zi->i_truncate_mutex);
34+
35+
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
36+
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
37+
if (ret) {
38+
zonefs_err(inode->i_sb,
39+
"Zone management operation %s at %llu failed %d\n",
40+
blk_op_str(op), zi->i_zsector, ret);
41+
return ret;
42+
}
43+
44+
return 0;
45+
}
46+
47+
/*
 * Update the size of @inode to @isize and keep the explicit-open state of
 * its zone consistent with the new size.
 */
static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	i_size_write(inode, isize);

	/* Below the maximum file size the open state is left untouched. */
	if (isize < zi->i_max_size)
		return;

	/*
	 * The file reached its maximum size: a full zone is no longer
	 * open/active and does not need explicit closing, so drop the flag.
	 */
	zi->i_flags &= ~ZONEFS_ZONE_OPEN;
}
59+
2760
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
2861
unsigned int flags, struct iomap *iomap,
2962
struct iomap *srcmap)
@@ -301,6 +334,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
301334
}
302335
}
303336

337+
/*
338+
* If the filesystem is mounted with the explicit-open mount option, we
339+
* need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
340+
* the read-only or offline condition, to avoid attempting an explicit
341+
* close of the zone when the inode file is closed.
342+
*/
343+
if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
344+
(zone->cond == BLK_ZONE_COND_OFFLINE ||
345+
zone->cond == BLK_ZONE_COND_READONLY))
346+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
347+
304348
/*
305349
* If error=remount-ro was specified, any error result in remounting
306350
* the volume as read-only.
@@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
315359
* invalid data.
316360
*/
317361
zonefs_update_stats(inode, data_size);
318-
i_size_write(inode, data_size);
362+
zonefs_i_size_write(inode, data_size);
319363
zi->i_wpoffset = data_size;
320364

321365
return 0;
@@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
328372
* eventually correct the file size and zonefs inode write pointer offset
329373
* (which can be out of sync with the drive due to partial write failures).
330374
*/
331-
static void zonefs_io_error(struct inode *inode, bool write)
375+
static void __zonefs_io_error(struct inode *inode, bool write)
332376
{
333377
struct zonefs_inode_info *zi = ZONEFS_I(inode);
334378
struct super_block *sb = inode->i_sb;
@@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write)
342386
};
343387
int ret;
344388

345-
mutex_lock(&zi->i_truncate_mutex);
346-
347389
/*
348390
* Memory allocations in blkdev_report_zones() can trigger a memory
349391
* reclaim which may in turn cause a recursion into zonefs as well as
@@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write)
359401
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
360402
inode->i_ino, ret);
361403
memalloc_noio_restore(noio_flag);
404+
}
362405

406+
/*
 * Locked variant of __zonefs_io_error(): take the inode i_truncate_mutex,
 * run the error handling for @inode (with @write passed through unchanged)
 * and release the mutex again.
 */
static void zonefs_io_error(struct inode *inode, bool write)
{
	struct mutex *lock = &ZONEFS_I(inode)->i_truncate_mutex;

	mutex_lock(lock);
	__zonefs_io_error(inode, write);
	mutex_unlock(lock);
}
365414

@@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
397446
if (isize == old_isize)
398447
goto unlock;
399448

400-
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
401-
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
402-
if (ret) {
403-
zonefs_err(inode->i_sb,
404-
"Zone management operation at %llu failed %d",
405-
zi->i_zsector, ret);
449+
ret = zonefs_zone_mgmt(inode, op);
450+
if (ret)
406451
goto unlock;
452+
453+
/*
454+
* If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
455+
* take care of open zones.
456+
*/
457+
if (zi->i_flags & ZONEFS_ZONE_OPEN) {
458+
/*
459+
* Truncating a zone to EMPTY or FULL is the equivalent of
460+
* closing the zone. For a truncation to 0, we need to
461+
* re-open the zone to ensure new writes can be processed.
462+
* For a truncation to the maximum file size, the zone is
463+
* closed and writes cannot be accepted anymore, so clear
464+
* the open flag.
465+
*/
466+
if (!isize)
467+
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
468+
else
469+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
407470
}
408471

409472
zonefs_update_stats(inode, isize);
@@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
584647
mutex_lock(&zi->i_truncate_mutex);
585648
if (i_size_read(inode) < iocb->ki_pos + size) {
586649
zonefs_update_stats(inode, iocb->ki_pos + size);
587-
i_size_write(inode, iocb->ki_pos + size);
650+
zonefs_i_size_write(inode, iocb->ki_pos + size);
588651
}
589652
mutex_unlock(&zi->i_truncate_mutex);
590653
}
@@ -865,8 +928,128 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
865928
return ret;
866929
}
867930

931+
static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
932+
{
933+
struct zonefs_inode_info *zi = ZONEFS_I(inode);
934+
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
935+
936+
if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
937+
return false;
938+
939+
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
940+
return false;
941+
942+
if (!(file->f_mode & FMODE_WRITE))
943+
return false;
944+
945+
return true;
946+
}
947+
948+
static int zonefs_open_zone(struct inode *inode)
949+
{
950+
struct zonefs_inode_info *zi = ZONEFS_I(inode);
951+
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
952+
int ret = 0;
953+
954+
mutex_lock(&zi->i_truncate_mutex);
955+
956+
zi->i_wr_refcnt++;
957+
if (zi->i_wr_refcnt == 1) {
958+
959+
if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
960+
atomic_dec(&sbi->s_open_zones);
961+
ret = -EBUSY;
962+
goto unlock;
963+
}
964+
965+
if (i_size_read(inode) < zi->i_max_size) {
966+
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
967+
if (ret) {
968+
zi->i_wr_refcnt--;
969+
atomic_dec(&sbi->s_open_zones);
970+
goto unlock;
971+
}
972+
zi->i_flags |= ZONEFS_ZONE_OPEN;
973+
}
974+
}
975+
976+
unlock:
977+
mutex_unlock(&zi->i_truncate_mutex);
978+
979+
return ret;
980+
}
981+
982+
/*
 * File open operation: run the generic open checks, then explicitly open the
 * file zone when zonefs_file_use_exp_open() says this open requires it.
 *
 * Return: 0 on success, or the error from generic_file_open() or
 * zonefs_open_zone().
 */
static int zonefs_file_open(struct inode *inode, struct file *file)
{
	int err = generic_file_open(inode, file);

	if (err)
		return err;

	if (!zonefs_file_use_exp_open(inode, file))
		return 0;

	return zonefs_open_zone(inode);
}
995+
996+
static void zonefs_close_zone(struct inode *inode)
997+
{
998+
struct zonefs_inode_info *zi = ZONEFS_I(inode);
999+
int ret = 0;
1000+
1001+
mutex_lock(&zi->i_truncate_mutex);
1002+
zi->i_wr_refcnt--;
1003+
if (!zi->i_wr_refcnt) {
1004+
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
1005+
struct super_block *sb = inode->i_sb;
1006+
1007+
/*
1008+
* If the file zone is full, it is not open anymore and we only
1009+
* need to decrement the open count.
1010+
*/
1011+
if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
1012+
goto dec;
1013+
1014+
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
1015+
if (ret) {
1016+
__zonefs_io_error(inode, false);
1017+
/*
1018+
* Leaving zones explicitly open may lead to a state
1019+
* where most zones cannot be written (zone resources
1020+
* exhausted). So take preventive action by remounting
1021+
* read-only.
1022+
*/
1023+
if (zi->i_flags & ZONEFS_ZONE_OPEN &&
1024+
!(sb->s_flags & SB_RDONLY)) {
1025+
zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
1026+
sb->s_flags |= SB_RDONLY;
1027+
}
1028+
}
1029+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
1030+
dec:
1031+
atomic_dec(&sbi->s_open_zones);
1032+
}
1033+
mutex_unlock(&zi->i_truncate_mutex);
1034+
}
1035+
1036+
/*
 * File release operation. A zone that was explicitly opened for this file
 * must be closed again, but the close operation can fail (IO error, or zone
 * gone offline or read-only). Such failures are handled inside
 * zonefs_close_zone() and never reported here, so that close(2) does not
 * fail for user space.
 *
 * Return: always 0.
 */
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	if (!zonefs_file_use_exp_open(inode, file))
		return 0;

	zonefs_close_zone(inode);

	return 0;
}
1049+
8681050
static const struct file_operations zonefs_file_operations = {
869-
.open = generic_file_open,
1051+
.open = zonefs_file_open,
1052+
.release = zonefs_file_release,
8701053
.fsync = zonefs_file_fsync,
8711054
.mmap = zonefs_file_mmap,
8721055
.llseek = zonefs_file_llseek,
@@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
8901073
inode_init_once(&zi->i_vnode);
8911074
mutex_init(&zi->i_truncate_mutex);
8921075
init_rwsem(&zi->i_mmap_sem);
1076+
zi->i_wr_refcnt = 0;
8931077

8941078
return &zi->i_vnode;
8951079
}
@@ -940,14 +1124,15 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
9401124

9411125
enum {
9421126
Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
943-
Opt_err,
1127+
Opt_explicit_open, Opt_err,
9441128
};
9451129

9461130
static const match_table_t tokens = {
9471131
{ Opt_errors_ro, "errors=remount-ro"},
9481132
{ Opt_errors_zro, "errors=zone-ro"},
9491133
{ Opt_errors_zol, "errors=zone-offline"},
9501134
{ Opt_errors_repair, "errors=repair"},
1135+
{ Opt_explicit_open, "explicit-open" },
9511136
{ Opt_err, NULL}
9521137
};
9531138

@@ -984,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options)
9841169
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
9851170
sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
9861171
break;
1172+
case Opt_explicit_open:
1173+
sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
1174+
break;
9871175
default:
9881176
return -EINVAL;
9891177
}
@@ -1403,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
14031591
sbi->s_gid = GLOBAL_ROOT_GID;
14041592
sbi->s_perm = 0640;
14051593
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
1594+
sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
1595+
atomic_set(&sbi->s_open_zones, 0);
1596+
if (!sbi->s_max_open_zones &&
1597+
sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
1598+
zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
1599+
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
1600+
}
14061601

14071602
ret = zonefs_read_super(sb);
14081603
if (ret)

fs/zonefs/zonefs.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
3838
return ZONEFS_ZTYPE_SEQ;
3939
}
4040

41+
#define ZONEFS_ZONE_OPEN (1 << 0)
42+
4143
/*
4244
* In-memory inode data.
4345
*/
@@ -74,6 +76,10 @@ struct zonefs_inode_info {
7476
*/
7577
struct mutex i_truncate_mutex;
7678
struct rw_semaphore i_mmap_sem;
79+
80+
/* guarded by i_truncate_mutex */
81+
unsigned int i_wr_refcnt;
82+
unsigned int i_flags;
7783
};
7884

7985
static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
@@ -154,6 +160,7 @@ enum zonefs_features {
154160
#define ZONEFS_MNTOPT_ERRORS_MASK \
155161
(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
156162
ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
163+
#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */
157164

158165
/*
159166
* In-memory Super block information.
@@ -175,6 +182,9 @@ struct zonefs_sb_info {
175182

176183
loff_t s_blocks;
177184
loff_t s_used_blocks;
185+
186+
unsigned int s_max_open_zones;
187+
atomic_t s_open_zones;
178188
};
179189

180190
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)

0 commit comments

Comments
 (0)