@@ -201,9 +201,9 @@ BlueFS::BlueFS(CephContext* cct)
201201 : cct(cct),
202202 bdev(MAX_BDEV),
203203 ioc(MAX_BDEV),
204- block_reserved(MAX_BDEV),
205204 alloc(MAX_BDEV),
206- alloc_size(MAX_BDEV, 0 )
205+ alloc_size(MAX_BDEV, 0 ),
206+ locked_alloc(MAX_BDEV)
207207{
208208 dirty.pending_release .resize (MAX_BDEV);
209209 discard_cb[BDEV_WAL] = wal_discard_cb;
@@ -496,33 +496,28 @@ void BlueFS::_update_logger_stats()
496496int BlueFS::add_block_device (unsigned id, const string& path, bool trim,
497497 bluefs_shared_alloc_context_t * _shared_alloc)
498498{
499- uint64_t reserved;
500499 string dev_name;
501500 switch (id) {
502501 case BDEV_WAL:
503502 case BDEV_NEWWAL:
504- reserved = BDEV_LABEL_BLOCK_SIZE;
505503 dev_name = " wal" ;
506504 break ;
507505 case BDEV_DB:
508506 case BDEV_NEWDB:
509- reserved = SUPER_RESERVED;
510507 dev_name = " db" ;
511508 break ;
512509 case BDEV_SLOW:
513- reserved = 0 ;
514510 dev_name = " slow" ;
515511 break ;
516512 default :
517513 ceph_assert (false );
518514 }
519515 dout (10 ) << __func__ << " bdev " << id << " path " << path << " "
520- << " reserved " << reserved << dendl;
516+ << dendl;
521517 ceph_assert (id < bdev.size ());
522518 ceph_assert (bdev[id] == NULL );
523519 BlockDevice *b = BlockDevice::create (cct, path, NULL , NULL ,
524520 discard_cb[id], static_cast <void *>(this ), dev_name.c_str ());
525- block_reserved[id] = reserved;
526521 if (_shared_alloc) {
527522 b->set_no_exclusive_lock ();
528523 }
@@ -628,6 +623,35 @@ uint64_t BlueFS::get_free(unsigned id)
628623 return alloc[id]->get_free ();
629624}
630625
626+ uint64_t BlueFS::_get_minimal_reserved (unsigned id) const // Returns the fixed reserved size for device slot `id`, chosen purely by the id value.
627+ {
628+ uint64_t reserved = 0 ;
629+ switch (id) {
630+ case BDEV_WAL:
631+ case BDEV_NEWWAL:
632+ reserved = BDEV_LABEL_BLOCK_SIZE; // BDEV_WAL and BDEV_NEWWAL both reserve BDEV_LABEL_BLOCK_SIZE
633+ break ;
634+ case BDEV_DB:
635+ case BDEV_NEWDB:
636+ reserved = SUPER_RESERVED; // BDEV_DB and BDEV_NEWDB both reserve SUPER_RESERVED
637+ break ;
638+ case BDEV_SLOW:
639+ reserved = 0 ; // nothing is reserved for BDEV_SLOW
640+ break ;
641+ default :
642+ ceph_assert (false ); // any other id is rejected by assertion
643+ }
644+ return reserved;
645+ }
646+
647+ uint64_t BlueFS::get_full_reserved (unsigned id) // Returns locked_alloc[id].length plus the minimal reservation for `id`, or 0 when is_shared_alloc(id) is true.
648+ {
649+ if (!is_shared_alloc (id)) {
650+ return locked_alloc[id].length + _get_minimal_reserved (id); // locked extent length + per-device minimum
651+ }
652+ return 0 ; // ids for which is_shared_alloc(id) holds report no reservation here
653+ }
654+
631655void BlueFS::dump_perf_counters (Formatter *f)
632656{
633657 f->open_object_section (" bluefs_perf_counters" );
@@ -684,13 +708,13 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
684708 }
685709
686710 _init_logger ();
687- _init_alloc ();
688711
689712 super.version = 0 ;
690713 super.block_size = bdev[BDEV_DB]->get_block_size ();
691714 super.osd_uuid = osd_uuid;
692715 super.uuid .generate_random ();
693- dout (1 ) << __func__ << " uuid " << super.uuid << dendl;
716+
717+ _init_alloc ();
694718
695719 // init log
696720 FileRef log_file = ceph::make_ref<File>();
@@ -715,6 +739,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
715739 super.log_fnode = log_file->fnode ;
716740 super.memorized_layout = layout;
717741 _write_super (BDEV_DB);
742+ dout (1 ) << __func__ << " super " << super << dendl;
718743 _flush_bdev ();
719744
720745 // clean up
@@ -737,27 +762,10 @@ void BlueFS::_init_alloc()
737762{
738763 dout (20 ) << __func__ << dendl;
739764
740- // 'changed' should keep its previous value if no actual modification occurred
741- auto change_alloc_size = [this ](uint64_t & max_alloc_size,
742- uint64_t new_alloc, bool & changed) {
743- if (max_alloc_size == 0 ||
744- (max_alloc_size > new_alloc && ((new_alloc & (new_alloc -1 )) == 0 ))) {
745- max_alloc_size = new_alloc;
746- changed = true ;
747- dout (5 ) << " changed alloc_size to 0x" << std::hex << new_alloc << dendl;
748- } else if (max_alloc_size != new_alloc) {
749- derr << " can not change current alloc_size 0x" << std::hex
750- << max_alloc_size << " to new alloc_size 0x" << new_alloc << dendl;
751- }
752- };
753-
754- bool alloc_size_changed = false ;
755765 size_t wal_alloc_size = 0 ;
756766 if (bdev[BDEV_WAL]) {
757767 wal_alloc_size = cct->_conf ->bluefs_alloc_size ;
758768 alloc_size[BDEV_WAL] = wal_alloc_size;
759- change_alloc_size (super.bluefs_max_alloc_size [BDEV_WAL],
760- wal_alloc_size, alloc_size_changed);
761769 }
762770 logger->set (l_bluefs_wal_alloc_unit, wal_alloc_size);
763771
@@ -773,46 +781,27 @@ void BlueFS::_init_alloc()
773781 if (bdev[BDEV_SLOW]) {
774782 alloc_size[BDEV_DB] = cct->_conf ->bluefs_alloc_size ;
775783 alloc_size[BDEV_SLOW] = shared_alloc_size;
776- change_alloc_size (super.bluefs_max_alloc_size [BDEV_DB],
777- cct->_conf ->bluefs_alloc_size , alloc_size_changed);
778- change_alloc_size (super.bluefs_max_alloc_size [BDEV_SLOW],
779- shared_alloc_size, alloc_size_changed);
780784 } else {
781785 alloc_size[BDEV_DB] = shared_alloc_size;
782786 alloc_size[BDEV_SLOW] = 0 ;
783- change_alloc_size (super.bluefs_max_alloc_size [BDEV_DB],
784- shared_alloc_size, alloc_size_changed);
785787 }
786788 logger->set (l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
787789 logger->set (l_bluefs_slow_alloc_unit, alloc_size[BDEV_SLOW]);
788790 // new wal and db devices are never shared
789791 if (bdev[BDEV_NEWWAL]) {
790792 alloc_size[BDEV_NEWWAL] = cct->_conf ->bluefs_alloc_size ;
791- change_alloc_size (super.bluefs_max_alloc_size [BDEV_NEWWAL],
792- cct->_conf ->bluefs_alloc_size , alloc_size_changed);
793- }
794- if (alloc_size_changed) {
795- dout (1 ) << __func__ << " alloc_size changed, the new super is:" << super << dendl;
796- _write_super (BDEV_DB);
797793 }
798-
799- alloc_size_changed = false ;
800794 if (bdev[BDEV_NEWDB]) {
801795 alloc_size[BDEV_NEWDB] = cct->_conf ->bluefs_alloc_size ;
802- change_alloc_size (super.bluefs_max_alloc_size [BDEV_NEWDB],
803- cct->_conf ->bluefs_alloc_size , alloc_size_changed);
804- }
805- if (alloc_size_changed) {
806- dout (1 ) << __func__ << " alloc_size changed, the new super is:" << super << dendl;
807- _write_super (BDEV_NEWDB);
808796 }
809797
810798 for (unsigned id = 0 ; id < bdev.size (); ++id) {
811799 if (!bdev[id]) {
812800 continue ;
813801 }
814802 ceph_assert (bdev[id]->get_size ());
815- ceph_assert (super.bluefs_max_alloc_size [id]);
803+ locked_alloc[id] = bluefs_extent_t ();
804+
816805 if (is_shared_alloc (id)) {
817806 dout (1 ) << __func__ << " shared, id " << id << std::hex
818807 << " , capacity 0x" << bdev[id]->get_size ()
@@ -826,22 +815,39 @@ void BlueFS::_init_alloc()
826815 name += devnames[id];
827816 else
828817 name += to_string (uintptr_t (this ));
829- string alloc_type = cct->_conf ->bluefs_allocator ;
830818
819+ auto reserved = _get_minimal_reserved (id);
820+ uint64_t locked_offs = 0 ;
821+ {
822+ // Try to lock the trailing space on the device if the allocator-controlled
823+ // space isn't aligned with the recommended alloc unit.
824+ // The final decision on whether the locked tail is kept is made after
825+ // BlueFS replay, depending on existing allocations.
826+ uint64_t size0 = _get_total (id);
827+ uint64_t size = size0 - reserved;
828+ size = p2align (size, alloc_size[id]) + reserved;
829+ if (size < size0) {
830+ locked_offs = size;
831+ locked_alloc[id] = bluefs_extent_t (id, locked_offs, uint32_t (size0 - size));
832+ }
833+ }
834+ string alloc_type = cct->_conf ->bluefs_allocator ;
831835 dout (1 ) << __func__ << " new, id " << id << std::hex
832836 << " , allocator name " << name
833837 << " , allocator type " << alloc_type
834838 << " , capacity 0x" << bdev[id]->get_size ()
835- << " , reserved 0x" << block_reserved[id]
836- << " , block size 0x" << alloc_size[id]
837- << " , max alloc size 0x" << super.bluefs_max_alloc_size [id]
839+ << " , reserved 0x" << reserved
840+ << " , locked 0x" << locked_alloc[id].offset
841+ << " ~" << locked_alloc[id].length
842+ << " , block size 0x" << bdev[id]->get_block_size ()
843+ << " , alloc unit 0x" << alloc_size[id]
838844 << std::dec << dendl;
839845 alloc[id] = Allocator::create (cct, alloc_type,
840846 bdev[id]->get_size (),
841- super. bluefs_max_alloc_size [id],
847+ bdev [id]-> get_block_size () ,
842848 name);
843- auto reserved = block_reserved[id] ;
844- alloc[id]->init_add_free (reserved, _get_total (id) - reserved );
849+ uint64_t free_len = locked_offs ? locked_offs : _get_total (id) - reserved ;
850+ alloc[id]->init_add_free (reserved, free_len );
845851 }
846852 }
847853}
@@ -1045,6 +1051,7 @@ int BlueFS::mount()
10451051 derr << __func__ << " failed to open super: " << cpp_strerror (r) << dendl;
10461052 goto out;
10471053 }
1054+ dout (5 ) << __func__ << " super: " << super << dendl;
10481055
10491056 // set volume selector if not provided before/outside
10501057 if (vselector == nullptr ) {
@@ -1057,7 +1064,6 @@ int BlueFS::mount()
10571064
10581065 _init_alloc ();
10591066
1060- dout (5 ) << __func__ << " super: " << super << dendl;
10611067 r = _replay (false , false );
10621068 if (r < 0 ) {
10631069 derr << __func__ << " failed to replay log: " << cpp_strerror (r) << dendl;
@@ -1075,6 +1081,20 @@ int BlueFS::mount()
10751081 shared_alloc->bluefs_used += q.length ;
10761082 alloc[q.bdev ]->init_rm_free (q.offset , q.length );
10771083 } else if (!is_shared) {
1084+ if (locked_alloc[q.bdev ].length ) {
1085+ auto locked_offs = locked_alloc[q.bdev ].offset ;
1086+ if (q.offset + q.length > locked_offs) {
1087+ // we already have allocated extents in locked range,
1088+ // do not enforce this lock then.
1089+ bluefs_extent_t dummy;
1090+ std::swap (locked_alloc[q.bdev ], dummy);
1091+ alloc[q.bdev ]->init_add_free (dummy.offset , dummy.length );
1092+ dout (1 ) << __func__ << std::hex
1093+ << " unlocked at " << q.bdev
1094+ << " 0x" << dummy.offset << " ~" << dummy.length
1095+ << std::dec << dendl;
1096+ }
1097+ }
10781098 alloc[q.bdev ]->init_rm_free (q.offset , q.length );
10791099 }
10801100 }
@@ -1337,9 +1357,10 @@ int BlueFS::_replay(bool noop, bool to_stdout)
13371357 bool seen_recs = false ;
13381358
13391359 boost::dynamic_bitset<uint64_t > used_blocks[MAX_BDEV];
1360+ bool check_allocations = cct->_conf ->bluefs_log_replay_check_allocations ;
13401361
13411362 if (!noop) {
1342- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1363+ if (check_allocations ) {
13431364 for (size_t i = 0 ; i < MAX_BDEV; ++i) {
13441365 if (bdev[i] != nullptr ) {
13451366 // let's use minimal allocation unit we can have
@@ -1671,7 +1692,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
16711692 }
16721693 if (!noop) {
16731694 FileRef f = _get_file (fnode.ino );
1674- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1695+ if (check_allocations ) {
16751696 int r = _check_allocations (f->fnode ,
16761697 used_blocks, false , " OP_FILE_UPDATE" );
16771698 if (r < 0 ) {
@@ -1687,7 +1708,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
16871708 if (fnode.ino > ino_last) {
16881709 ino_last = fnode.ino ;
16891710 }
1690- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1711+ if (check_allocations ) {
16911712 int r = _check_allocations (f->fnode ,
16921713 used_blocks, true , " OP_FILE_UPDATE" );
16931714 if (r < 0 ) {
@@ -1721,7 +1742,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17211742 // be leanient, if there is no extents just produce error message
17221743 ceph_assert (delta.offset == fnode.allocated || delta.extents .empty ());
17231744 }
1724- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1745+ if (check_allocations ) {
17251746 int r = _check_allocations (fnode,
17261747 used_blocks, false , " OP_FILE_UPDATE_INC" );
17271748 if (r < 0 ) {
@@ -1746,7 +1767,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17461767 if (fnode.ino > ino_last) {
17471768 ino_last = fnode.ino ;
17481769 }
1749- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1770+ if (check_allocations ) {
17501771 int r = _check_allocations (f->fnode ,
17511772 used_blocks, true , " OP_FILE_UPDATE_INC" );
17521773 if (r < 0 ) {
@@ -1780,7 +1801,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17801801 auto p = nodes.file_map .find (ino);
17811802 ceph_assert (p != nodes.file_map .end ());
17821803 vselector->sub_usage (p->second ->vselector_hint , p->second ->fnode );
1783- if (cct-> _conf -> bluefs_log_replay_check_allocations ) {
1804+ if (check_allocations ) {
17841805 int r = _check_allocations (p->second ->fnode ,
17851806 used_blocks, false , " OP_FILE_REMOVE" );
17861807 if (r < 0 ) {
0 commit comments