Skip to content

Commit a46fc40

Browse files
committed
Graceful recovery when attaching volume fails
When trying to attach a volume to an already running instance, the nova-api requests the nova-compute service to create a BlockDeviceMapping. If the nova-api does not receive a response within `rpc_response_timeout`, it will treat the request as failed and raise an exception. There are multiple cases where nova-compute has actually already processed the request and only the reply did not reach the nova-api in time (see bug report). After the failed request, the database will contain a BlockDeviceMapping entry for the volume + instance combination that will never be cleaned up. This entry also causes the nova-api to reject all future attachments of this volume to this instance (as it assumes the volume is already attached). To work around this, we check whether a BlockDeviceMapping has already been created when we see a messaging timeout. If this is the case, we can safely delete it, as the compute node has already finished processing the request and nothing will pick the entry up again. This allows users to retry the request. A previous fix was abandoned, but without a clear reason ([1]). [1]: https://review.opendev.org/c/openstack/nova/+/731804 Closes-Bug: 1960401 Change-Id: I17f4d7d2cb129c4ec1479cc4e5d723da75d3a527 (cherry picked from commit 9eb116b)
1 parent 12d01a9 commit a46fc40

File tree

3 files changed

+56
-4
lines changed

3 files changed

+56
-4
lines changed

nova/compute/api.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4802,10 +4802,24 @@ def _attach_volume(self, context, instance, volume, device,
48024802
This method is separated to make it possible for cells version
48034803
to override it.
48044804
"""
4805-
volume_bdm = self._create_volume_bdm(
4806-
context, instance, device, volume, disk_bus=disk_bus,
4807-
device_type=device_type, tag=tag,
4808-
delete_on_termination=delete_on_termination)
4805+
try:
4806+
volume_bdm = self._create_volume_bdm(
4807+
context, instance, device, volume, disk_bus=disk_bus,
4808+
device_type=device_type, tag=tag,
4809+
delete_on_termination=delete_on_termination)
4810+
except oslo_exceptions.MessagingTimeout:
4811+
# The compute node might have already created the attachment but
4812+
# we never received the answer. In this case it is safe to delete
4813+
# the attachment as nobody will ever pick it up again.
4814+
with excutils.save_and_reraise_exception():
4815+
try:
4816+
objects.BlockDeviceMapping.get_by_volume_and_instance(
4817+
context, volume['id'], instance.uuid).destroy()
4818+
LOG.debug("Delete BDM after compute did not respond to "
4819+
f"attachment request for volume {volume['id']}")
4820+
except exception.VolumeBDMNotFound:
4821+
LOG.debug("BDM not found, ignoring removal. "
4822+
f"Error attaching volume {volume['id']}")
48094823
try:
48104824
self._check_attach_and_reserve_volume(context, volume, instance,
48114825
volume_bdm,

nova/tests/unit/compute/test_api.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,36 @@ def test_tagged_volume_attach_new_flow(
520520
mock_attach.assert_called_once_with(self.context,
521521
instance, fake_bdm)
522522

523+
@mock.patch.object(compute_rpcapi.ComputeAPI, 'reserve_block_device_name')
524+
@mock.patch.object(
525+
objects.BlockDeviceMapping, 'get_by_volume_and_instance')
526+
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_volume')
527+
def test_attach_volume_reserve_bdm_timeout(
528+
self, mock_get_by_volume, mock_get_by_volume_and_instance,
529+
mock_reserve):
530+
mock_get_by_volume.side_effect = exception.VolumeBDMNotFound(
531+
volume_id='fake-volume-id')
532+
533+
fake_bdm = mock.MagicMock(spec=objects.BlockDeviceMapping)
534+
mock_get_by_volume_and_instance.return_value = fake_bdm
535+
instance = self._create_instance_obj()
536+
volume = fake_volume.fake_volume(1, 'test-vol', 'test-vol',
537+
None, None, None, None, None)
538+
539+
mock_reserve.side_effect = oslo_exceptions.MessagingTimeout()
540+
541+
mock_volume_api = mock.patch.object(self.compute_api, 'volume_api',
542+
mock.MagicMock(spec=cinder.API))
543+
544+
with mock_volume_api as mock_v_api:
545+
mock_v_api.get.return_value = volume
546+
self.assertRaises(oslo_exceptions.MessagingTimeout,
547+
self.compute_api.attach_volume,
548+
self.context, instance, volume['id'])
549+
mock_get_by_volume_and_instance.assert_called_once_with(
550+
self.context, volume['id'], instance.uuid)
551+
fake_bdm.destroy.assert_called_once_with()
552+
523553
@mock.patch.object(compute_rpcapi.ComputeAPI, 'reserve_block_device_name')
524554
@mock.patch.object(objects.BlockDeviceMappingList, 'get_by_volume')
525555
@mock.patch.object(compute_rpcapi.ComputeAPI, 'attach_volume')
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
fixes:
3+
- |
4+
Fixed `bug 1960401`_, which could cause invalid `BlockDeviceMappings`
5+
to accumulate in the database. These stale entries prevented the affected
6+
volumes from being attached to the instance again.
7+
8+
.. _bug 1960401: https://bugs.launchpad.net/nova/+bug/1960401

0 commit comments

Comments
 (0)