Skip to content

Commit 07025ab

Browse files
committed
Make libvirt able to trigger a backend image copy when needed
This teaches libvirt's RBD image backend about the outside world, that other ceph clusters may exist, and how to use Glance's multi-store image import-via-copy mechanism. The basic theory is that when we go to do the normal CoW clone for RBD, we do the "does this image have a location that matches my RBD backend?" check. If that check does not pass, if configured, we avoid failing and ask Glance to copy it to our store instead. After that has completed, we just recurse (once) and re-try our existing logic to see if the image is now in a reachable location. If so, we pass like we would have originally, and if not, we fail in the same way we would have. The copy-to-store logic sets up a looping poll to check for copy completion every N seconds according to a tunable, with a total timeout value in case it never completes. If the timeout expires or Glance reports failure, we will treat that the same as unreachable-due-to-location. Related to blueprint rbd-glance-multistore Change-Id: Ia839ad418b0f2887cb8e8f5ee3e660a0751db9ce
1 parent 4a6a366 commit 07025ab

File tree

4 files changed

+360
-1
lines changed

4 files changed

+360
-1
lines changed

nova/conf/libvirt.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,50 @@
940940
cfg.StrOpt('images_rbd_ceph_conf',
941941
default='', # default determined by librados
942942
help='Path to the ceph configuration file to use'),
943+
cfg.StrOpt('images_rbd_glance_store_name',
944+
default='',
945+
help="""
946+
The name of the Glance store that represents the rbd cluster in use by
947+
this node. If set, this will allow Nova to request that Glance copy an
948+
image from an existing non-local store into the one named by this option
949+
before booting so that proper Copy-on-Write behavior is maintained.
950+
951+
Related options:
952+
953+
* images_type - must be set to ``rbd``
954+
* images_rbd_glance_copy_poll_interval - controls the status poll frequency
955+
* images_rbd_glance_copy_timeout - controls the overall copy timeout
956+
"""),
957+
cfg.IntOpt('images_rbd_glance_copy_poll_interval',
958+
default=15,
959+
help="""
960+
The interval in seconds with which to poll Glance after asking for it
961+
to copy an image to the local rbd store. This affects how often we ask
962+
Glance to report on copy completion, and thus should be short enough that
963+
we notice quickly, but not so aggressive that we generate undue load on
964+
the Glance server.
965+
966+
Related options:
967+
968+
* images_type - must be set to ``rbd``
969+
* images_rbd_glance_store_name - must be set to a store name
970+
"""),
971+
cfg.IntOpt('images_rbd_glance_copy_timeout',
972+
default=600,
973+
help="""
974+
The overall maximum time we will wait for Glance to complete an image
975+
copy to our local rbd store. This should be long enough to allow large
976+
images to be copied over the network link between our local store and the
977+
one where images typically reside. The downside of setting this too long
978+
is just that it delays catching the case where the image copy is stalled or proceeding too
979+
slowly to be useful. Actual errors will be reported by Glance and noticed
980+
according to the poll interval.
981+
982+
Related options:
983+
* images_type - must be set to ``rbd``
984+
* images_rbd_glance_store_name - must be set to a store name
985+
* images_rbd_glance_copy_poll_interval - controls the failure time-to-notice
986+
"""),
943987
cfg.StrOpt('hw_disk_discard',
944988
choices=('ignore', 'unmap'),
945989
help="""

nova/tests/unit/virt/libvirt/test_imagebackend.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import mock
2626
from oslo_concurrency import lockutils
2727
from oslo_config import fixture as config_fixture
28+
from oslo_service import loopingcall
2829
from oslo_utils import imageutils
2930
from oslo_utils import units
3031
from oslo_utils import uuidutils
@@ -1768,6 +1769,230 @@ def test_cleanup_direct_snapshot_destroy_volume(self):
17681769
mock_destroy.assert_called_once_with(image.rbd_name,
17691770
pool=image.driver.pool)
17701771

1772+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1773+
def test_copy_to_store(self, mock_imgapi):
1774+
# Test copy_to_store() happy path where we ask for the image
1775+
# to be copied, it goes into progress and then completes.
1776+
self.flags(images_rbd_glance_copy_poll_interval=0,
1777+
group='libvirt')
1778+
self.flags(images_rbd_glance_store_name='store',
1779+
group='libvirt')
1780+
image = self.image_class(self.INSTANCE, self.NAME)
1781+
mock_imgapi.get.side_effect = [
1782+
# Simulate a race between starting the copy and the first poll
1783+
{'stores': []},
1784+
# Second poll shows it in progress
1785+
{'os_glance_importing_to_stores': ['store'],
1786+
'stores': []},
1787+
# Third poll shows it has also been copied to a non-local store
1788+
{'os_glance_importing_to_stores': ['store'],
1789+
'stores': ['other']},
1790+
# Should-be-last poll shows it complete
1791+
{'os_glance_importing_to_stores': [],
1792+
'stores': ['other', 'store']},
1793+
]
1794+
image.copy_to_store(self.CONTEXT, {'id': 'foo'})
1795+
mock_imgapi.copy_image_to_store.assert_called_once_with(
1796+
self.CONTEXT, 'foo', 'store')
1797+
self.assertEqual(4, mock_imgapi.get.call_count)
1798+
1799+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1800+
def test_copy_to_store_race_with_existing(self, mock_imgapi):
1801+
# Test copy_to_store() where we race to ask Glance to do the
1802+
# copy with another node. One of us will get a BadRequest, which
1803+
# should not cause us to fail. If our desired store is now
1804+
# in progress, continue to wait like we would have if we had
1805+
# won the race.
1806+
self.flags(images_rbd_glance_copy_poll_interval=0,
1807+
group='libvirt')
1808+
self.flags(images_rbd_glance_store_name='store',
1809+
group='libvirt')
1810+
image = self.image_class(self.INSTANCE, self.NAME)
1811+
1812+
mock_imgapi.copy_image_to_store.side_effect = (
1813+
exception.ImageBadRequest(image_id='foo',
1814+
response='already in progress'))
1815+
# Make the first poll indicate that the image has already
1816+
# been copied
1817+
mock_imgapi.get.return_value = {'stores': ['store', 'other']}
1818+
1819+
# Despite the (expected) exception from the copy, we should
1820+
# not raise here if the subsequent poll works.
1821+
image.copy_to_store(self.CONTEXT, {'id': 'foo'})
1822+
1823+
mock_imgapi.get.assert_called_once_with(self.CONTEXT,
1824+
'foo',
1825+
include_locations=True)
1826+
mock_imgapi.copy_image_to_store.assert_called_once_with(
1827+
self.CONTEXT, 'foo', 'store')
1828+
1829+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1830+
def test_copy_to_store_import_impossible(self, mock_imgapi):
1831+
# Test copy_to_store() where Glance tells us that the image
1832+
# is not copy-able for some reason (like it is not active yet
1833+
# or some other workflow reason).
1834+
image = self.image_class(self.INSTANCE, self.NAME)
1835+
mock_imgapi.copy_image_to_store.side_effect = (
1836+
exception.ImageImportImpossible(image_id='foo',
1837+
reason='because tests'))
1838+
self.assertRaises(exception.ImageUnacceptable,
1839+
image.copy_to_store,
1840+
self.CONTEXT, {'id': 'foo'})
1841+
1842+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1843+
def test_copy_to_store_import_failed_other_reason(self, mock_imgapi):
1844+
# Test copy_to_store() where some unexpected failure gets raised.
1845+
# We should bubble that up so it gets all the way back to the caller
1846+
# of the clone() itself, which can handle it independent of one of
1847+
# the image-specific exceptions.
1848+
image = self.image_class(self.INSTANCE, self.NAME)
1849+
mock_imgapi.copy_image_to_store.side_effect = test.TestingException
1850+
# Make sure any other exception makes it through, as those are already
1851+
# expected failures by the callers of the imagebackend code.
1852+
self.assertRaises(test.TestingException,
1853+
image.copy_to_store,
1854+
self.CONTEXT, {'id': 'foo'})
1855+
1856+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1857+
def test_copy_to_store_import_failed_in_progress(self, mock_imgapi):
1858+
# Test copy_to_store() in the situation where we ask for the copy,
1859+
# things start to look good (in progress) and later get reported
1860+
# as failed.
1861+
self.flags(images_rbd_glance_copy_poll_interval=0,
1862+
group='libvirt')
1863+
self.flags(images_rbd_glance_store_name='store',
1864+
group='libvirt')
1865+
image = self.image_class(self.INSTANCE, self.NAME)
1866+
mock_imgapi.get.side_effect = [
1867+
# First poll shows it in progress
1868+
{'os_glance_importing_to_stores': ['store'],
1869+
'stores': []},
1870+
# Second poll shows it failed
1871+
{'os_glance_failed_import': ['store'],
1872+
'stores': []},
1873+
]
1874+
exc = self.assertRaises(exception.ImageUnacceptable,
1875+
image.copy_to_store,
1876+
self.CONTEXT, {'id': 'foo'})
1877+
self.assertIn('unsuccessful because', str(exc))
1878+
1879+
@mock.patch.object(loopingcall.FixedIntervalWithTimeoutLoopingCall,
1880+
'start')
1881+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1882+
def test_copy_to_store_import_failed_timeout(self, mock_imgapi,
1883+
mock_timer_start):
1884+
# Test copy_to_store() simulating the case where we timeout waiting
1885+
# for Glance to do the copy.
1886+
self.flags(images_rbd_glance_store_name='store',
1887+
group='libvirt')
1888+
image = self.image_class(self.INSTANCE, self.NAME)
1889+
mock_timer_start.side_effect = loopingcall.LoopingCallTimeOut()
1890+
exc = self.assertRaises(exception.ImageUnacceptable,
1891+
image.copy_to_store,
1892+
self.CONTEXT, {'id': 'foo'})
1893+
self.assertIn('timed out', str(exc))
1894+
mock_imgapi.copy_image_to_store.assert_called_once_with(
1895+
self.CONTEXT, 'foo', 'store')
1896+
1897+
@mock.patch('nova.virt.libvirt.storage.rbd_utils.RBDDriver')
1898+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1899+
def test_clone_copy_to_store(self, mock_imgapi, mock_driver_):
1900+
# Call image.clone() in a way that will cause it to fall through
1901+
# the locations check to the copy-to-store behavior, and assert
1902+
# that after the copy, we recurse (without becoming infinite) and
1903+
# do the check again.
1904+
self.flags(images_rbd_glance_store_name='store', group='libvirt')
1905+
fake_image = {
1906+
'id': 'foo',
1907+
'disk_format': 'raw',
1908+
'locations': ['fake'],
1909+
}
1910+
mock_imgapi.get.return_value = fake_image
1911+
mock_driver = mock_driver_.return_value
1912+
mock_driver.is_cloneable.side_effect = [False, True]
1913+
image = self.image_class(self.INSTANCE, self.NAME)
1914+
with mock.patch.object(image, 'copy_to_store') as mock_copy:
1915+
image.clone(self.CONTEXT, 'foo')
1916+
mock_copy.assert_called_once_with(self.CONTEXT, fake_image)
1917+
mock_driver.is_cloneable.assert_has_calls([
1918+
# First call is the initial check
1919+
mock.call('fake', fake_image),
1920+
# Second call with the same location must be because we
1921+
# recursed after the copy-to-store operation
1922+
mock.call('fake', fake_image)])
1923+
1924+
@mock.patch('nova.virt.libvirt.storage.rbd_utils.RBDDriver')
1925+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1926+
def test_clone_copy_to_store_failed(self, mock_imgapi, mock_driver_):
1927+
# Call image.clone() in a way that will cause it to fall through
1928+
# the locations check to the copy-to-store behavior, but simulate
1929+
# some situation where we didn't actually copy the image and the
1930+
# recursed check does not succeed. Assert that we do not copy again,
1931+
# nor recurse again, and raise the expected error.
1932+
self.flags(images_rbd_glance_store_name='store', group='libvirt')
1933+
fake_image = {
1934+
'id': 'foo',
1935+
'disk_format': 'raw',
1936+
'locations': ['fake'],
1937+
}
1938+
mock_imgapi.get.return_value = fake_image
1939+
mock_driver = mock_driver_.return_value
1940+
mock_driver.is_cloneable.side_effect = [False, False]
1941+
image = self.image_class(self.INSTANCE, self.NAME)
1942+
with mock.patch.object(image, 'copy_to_store') as mock_copy:
1943+
self.assertRaises(exception.ImageUnacceptable,
1944+
image.clone, self.CONTEXT, 'foo')
1945+
mock_copy.assert_called_once_with(self.CONTEXT, fake_image)
1946+
mock_driver.is_cloneable.assert_has_calls([
1947+
# First call is the initial check
1948+
mock.call('fake', fake_image),
1949+
# Second call with the same location must be because we
1950+
# recursed after the copy-to-store operation
1951+
mock.call('fake', fake_image)])
1952+
1953+
@mock.patch('nova.virt.libvirt.storage.rbd_utils.RBDDriver')
1954+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1955+
def test_clone_without_needed_copy(self, mock_imgapi, mock_driver_):
1956+
# Call image.clone() in a way that will cause it to pass the locations
1957+
# check the first time. Assert that we do not call copy-to-store
1958+
# nor recurse.
1959+
self.flags(images_rbd_glance_store_name='store', group='libvirt')
1960+
fake_image = {
1961+
'id': 'foo',
1962+
'disk_format': 'raw',
1963+
'locations': ['fake'],
1964+
}
1965+
mock_imgapi.get.return_value = fake_image
1966+
mock_driver = mock_driver_.return_value
1967+
mock_driver.is_cloneable.return_value = True
1968+
image = self.image_class(self.INSTANCE, self.NAME)
1969+
with mock.patch.object(image, 'copy_to_store') as mock_copy:
1970+
image.clone(self.CONTEXT, 'foo')
1971+
mock_copy.assert_not_called()
1972+
mock_driver.is_cloneable.assert_called_once_with('fake', fake_image)
1973+
1974+
@mock.patch('nova.virt.libvirt.storage.rbd_utils.RBDDriver')
1975+
@mock.patch('nova.virt.libvirt.imagebackend.IMAGE_API')
1976+
def test_clone_copy_not_configured(self, mock_imgapi, mock_driver_):
1977+
# Call image.clone() in a way that will cause it to fail the locations
1978+
# check the first time. Assert that if the store name is not configured
1979+
# we do not try to copy-to-store and just raise the original exception
1980+
# indicating that the image is not reachable.
1981+
fake_image = {
1982+
'id': 'foo',
1983+
'disk_format': 'raw',
1984+
'locations': ['fake'],
1985+
}
1986+
mock_imgapi.get.return_value = fake_image
1987+
mock_driver = mock_driver_.return_value
1988+
mock_driver.is_cloneable.return_value = False
1989+
image = self.image_class(self.INSTANCE, self.NAME)
1990+
with mock.patch.object(image, 'copy_to_store') as mock_copy:
1991+
self.assertRaises(exception.ImageUnacceptable,
1992+
image.clone, self.CONTEXT, 'foo')
1993+
mock_copy.assert_not_called()
1994+
mock_driver.is_cloneable.assert_called_once_with('fake', fake_image)
1995+
17711996

17721997
class PloopTestCase(_ImageTestCase, test.NoDBTestCase):
17731998
SIZE = 1024

nova/virt/libvirt/imagebackend.py

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from oslo_concurrency import processutils
2626
from oslo_log import log as logging
2727
from oslo_serialization import jsonutils
28+
from oslo_service import loopingcall
2829
from oslo_utils import excutils
2930
from oslo_utils import fileutils
3031
from oslo_utils import strutils
@@ -958,7 +959,77 @@ def snapshot_extract(self, target, out_format):
958959
def is_shared_block_storage():
959960
return True
960961

961-
def clone(self, context, image_id_or_uri):
962+
def copy_to_store(self, context, image_meta):
    """Ask Glance to copy an image into our rbd store and wait for it.

    The target store is named by ``[libvirt]images_rbd_glance_store_name``.
    After requesting the copy, the image is polled every
    ``[libvirt]images_rbd_glance_copy_poll_interval`` seconds, for at most
    ``[libvirt]images_rbd_glance_copy_timeout`` seconds, until Glance
    lists our store for the image or reports the import as failed.

    :param context: request context passed through to the image API
    :param image_meta: image metadata dict; only its ``id`` key is read
    :raises: exception.ImageUnacceptable if Glance says the copy is
             impossible, reports the import to our store as failed, or
             the copy does not complete before the timeout
    """
    store_name = CONF.libvirt.images_rbd_glance_store_name
    image_id = image_meta['id']

    # Log the intent before issuing the request so that a request which
    # fails outright still leaves a trace of what was attempted.
    LOG.info('Asking glance to copy image %(image)s to our '
             'rbd store %(store)s',
             {'image': image_id,
              'store': store_name})
    try:
        IMAGE_API.copy_image_to_store(context, image_id, store_name)
    except exception.ImageBadRequest:
        # NOTE(danms): This means that we raced with another node to start
        # the copy. Fall through to polling the image for completion
        LOG.debug('Glance rejected the request to copy image %(image)s '
                  'to rbd store %(store)s; polling in case a copy is '
                  'already in progress',
                  {'image': image_id,
                   'store': store_name})
    except exception.ImageImportImpossible as exc:
        # NOTE(danms): This means we can not do this operation at all,
        # so fold this into the kind of imagebackend failure that is
        # expected by our callers
        raise exception.ImageUnacceptable(image_id=image_id,
                                          reason=str(exc)) from exc

    def _wait_for_copy():
        """Poll Glance once; stop the loop when the copy ends."""
        image = IMAGE_API.get(context, image_id, include_locations=True)
        if store_name in image.get('os_glance_failed_import', []):
            # Our store is reported as failed; the string becomes the
            # loop's result and is reported as the failure reason below
            raise loopingcall.LoopingCallDone('failed import')
        elif (store_name not in image.get('os_glance_importing_to_stores',
                                          []) and
              store_name in image['stores']):
            # No longer importing and our store is listed in the stores.
            # LoopingCallDone() with no argument yields a result of True.
            raise loopingcall.LoopingCallDone()
        else:
            LOG.debug('Glance reports copy of image %(image)s to '
                      'rbd store %(store)s is still in progress',
                      {'image': image_id,
                       'store': store_name})
            return True

    timer = loopingcall.FixedIntervalWithTimeoutLoopingCall(_wait_for_copy)

    # NOTE(danms): We *could* do something more complicated like try
    # to scale our polling interval based on image size. The problem with
    # that is that we do not get progress indication from Glance, so if
    # we scale our interval to something long, and happen to poll right
    # near the end of the copy, we will wait another long interval before
    # realizing that the copy is complete. A simple interval per compute
    # allows an operator to set this short on central/fast/inexpensive
    # computes, and longer on nodes that are remote/slow/expensive across
    # a slower link.
    interval = CONF.libvirt.images_rbd_glance_copy_poll_interval
    timeout = CONF.libvirt.images_rbd_glance_copy_timeout
    try:
        result = timer.start(interval=interval, timeout=timeout).wait()
    except loopingcall.LoopingCallTimeOut as exc:
        raise exception.ImageUnacceptable(
            image_id=image_id,
            reason='Copy to store %(store)s timed out' % {
                'store': store_name}) from exc

    # Anything other than True is the failure reason from _wait_for_copy
    if result is not True:
        raise exception.ImageUnacceptable(
            image_id=image_id,
            reason=('Copy to store %(store)s unsuccessful '
                    'because: %(reason)s') % {'store': store_name,
                                              'reason': result})

    LOG.info('Image %(image)s copied to rbd store %(store)s',
             {'image': image_id,
              'store': store_name})
1032+
def clone(self, context, image_id_or_uri, copy_to_store=True):
9621033
image_meta = IMAGE_API.get(context, image_id_or_uri,
9631034
include_locations=True)
9641035
locations = image_meta['locations']
@@ -975,6 +1046,12 @@ def clone(self, context, image_id_or_uri):
9751046
LOG.debug('Selected location: %(loc)s', {'loc': location})
9761047
return self.driver.clone(location, self.rbd_name)
9771048

1049+
# Not clone-able in our ceph, so try to get glance to copy it for us
1050+
# and then retry
1051+
if CONF.libvirt.images_rbd_glance_store_name and copy_to_store:
1052+
self.copy_to_store(context, image_meta)
1053+
return self.clone(context, image_id_or_uri, copy_to_store=False)
1054+
9781055
reason = _('No image locations are accessible')
9791056
raise exception.ImageUnacceptable(image_id=image_id_or_uri,
9801057
reason=reason)

0 commit comments

Comments
 (0)