|
| 1 | +From 1b2c01185c7907a1d17c4d90cf3bcd73fe345658 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Jacob Anders < [email protected]> |
| 3 | +Date: Tue, 08 Jul 2025 16:25:40 +1000 |
| 4 | +Subject: [PATCH] Skip initial reboot to IPA when updating firmware out-of-band |
| 5 | + |
| 6 | +This change enables Ironic to skip initial reboot to IPA when |
| 7 | +performing out-of-band firmware. |
| 8 | + |
| 9 | +Change-Id: Id055a4ddbde3dbe336717e5f06ca6eb024b90c9f |
| 10 | +Signed-off-by: Jacob Anders < [email protected]> |
| 11 | +--- |
| 12 | + |
| 13 | +diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py |
| 14 | +index 8b385d7..6aaa4a5 100644 |
| 15 | +--- a/ironic/conductor/servicing.py |
| 16 | ++++ b/ironic/conductor/servicing.py |
| 17 | +@@ -38,6 +38,7 @@ |
| 18 | + :param disable_ramdisk: Whether to skip booting ramdisk for servicing. |
| 19 | + """ |
| 20 | + node = task.node |
| 21 | ++ ramdisk_needed = None |
| 22 | + try: |
| 23 | + # NOTE(ghe): Valid power and network values are needed to perform |
| 24 | + # a service operation. |
| 25 | +@@ -54,42 +55,53 @@ |
| 26 | + node.set_driver_internal_info('service_disable_ramdisk', |
| 27 | + disable_ramdisk) |
| 28 | + task.node.save() |
| 29 | +- |
| 30 | + utils.node_update_cache(task) |
| 31 | + |
| 32 | +- # Allow the deploy driver to set up the ramdisk again (necessary for IPA) |
| 33 | +- try: |
| 34 | +- if not disable_ramdisk: |
| 35 | +- prepare_result = task.driver.deploy.prepare_service(task) |
| 36 | +- else: |
| 37 | +- LOG.info('Skipping preparing for service in-band service since ' |
| 38 | +- 'out-of-band only service has been requested for node ' |
| 39 | +- '%s', node.uuid) |
| 40 | +- prepare_result = None |
| 41 | +- except Exception as e: |
| 42 | +- msg = (_('Failed to prepare node %(node)s for service: %(e)s') |
| 43 | +- % {'node': node.uuid, 'e': e}) |
| 44 | +- return utils.servicing_error_handler(task, msg, traceback=True) |
| 45 | +- |
| 46 | +- if prepare_result == states.SERVICEWAIT: |
| 47 | +- # Prepare is asynchronous, the deploy driver will need to |
| 48 | +- # set node.driver_internal_info['service_steps'] and |
| 49 | +- # node.service_step and then make an RPC call to |
| 50 | +- # continue_node_service to start service operations. |
| 51 | +- task.process_event('wait') |
| 52 | +- return |
| 53 | + try: |
| 54 | + conductor_steps.set_node_service_steps( |
| 55 | + task, disable_ramdisk=disable_ramdisk) |
| 56 | ++ except exception.InvalidParameterValue: |
| 57 | ++ if disable_ramdisk: |
| 58 | ++ # NOTE(janders) raising log severity since this will result |
| 59 | ++ # in servicing failure |
| 60 | ++ LOG.error('Failed to compose list of service steps for node ' |
| 61 | ++ '%s', node.uuid) |
| 62 | ++ raise |
| 63 | ++ LOG.debug('Unable to compose list of service steps for node %(node)s ' |
| 64 | ++ 'will retry once ramdisk is booted.', {'node': node.uuid}) |
| 65 | ++ ramdisk_needed = True |
| 66 | + except Exception as e: |
| 67 | +- # Catch all exceptions and follow the error handling |
| 68 | +- # path so things are cleaned up properly. |
| 69 | + msg = (_('Cannot service node %(node)s: %(msg)s') |
| 70 | + % {'node': node.uuid, 'msg': e}) |
| 71 | + return utils.servicing_error_handler(task, msg) |
| 72 | + |
| 73 | + steps = node.driver_internal_info.get('service_steps', []) |
| 74 | + step_index = 0 if steps else None |
| 75 | ++ for step in steps: |
| 76 | ++ step_requires_ramdisk = step.get('requires_ramdisk') |
| 77 | ++ if step_requires_ramdisk: |
| 78 | ++ LOG.debug('Found service step %s requiring ramdisk ' |
| 79 | ++ 'for node %s', step, task.node.uuid) |
| 80 | ++ ramdisk_needed = True |
| 81 | ++ |
| 82 | ++ if ramdisk_needed and not disable_ramdisk: |
| 83 | ++ try: |
| 84 | ++ prepare_result = task.driver.deploy.prepare_service(task) |
| 85 | ++ except Exception as e: |
| 86 | ++ msg = (_('Failed to prepare node %(node)s for service: %(e)s') |
| 87 | ++ % {'node': node.uuid, 'e': e}) |
| 88 | ++ return utils.servicing_error_handler(task, msg, traceback=True) |
| 89 | ++ if prepare_result == states.SERVICEWAIT: |
| 90 | ++ # Prepare is asynchronous, the deploy driver will need to |
| 91 | ++ # set node.driver_internal_info['service_steps'] and |
| 92 | ++ # node.service_step and then make an RPC call to |
| 93 | ++ # continue_node_service to start service operations. |
| 94 | ++ task.process_event('wait') |
| 95 | ++ return |
| 96 | ++ else: |
| 97 | ++ LOG.debug('Will proceed with servicing node %(node)s ' |
| 98 | ++ 'without booting the ramdisk.', {'node': node.uuid}) |
| 99 | ++ |
| 100 | + do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk) |
| 101 | + |
| 102 | + |
| 103 | +diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py |
| 104 | +index 39f75fbe..9112e6d 100644 |
| 105 | +--- a/ironic/conductor/steps.py |
| 106 | ++++ b/ironic/conductor/steps.py |
| 107 | +@@ -496,9 +496,11 @@ |
| 108 | + are accepted. |
| 109 | + :raises: InvalidParameterValue if there is a problem with the user's |
| 110 | + clean steps. |
| 111 | +- :raises: NodeCleaningFailure if there was a problem getting the |
| 112 | ++ :raises: NodeServicingFailure if there was a problem getting the |
| 113 | + service steps. |
| 114 | + """ |
| 115 | ++ # FIXME(janders) it seems NodeServicingFailure is never actually raised, |
| 116 | ++ # perhaps we should remove it? |
| 117 | + node = task.node |
| 118 | + steps = _validate_user_service_steps( |
| 119 | + task, node.driver_internal_info.get('service_steps', []), |
| 120 | +diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py |
| 121 | +index 8afb909..f501b33 100644 |
| 122 | +--- a/ironic/drivers/modules/deploy_utils.py |
| 123 | ++++ b/ironic/drivers/modules/deploy_utils.py |
| 124 | +@@ -1750,7 +1750,7 @@ |
| 125 | + task.driver.boot.prepare_ramdisk(task, deploy_opts) |
| 126 | + |
| 127 | + |
| 128 | +-def reboot_to_finish_step(task, timeout=None): |
| 129 | ++def reboot_to_finish_step(task, timeout=None, disable_ramdisk=None): |
| 130 | + """Reboot the node into IPA to finish a deploy/clean step. |
| 131 | + |
| 132 | + :param task: a TaskManager instance. |
| 133 | +@@ -1759,8 +1759,14 @@ |
| 134 | + :returns: states.CLEANWAIT if cleaning operation in progress |
| 135 | + or states.DEPLOYWAIT if deploy operation in progress. |
| 136 | + """ |
| 137 | +- disable_ramdisk = task.node.driver_internal_info.get( |
| 138 | +- 'cleaning_disable_ramdisk') |
| 139 | ++ if disable_ramdisk is None: |
| 140 | ++ if task.node.provision_state in [states.CLEANING, states.CLEANWAIT]: |
| 141 | ++ disable_ramdisk = task.node.driver_internal_info.get( |
| 142 | ++ 'cleaning_disable_ramdisk') |
| 143 | ++ elif task.node.provision_state in [states.SERVICING, |
| 144 | ++ states.SERVICEWAIT]: |
| 145 | ++ disable_ramdisk = task.node.driver_internal_info.get( |
| 146 | ++ 'service_disable_ramdisk') |
| 147 | + if not disable_ramdisk: |
| 148 | + if (manager_utils.is_fast_track(task) |
| 149 | + and not task.node.disable_power_off): |
| 150 | +diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py |
| 151 | +index 2500cd5..555a921 100644 |
| 152 | +--- a/ironic/drivers/modules/redfish/firmware.py |
| 153 | ++++ b/ironic/drivers/modules/redfish/firmware.py |
| 154 | +@@ -146,7 +146,7 @@ |
| 155 | + requires_ramdisk=True) |
| 156 | + @base.service_step(priority=0, abortable=False, |
| 157 | + argsinfo=_FW_SETTINGS_ARGSINFO, |
| 158 | +- requires_ramdisk=True) |
| 159 | ++ requires_ramdisk=False) |
| 160 | + @base.cache_firmware_components |
| 161 | + def update(self, task, settings): |
| 162 | + """Update the Firmware on the node using the settings for components. |
| 163 | +@@ -181,7 +181,8 @@ |
| 164 | + polling=True |
| 165 | + ) |
| 166 | + |
| 167 | +- return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval) |
| 168 | ++ return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval, |
| 169 | ++ disable_ramdisk=True) |
| 170 | + |
| 171 | + def _execute_firmware_update(self, node, update_service, settings): |
| 172 | + """Executes the next firmware update to the node |
| 173 | +diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py |
| 174 | +index 22c2612..b46394d 100644 |
| 175 | +--- a/ironic/tests/unit/conductor/test_servicing.py |
| 176 | ++++ b/ironic/tests/unit/conductor/test_servicing.py |
| 177 | +@@ -109,14 +109,21 @@ |
| 178 | + mock_validate.side_effect = exception.NetworkError() |
| 179 | + self.__do_node_service_validate_fail(mock_validate) |
| 180 | + |
| 181 | ++ @mock.patch.object(conductor_steps, 'set_node_service_steps', |
| 182 | ++ autospec=True) |
| 183 | + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', |
| 184 | + autospec=True) |
| 185 | + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', |
| 186 | + autospec=True) |
| 187 | + def test__do_node_service_prepare_service_fail(self, mock_prep, |
| 188 | + mock_validate, |
| 189 | +- service_steps=None): |
| 190 | +- # Exception from task.driver.deploy.prepare_cleaning should cause node |
| 191 | ++ mock_steps, |
| 192 | ++ service_steps=[]): |
| 193 | ++ # NOTE(janders) after removing unconditional initial reboot into |
| 194 | ++ # ramdisk, set_node_service_steps needs to InvalidParameterValue |
| 195 | ++ # to force boot into ramdisk |
| 196 | ++ mock_steps.side_effect = exception.InvalidParameterValue('error') |
| 197 | ++ # Exception from task.driver.deploy.prepare_service should cause node |
| 198 | + # to go to SERVICEFAIL |
| 199 | + mock_prep.side_effect = exception.InvalidParameterValue('error') |
| 200 | + tgt_prov_state = states.ACTIVE |
| 201 | +@@ -135,17 +142,76 @@ |
| 202 | + self.assertFalse(node.maintenance) |
| 203 | + self.assertIsNone(node.fault) |
| 204 | + |
| 205 | ++ @mock.patch.object(conductor_steps, 'set_node_service_steps', |
| 206 | ++ autospec=True) |
| 207 | + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', |
| 208 | + autospec=True) |
| 209 | + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', |
| 210 | + autospec=True) |
| 211 | + def test__do_node_service_prepare_service_wait(self, mock_prep, |
| 212 | +- mock_validate): |
| 213 | ++ mock_validate, |
| 214 | ++ mock_steps): |
| 215 | + service_steps = [ |
| 216 | + {'step': 'trigger_servicewait', 'priority': 10, |
| 217 | + 'interface': 'vendor'} |
| 218 | + ] |
| 219 | ++ mock_steps.side_effect = exception.InvalidParameterValue('error') |
| 220 | ++ mock_prep.return_value = states.SERVICEWAIT |
| 221 | ++ tgt_prov_state = states.ACTIVE |
| 222 | ++ node = obj_utils.create_test_node( |
| 223 | ++ self.context, driver='fake-hardware', |
| 224 | ++ provision_state=states.SERVICING, |
| 225 | ++ target_provision_state=tgt_prov_state, |
| 226 | ++ vendor_interface='fake') |
| 227 | ++ with task_manager.acquire( |
| 228 | ++ self.context, node.uuid, shared=False) as task: |
| 229 | ++ servicing.do_node_service(task, service_steps=service_steps) |
| 230 | ++ node.refresh() |
| 231 | ++ self.assertEqual(states.SERVICEWAIT, node.provision_state) |
| 232 | ++ self.assertEqual(tgt_prov_state, node.target_provision_state) |
| 233 | ++ mock_prep.assert_called_once_with(mock.ANY, mock.ANY) |
| 234 | ++ mock_validate.assert_called_once_with(mock.ANY, mock.ANY) |
| 235 | + |
| 236 | ++ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', |
| 237 | ++ autospec=True) |
| 238 | ++ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', |
| 239 | ++ autospec=True) |
| 240 | ++ def test__do_node_service_out_of_band(self, mock_prep, |
| 241 | ++ mock_validate): |
| 242 | ++ # NOTE(janders) this test ensures ramdisk isn't prepared if |
| 243 | ++ # steps do not require it |
| 244 | ++ service_steps = [ |
| 245 | ++ {'step': 'trigger_servicewait', 'priority': 10, |
| 246 | ++ 'interface': 'vendor'} |
| 247 | ++ ] |
| 248 | ++ mock_prep.return_value = states.SERVICEWAIT |
| 249 | ++ tgt_prov_state = states.ACTIVE |
| 250 | ++ node = obj_utils.create_test_node( |
| 251 | ++ self.context, driver='fake-hardware', |
| 252 | ++ provision_state=states.SERVICING, |
| 253 | ++ target_provision_state=tgt_prov_state, |
| 254 | ++ vendor_interface='fake') |
| 255 | ++ with task_manager.acquire( |
| 256 | ++ self.context, node.uuid, shared=False) as task: |
| 257 | ++ servicing.do_node_service(task, service_steps=service_steps) |
| 258 | ++ node.refresh() |
| 259 | ++ self.assertEqual(states.SERVICEWAIT, node.provision_state) |
| 260 | ++ self.assertEqual(tgt_prov_state, node.target_provision_state) |
| 261 | ++ mock_prep.assert_not_called() |
| 262 | ++ mock_validate.assert_called_once_with(mock.ANY, mock.ANY) |
| 263 | ++ |
| 264 | ++ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', |
| 265 | ++ autospec=True) |
| 266 | ++ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', |
| 267 | ++ autospec=True) |
| 268 | ++ def test__do_node_service_requires_ramdisk_fallback(self, mock_prep, |
| 269 | ++ mock_validate): |
| 270 | ++ # NOTE(janders): here we set 'requires_ramdisk': 'True' |
| 271 | ++ # to force ramdisk_needed to be set and trigger prepare ramdisk |
| 272 | ++ service_steps = [ |
| 273 | ++ {'step': 'trigger_servicewait', 'priority': 10, |
| 274 | ++ 'interface': 'vendor', 'requires_ramdisk': 'True'} |
| 275 | ++ ] |
| 276 | + mock_prep.return_value = states.SERVICEWAIT |
| 277 | + tgt_prov_state = states.ACTIVE |
| 278 | + node = obj_utils.create_test_node( |
| 279 | +@@ -194,11 +260,8 @@ |
| 280 | + @mock.patch.object(conductor_steps, 'set_node_service_steps', |
| 281 | + autospec=True) |
| 282 | + def __do_node_service_steps_fail(self, mock_steps, mock_validate, |
| 283 | +- service_steps=None, invalid_exc=True): |
| 284 | +- if invalid_exc: |
| 285 | +- mock_steps.side_effect = exception.InvalidParameterValue('invalid') |
| 286 | +- else: |
| 287 | +- mock_steps.side_effect = exception.NodeCleaningFailure('failure') |
| 288 | ++ service_steps=None): |
| 289 | ++ mock_steps.side_effect = exception.NodeCleaningFailure('failure') |
| 290 | + tgt_prov_state = states.ACTIVE |
| 291 | + node = obj_utils.create_test_node( |
| 292 | + self.context, driver='fake-hardware', |
| 293 | +@@ -224,12 +287,8 @@ |
| 294 | + def test_do_node_service_steps_fail_poweroff(self, mock_steps, |
| 295 | + mock_validate, |
| 296 | + mock_power, |
| 297 | +- service_steps=None, |
| 298 | +- invalid_exc=True): |
| 299 | +- if invalid_exc: |
| 300 | +- mock_steps.side_effect = exception.InvalidParameterValue('invalid') |
| 301 | +- else: |
| 302 | +- mock_steps.side_effect = exception.NodeCleaningFailure('failure') |
| 303 | ++ service_steps=None): |
| 304 | ++ mock_steps.side_effect = exception.NodeCleaningFailure('failure') |
| 305 | + tgt_prov_state = states.ACTIVE |
| 306 | + self.config(poweroff_in_cleanfail=True, group='conductor') |
| 307 | + node = obj_utils.create_test_node( |
| 308 | +@@ -249,9 +308,7 @@ |
| 309 | + self.assertFalse(mock_power.called) |
| 310 | + |
| 311 | + def test__do_node_service_steps_fail(self): |
| 312 | +- for invalid in (True, False): |
| 313 | +- self.__do_node_service_steps_fail(service_steps=[self.deploy_raid], |
| 314 | +- invalid_exc=invalid) |
| 315 | ++ self.__do_node_service_steps_fail(service_steps=[self.deploy_raid]) |
| 316 | + |
| 317 | + @mock.patch.object(conductor_steps, 'set_node_service_steps', |
| 318 | + autospec=True) |
| 319 | +diff --git a/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml |
| 320 | +new file mode 100644 |
| 321 | +index 0000000..296779a |
| 322 | +--- /dev/null |
| 323 | ++++ b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml |
| 324 | +@@ -0,0 +1,7 @@ |
| 325 | ++--- |
| 326 | ++fixes: |
| 327 | ++ - | |
| 328 | ++ Removes initial, unconditional reboot into ramdisk during servicing when |
| 329 | ++ not required by specified service steps. This reduces the total number of |
| 330 | ++ reboots needed when performing servicing, speeding up the process. |
| 331 | ++ |
0 commit comments