Skip to content

Commit 07394e4

Browse files
authored
Improve NetBox sync lock error handling to prevent blocking (#1852)
This change improves error handling when NetBox sync operations fail to acquire locks. Previously, a lock acquisition timeout could hold up the entire sync, which then continued without tracking the failure.

Now:

1. The NetBox functions (set_maintenance, set_provision_state, set_power_state) return True or False depending on whether the lock was acquired.
2. sync_netbox_from_ironic tracks the devices that failed and reports them at the end.
3. The sync continues with the remaining devices when an individual device fails.
4. Failed devices are reported with a clear warning message.

This prevents silent failures and gives better visibility into which devices failed to sync because of lock timeouts, while allowing the sync operation to continue for the other devices.

AI-assisted: Claude Code
Signed-off-by: Christian Berendt <[email protected]>
1 parent 884f29a commit 07394e4

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

osism/tasks/conductor/ironic.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ def sync_netbox_from_ironic(request_id, node_name=None, netbox_filter=None):
605605
return
606606

607607
# Sync each node to NetBox
608+
failed_devices = []
608609
for node in nodes:
609610
# Adjust message based on whether secondaries are actually being synced
610611
if reachable_secondaries:
@@ -616,26 +617,44 @@ def sync_netbox_from_ironic(request_id, node_name=None, netbox_filter=None):
616617

617618
osism_utils.push_task_output(request_id, sync_msg)
618619

620+
# Track if this device failed to sync
621+
device_failed = False
622+
619623
# Update all three states (each function handles primary + secondary NetBox instances)
620624
# Pass netbox_filter to only update matching NetBox instances
621625
# Pass reachable_secondaries to only use reachable secondary instances
622-
netbox.set_provision_state(
626+
if not netbox.set_provision_state(
623627
node["name"],
624628
node["provision_state"],
625629
netbox_filter=netbox_filter,
626630
secondary_nb_list=reachable_secondaries,
627-
)
628-
netbox.set_power_state(
631+
):
632+
device_failed = True
633+
634+
if not netbox.set_power_state(
629635
node["name"],
630636
node["power_state"],
631637
netbox_filter=netbox_filter,
632638
secondary_nb_list=reachable_secondaries,
633-
)
634-
netbox.set_maintenance(
639+
):
640+
device_failed = True
641+
642+
if not netbox.set_maintenance(
635643
node["name"],
636644
state=node["is_maintenance"],
637645
netbox_filter=netbox_filter,
638646
secondary_nb_list=reachable_secondaries,
647+
):
648+
device_failed = True
649+
650+
if device_failed:
651+
failed_devices.append(node["name"])
652+
653+
# Report failed devices if any
654+
if failed_devices:
655+
osism_utils.push_task_output(
656+
request_id,
657+
f"WARNING: Failed to sync {len(failed_devices)} device(s) due to lock timeout: {', '.join(failed_devices)}\n",
639658
)
640659

641660
osism_utils.finish_task_output(request_id, rc=0)

osism/tasks/netbox.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ def set_maintenance(
127127
Use 'primary' to match the primary NetBox instance.
128128
secondary_nb_list: Optional list of secondary NetBox instances to use.
129129
If not provided, uses utils.secondary_nb_list.
130+
131+
Returns:
132+
bool: True if lock was acquired and operation succeeded, False if lock could not be acquired.
130133
"""
131134
# Check if tasks are locked before execution
132135
utils.check_task_lock_and_exit()
@@ -177,8 +180,10 @@ def set_maintenance(
177180
)
178181
finally:
179182
lock.release()
183+
return True
180184
else:
181185
logger.error(f"Could not acquire lock for node {device_name}")
186+
return False
182187

183188

184189
@app.task(bind=True, name="osism.tasks.netbox.set_provision_state")
@@ -195,6 +200,9 @@ def set_provision_state(
195200
Use 'primary' to match the primary NetBox instance.
196201
secondary_nb_list: Optional list of secondary NetBox instances to use.
197202
If not provided, uses utils.secondary_nb_list.
203+
204+
Returns:
205+
bool: True if lock was acquired and operation succeeded, False if lock could not be acquired.
198206
"""
199207
# Check if tasks are locked before execution
200208
utils.check_task_lock_and_exit()
@@ -245,8 +253,10 @@ def set_provision_state(
245253
)
246254
finally:
247255
lock.release()
256+
return True
248257
else:
249258
logger.error(f"Could not acquire lock for node {device_name}")
259+
return False
250260

251261

252262
@app.task(bind=True, name="osism.tasks.netbox.set_power_state")
@@ -263,6 +273,9 @@ def set_power_state(
263273
Use 'primary' to match the primary NetBox instance.
264274
secondary_nb_list: Optional list of secondary NetBox instances to use.
265275
If not provided, uses utils.secondary_nb_list.
276+
277+
Returns:
278+
bool: True if lock was acquired and operation succeeded, False if lock could not be acquired.
266279
"""
267280
# Convert None to "n/a" for clearer user feedback
268281
if state is None:
@@ -317,8 +330,10 @@ def set_power_state(
317330
)
318331
finally:
319332
lock.release()
333+
return True
320334
else:
321335
logger.error(f"Could not acquire lock for node {device_name}")
336+
return False
322337

323338

324339
@app.task(bind=True, name="osism.tasks.netbox.get_location_id")

0 commit comments

Comments
 (0)