Skip to content

Commit c4b0cac

Browse files
[cuegui/cuebot] Fix UI freeze on gRPC connection drops and add job unavailable notification (#2143)
**Link the Issue(s) this Pull Request is related to.**

- #2144
- #2139

**Summarize your change.**

Cuebot:

- Update `ManageJob.java` to return the `NOT_FOUND` status code instead of `INTERNAL` when job data is not found in the database.

CueGUI:

- Add gRPC error handling for the `NOT_FOUND`, `CANCELLED`, and `UNAVAILABLE` status codes in `FrameMonitorTree`, `LayerMonitorTree`, and `JobMonitorGraph`.
- Add `None` checks in `AbstractGraphWidget.handleSelectObjects()` to prevent an `AttributeError` when selecting nodes that no longer exist.
- Add a `job_not_found` signal to `CueGuiApplication` for cross-widget communication.
- Add a batch notification system in `JobMonitorTree`:
  - Group multiple job-unavailable notifications received within 500 ms into a single dialog.
  - Use a scrollable dialog for large numbers of jobs.
  - Auto-remove unavailable jobs from the monitor list.
  - Prevent removed jobs from being re-added during update cycles.
  - Verify that finished jobs still exist during background updates.
1 parent 01bd30d commit c4b0cac

File tree

7 files changed

+295
-79
lines changed

7 files changed

+295
-79
lines changed

cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java

Lines changed: 68 additions & 68 deletions
Large diffs are not rendered by default.

cuegui/cuegui/AbstractGraphWidget.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ def onNodeSelectionChanged(self):
6262
def handleSelectObjects(self, rpcObjects):
6363
"""Select incoming objects in graph.
6464
"""
65+
if rpcObjects is None:
66+
return
67+
6568
received = [o.name() for o in rpcObjects]
6669
current = [rpcObject.name() for rpcObject in self.selectedObjects()]
6770
if received == current:
@@ -72,7 +75,8 @@ def handleSelectObjects(self, rpcObjects):
7275
node.set_selected(False)
7376
for rpcObject in rpcObjects:
7477
node = self.graph.get_node_by_name(rpcObject.name())
75-
node.set_selected(True)
78+
if node is not None:
79+
node.set_selected(True)
7680

7781
def selectedObjects(self):
7882
"""Return the selected nodes rpcObjects in the graph.

cuegui/cuegui/App.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class CueGuiApplication(QtWidgets.QApplication):
4242
status = QtCore.Signal()
4343
quit = QtCore.Signal()
4444
select_layers = QtCore.Signal(list)
45+
job_not_found = QtCore.Signal(object)
4546

4647
# Thread pool
4748
threadpool = None

cuegui/cuegui/FrameMonitorTree.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import opencue
3939
from opencue_proto import job_pb2
4040

41+
import cuegui
4142
import cuegui.AbstractTreeWidget
4243
import cuegui.AbstractWidgetItem
4344
import cuegui.Constants
@@ -501,9 +502,16 @@ def _getUpdate(self):
501502
except grpc.RpcError as e:
502503
# Handle gRPC errors - log but don't crash, allow UI to retry
503504
# pylint: disable=no-member
504-
if hasattr(e, 'code') and e.code() in [grpc.StatusCode.CANCELLED,
505-
grpc.StatusCode.UNAVAILABLE]:
506-
logger.warning("gRPC connection interrupted during frame update, will retry")
505+
if hasattr(e, 'code'):
506+
if e.code() == grpc.StatusCode.NOT_FOUND:
507+
logger.info("Job not found, notifying and clearing job from view")
508+
cuegui.app().job_not_found.emit(self.__job)
509+
self.setJob(None)
510+
return []
511+
if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]:
512+
logger.warning("gRPC connection interrupted during frame update, will retry")
513+
else:
514+
logger.error("gRPC error in _getUpdate: %s", e)
507515
else:
508516
logger.error("gRPC error in _getUpdate: %s", e)
509517
# pylint: enable=no-member
@@ -539,13 +547,16 @@ def _getUpdateChanged(self):
539547
return None
540548
if e.code() == grpc.StatusCode.NOT_FOUND:
541549
# Job was deleted
542-
logger.info("Job not found, clearing job from view")
550+
logger.info("Job not found, notifying and clearing job from view")
551+
cuegui.app().job_not_found.emit(self.__job)
543552
self.setJob(None)
544553
return []
545554
logger.error("gRPC error in _getUpdateChanged: %s", e)
546555
# pylint: enable=no-member
547556
return None
548557
except opencue.EntityNotFoundException:
558+
logger.info("Job entity not found, notifying and clearing job from view")
559+
cuegui.app().job_not_found.emit(self.__job)
549560
self.setJob(None)
550561
except opencue.exception.CueException as e:
551562
# pylint: disable=no-member

cuegui/cuegui/JobMonitorGraph.py

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,21 @@
1616
"""Node graph to display Layers of a Job"""
1717

1818

19+
import grpc
20+
1921
from qtpy import QtWidgets
2022

23+
from opencue.exception import EntityNotFoundException
24+
25+
import cuegui
26+
import cuegui.Logger
2127
import cuegui.Utils
2228
import cuegui.MenuActions
2329
from cuegui.nodegraph import CueLayerNode
2430
from cuegui.AbstractGraphWidget import AbstractGraphWidget
2531

32+
logger = cuegui.Logger.getLogger(__file__)
33+
2634

2735
class JobMonitorGraph(AbstractGraphWidget):
2836
"""Graph widget to display connections of layers in a job"""
@@ -111,7 +119,30 @@ def createGraph(self):
111119
if not self.job:
112120
return
113121

114-
layers = self.job.getLayers()
122+
try:
123+
layers = self.job.getLayers()
124+
except EntityNotFoundException:
125+
logger.info("Job not found, notifying and clearing job from view")
126+
cuegui.app().job_not_found.emit(self.job)
127+
self.setJob(None)
128+
return
129+
except grpc.RpcError as e:
130+
# pylint: disable=no-member
131+
if hasattr(e, 'code'):
132+
if e.code() == grpc.StatusCode.NOT_FOUND:
133+
logger.info("Job not found, notifying and clearing job from view")
134+
cuegui.app().job_not_found.emit(self.job)
135+
self.setJob(None)
136+
return
137+
if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]:
138+
logger.warning(
139+
"gRPC connection interrupted during graph creation, will retry")
140+
else:
141+
logger.error("gRPC error in createGraph: %s", e)
142+
else:
143+
logger.error("gRPC error in createGraph: %s", e)
144+
# pylint: enable=no-member
145+
return
115146

116147
# add job layers to tree
117148
for layer in layers:
@@ -128,7 +159,12 @@ def createGraph(self):
128159
def setupNodeConnections(self):
129160
"""Setup connections between nodes based on their dependencies"""
130161
for node in self.graph.all_nodes():
131-
for depend in node.rpcObject.getWhatDependsOnThis():
162+
try:
163+
depends = node.rpcObject.getWhatDependsOnThis()
164+
except (EntityNotFoundException, grpc.RpcError) as e:
165+
logger.warning("Failed to get dependencies for node %s: %s", node.name(), e)
166+
continue
167+
for depend in depends:
132168
child_node = self.graph.get_node_by_name(depend.dependErLayer())
133169
if child_node:
134170
# todo check if connection exists
@@ -146,7 +182,32 @@ def update(self):
146182
This is run every 20 seconds by the timer.
147183
"""
148184
if self.job is not None:
149-
layers = self.job.getLayers()
185+
try:
186+
layers = self.job.getLayers()
187+
except EntityNotFoundException:
188+
logger.info("Job not found during update, notifying and clearing job from view")
189+
cuegui.app().job_not_found.emit(self.job)
190+
self.setJob(None)
191+
return
192+
except grpc.RpcError as e:
193+
# pylint: disable=no-member
194+
if hasattr(e, 'code'):
195+
if e.code() == grpc.StatusCode.NOT_FOUND:
196+
logger.info("Job not found during update, notifying and clearing job")
197+
cuegui.app().job_not_found.emit(self.job)
198+
self.setJob(None)
199+
return
200+
if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]:
201+
logger.warning(
202+
"gRPC connection interrupted during graph update, will retry")
203+
else:
204+
logger.error("gRPC error in update: %s", e)
205+
else:
206+
logger.error("gRPC error in update: %s", e)
207+
# pylint: enable=no-member
208+
return
209+
150210
for layer in layers:
151211
node = self.graph.get_node_by_name(layer.name())
152-
node.setRpcObject(layer)
212+
if node is not None:
213+
node.setRpcObject(layer)

cuegui/cuegui/JobMonitorTree.py

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,16 @@ def __init__(self, parent):
202202
# pylint: disable=no-member
203203
self.itemClicked.connect(self.__itemSingleClickedCopy)
204204
self.itemClicked.connect(self.__itemSingleClickedComment)
205+
self.app.job_not_found.connect(self.__handleJobNotFound)
205206
# pylint: enable=no-member
206207

208+
# Track jobs that have been notified as not found to avoid duplicate popups
209+
self.__notifiedJobsNotFound = set()
210+
211+
# Batch job-not-found notifications to show in a single dialog
212+
self.__pendingNotFoundJobs = []
213+
self.__notFoundTimer = None
214+
207215
self.__load = {}
208216
self.startTicksUpdate(20, False, 60)
209217

@@ -259,6 +267,101 @@ def __itemSingleClickedComment(self, item, col):
259267
if col == COLUMN_COMMENT and job.isCommented():
260268
self.__menuActions.jobs().viewComments([job])
261269

270+
def __handleJobNotFound(self, job):
271+
"""Handle a job not found signal by batching notifications and removing the job.
272+
@type job: opencue.wrappers.job.Job
273+
@param job: The job that was not found"""
274+
if job is None:
275+
return
276+
277+
jobKey = cuegui.Utils.getObjectKey(job)
278+
279+
# Avoid showing duplicate popups for the same job
280+
if jobKey in self.__notifiedJobsNotFound:
281+
return
282+
283+
self.__notifiedJobsNotFound.add(jobKey)
284+
285+
# Find and remove the job from the list
286+
if jobKey in self._items:
287+
item = self._items[jobKey]
288+
self.removeItem(item)
289+
290+
# Add job name to pending list for batched notification
291+
jobName = job.data.name if hasattr(job, 'data') else str(job)
292+
self.__pendingNotFoundJobs.append(jobName)
293+
294+
# Start or restart the timer to batch multiple notifications
295+
if self.__notFoundTimer is None:
296+
self.__notFoundTimer = QtCore.QTimer(self)
297+
self.__notFoundTimer.setSingleShot(True)
298+
self.__notFoundTimer.timeout.connect(self.__showBatchedJobNotFoundDialog)
299+
self.__notFoundTimer.start(500) # 500ms delay to collect multiple notifications
300+
301+
# Clean up the notification tracking after a delay to allow re-notification
302+
# if the user adds the same job again later
303+
QtCore.QTimer.singleShot(5000, lambda: self.__notifiedJobsNotFound.discard(jobKey))
304+
305+
def __showBatchedJobNotFoundDialog(self):
306+
"""Show a single dialog for all pending job-not-found notifications."""
307+
if not self.__pendingNotFoundJobs:
308+
return
309+
310+
jobNames = self.__pendingNotFoundJobs[:]
311+
self.__pendingNotFoundJobs.clear()
312+
313+
if len(jobNames) == 1:
314+
message = (
315+
f"The job '{jobNames[0]}' is no longer available.\n\n"
316+
"The job has been moved to historical data and is no longer "
317+
"accessible through the live job interface.\n\n"
318+
"The job has been removed from the monitor list."
319+
)
320+
QtWidgets.QMessageBox.warning(
321+
self,
322+
"Job No Longer Available",
323+
message
324+
)
325+
else:
326+
# Use a custom dialog with scrollable area for many jobs
327+
dialog = QtWidgets.QDialog(self)
328+
dialog.setWindowTitle("Jobs No Longer Available")
329+
dialog.setMinimumWidth(500)
330+
dialog.setMaximumHeight(600)
331+
332+
layout = QtWidgets.QVBoxLayout(dialog)
333+
334+
# Header label
335+
headerLabel = QtWidgets.QLabel(
336+
f"The following {len(jobNames)} jobs are no longer available:"
337+
)
338+
layout.addWidget(headerLabel)
339+
340+
# Scrollable job list
341+
jobList = "\n".join(f" • {name}" for name in sorted(jobNames))
342+
textEdit = QtWidgets.QTextEdit()
343+
textEdit.setPlainText(jobList)
344+
textEdit.setReadOnly(True)
345+
textEdit.setMinimumHeight(150)
346+
textEdit.setMaximumHeight(350)
347+
layout.addWidget(textEdit)
348+
349+
# Footer message
350+
footerLabel = QtWidgets.QLabel(
351+
"These jobs have been moved to historical data and are no longer "
352+
"accessible through the live job interface.\n\n"
353+
"The jobs have been removed from the monitor list."
354+
)
355+
footerLabel.setWordWrap(True)
356+
layout.addWidget(footerLabel)
357+
358+
# OK button
359+
buttonBox = QtWidgets.QDialogButtonBox(QtWidgets.QDialogButtonBox.Ok)
360+
buttonBox.accepted.connect(dialog.accept)
361+
layout.addWidget(buttonBox)
362+
363+
dialog.exec_()
364+
262365
def startDrag(self, dropActions):
263366
"""Triggers a drag event"""
264367
cuegui.Utils.startDrag(self, dropActions, self.selectedObjects())
@@ -315,6 +418,9 @@ def addJob(self, job, timestamp=None, loading_from_config=False):
315418
try:
316419
if newJobObj:
317420
jobKey = cuegui.Utils.getObjectKey(newJobObj)
421+
# Skip jobs that were recently marked as not found
422+
if jobKey in self.__notifiedJobsNotFound:
423+
return
318424
if self.__groupByMode == "Clear":
319425
self.__load[jobKey] = newJobObj
320426
self.__jobTimeLoaded[jobKey] = timestamp if timestamp else time.time()
@@ -749,14 +855,16 @@ def _getUpdate(self):
749855
@rtype: dict<class.id: job>"""
750856
try:
751857
jobs = {}
858+
finished_jobs = {} # Track finished jobs to verify they still exist
752859

753860
# TODO: When getJobs is fixed to allow MatchAny, this can be updated to use one call
754861
monitored_proxies = []
755862
for item in list(self._items.values()):
756863
objectKey = cuegui.Utils.getObjectKey(item.rpcObject)
757864
if item.rpcObject.data.state == opencue.api.job_pb2.FINISHED:
758-
# Reuse the old object if job is finished
759-
jobs[objectKey] = item.rpcObject
865+
# Track finished jobs - verify they still exist
866+
finished_jobs[objectKey] = item.rpcObject
867+
monitored_proxies.append(objectKey)
760868
else:
761869
# Gather list of all other jobs to update
762870
monitored_proxies.append(objectKey)
@@ -793,6 +901,12 @@ def _getUpdate(self):
793901
objectKey = cuegui.Utils.getObjectKey(job)
794902
jobs[objectKey] = job
795903

904+
# Check for finished jobs that no longer exist (archived/deleted)
905+
for objectKey, job in finished_jobs.items():
906+
if objectKey not in jobs:
907+
# Job no longer exists - emit signal for batch notification
908+
self.app.job_not_found.emit(job)
909+
796910
except opencue.exception.CueException as e:
797911
list(map(logger.warning, cuegui.Utils.exceptionOutput(e)))
798912
return None
@@ -839,6 +953,9 @@ def _processUpdate(self, work, rpcObjects):
839953
self.clear()
840954

841955
for proxy, job in iteritems(rpcObjects):
956+
# Skip jobs that were recently marked as not found
957+
if proxy in self.__notifiedJobsNotFound:
958+
continue
842959
# Handle different grouping modes
843960
if self.__groupByMode == "Clear":
844961
# No grouping - flat list

cuegui/cuegui/LayerMonitorTree.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from qtpy import QtGui
2727
from qtpy import QtWidgets
2828

29+
import grpc
30+
2931
from opencue.exception import EntityNotFoundException
3032
from opencue.api import job_pb2
3133

@@ -242,8 +244,28 @@ def _getUpdate(self):
242244
try:
243245
return self.__job.getLayers()
244246
except EntityNotFoundException:
247+
logger.info("Job not found, notifying and clearing job from view")
248+
cuegui.app().job_not_found.emit(self.__job)
245249
self.setJob(None)
246250
return []
251+
except grpc.RpcError as e:
252+
# Handle gRPC errors - log but don't crash, allow UI to retry
253+
# pylint: disable=no-member
254+
if hasattr(e, 'code'):
255+
if e.code() == grpc.StatusCode.NOT_FOUND:
256+
logger.info("Job not found, notifying and clearing job from view")
257+
cuegui.app().job_not_found.emit(self.__job)
258+
self.setJob(None)
259+
return []
260+
if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]:
261+
logger.warning(
262+
"gRPC connection interrupted during layer update, will retry")
263+
else:
264+
logger.error("gRPC error in _getUpdate: %s", e)
265+
else:
266+
logger.error("gRPC error in _getUpdate: %s", e)
267+
# pylint: enable=no-member
268+
return []
247269
return []
248270

249271
def contextMenuEvent(self, e):

0 commit comments

Comments
 (0)