Skip to content

Commit 1be2602

Browse files
[cuegui] Handle gRPC CANCELLED errors to fix UI refresh issues (AcademySoftwareFoundation#2042)
**Link the Issue(s) this Pull Request is related to.**
- AcademySoftwareFoundation#2041

**Summarize your change.**
Fix UI refresh failures caused by unhandled gRPC CANCELLED and UNAVAILABLE errors during connection interruptions. Previously, the UI would stop updating after performing actions like "eat and mark done" or "retry", and a restart was required.

Changes:
- Add a CancelledException class and map StatusCode.CANCELLED to that exception.
- Update ThreadPool to handle gRPC connection errors gracefully, logging a warning instead of an error, so that the next update cycle retries automatically.
- Add gRPC error handling to FrameMonitorTree._getUpdate() and _getUpdateChanged() to recover from transient connection issues.
- Return None on CANCELLED/UNAVAILABLE to trigger a full update on retry.
- Return an empty list on NOT_FOUND to clear deleted jobs from the view.

This allows the UI to recover automatically from transient network or server issues without stopping background updates or requiring a manual restart.

---------

Signed-off-by: Ramon Figueiredo <ramon.fgrd@gmail.com>
1 parent 5a57381 commit 1be2602

File tree

3 files changed

+49
-2
lines changed

3 files changed

+49
-2
lines changed

cuegui/cuegui/FrameMonitorTree.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from qtpy import QtCore
3434
from qtpy import QtGui
3535
from qtpy import QtWidgets
36+
import grpc
3637

3738
import opencue
3839
from opencue_proto import job_pb2
@@ -484,8 +485,19 @@ def _getUpdate(self):
484485
self.__lastUpdateTime = int(time.time())
485486
return self.__job.getFrames(**self.frameSearch.options)
486487
return []
488+
except grpc.RpcError as e:
489+
# Handle gRPC errors - log but don't crash, allow UI to retry
490+
# pylint: disable=no-member
491+
if hasattr(e, 'code') and e.code() in [grpc.StatusCode.CANCELLED,
492+
grpc.StatusCode.UNAVAILABLE]:
493+
logger.warning("gRPC connection interrupted during frame update, will retry")
494+
else:
495+
logger.error("gRPC error in _getUpdate: %s", e)
496+
# pylint: enable=no-member
497+
return []
487498
except opencue.exception.CueException as e:
488499
list(map(logger.warning, cuegui.Utils.exceptionOutput(e)))
500+
return []
489501

490502
def _getUpdateChanged(self):
491503
"""Returns the updated data from the cuebot
@@ -504,8 +516,24 @@ def _getUpdateChanged(self):
504516
self.__jobState = updated_data.state
505517
updatedFrames = updated_data.updated_frames.updated_frames
506518

519+
except grpc.RpcError as e:
520+
# Handle gRPC errors - allow UI to continue and retry
521+
# pylint: disable=no-member
522+
if hasattr(e, 'code'):
523+
if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]:
524+
logger.warning("gRPC connection interrupted during frame update, will retry")
525+
# Return None to trigger a full update on next cycle
526+
return None
527+
if e.code() == grpc.StatusCode.NOT_FOUND:
528+
# Job was deleted
529+
logger.info("Job not found, clearing job from view")
530+
self.setJob(None)
531+
return []
532+
logger.error("gRPC error in _getUpdateChanged: %s", e)
533+
# pylint: enable=no-member
534+
return None
507535
except opencue.EntityNotFoundException:
508-
self.setJobObj(None)
536+
self.setJob(None)
509537
except opencue.exception.CueException as e:
510538
# pylint: disable=no-member
511539
if hasattr(e, "message") and 'timestamp cannot be over a minute off' in e.message:

cuegui/cuegui/ThreadPool.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def someWorkCallback(work, result):
5050
import os
5151

5252
from qtpy import QtCore
53+
import grpc
5354

5455
import cuegui.Logger
5556

@@ -213,6 +214,16 @@ def run(self):
213214
if work[1]:
214215
self.workComplete.emit(work, result)
215216
del result
217+
except grpc.RpcError as e:
218+
# Handle gRPC errors gracefully - these are often transient
219+
# pylint: disable=no-member
220+
if hasattr(e, 'code') and e.code() in [grpc.StatusCode.CANCELLED,
221+
grpc.StatusCode.UNAVAILABLE]:
222+
logger.warning("gRPC connection issue for '%s': %s - "
223+
"UI will retry on next update", work[2], e.details())
224+
else:
225+
logger.error("gRPC error processing work for '%s': %s", work[2], e)
226+
# pylint: enable=no-member
216227
except TypeError as e:
217228
logger.error("TypeError in work processing for '%s': %s", work[2], e)
218229
except ValueError as e:

pycue/opencue/exception.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@ class ConnectionException(CueException):
6262
retryable = True
6363

6464

65+
class CancelledException(CueException):
66+
"""Raised when the RPC call was cancelled."""
67+
failMsg = 'RPC call was cancelled. {details}'
68+
retryMsg = 'RPC call was cancelled, retrying...'
69+
retryable = True
70+
71+
6572
def getRetryCount():
6673
"""Return the configured number of retries a cuebot call can make.
6774
If not specified in the config, all retryable calls will be called once and retried 3 times."""
@@ -73,5 +80,6 @@ def getRetryCount():
7380
grpc.StatusCode.ALREADY_EXISTS: EntityAlreadyExistsException,
7481
grpc.StatusCode.DEADLINE_EXCEEDED: DeadlineExceededException,
7582
grpc.StatusCode.INTERNAL: CueInternalErrorException,
76-
grpc.StatusCode.UNAVAILABLE: ConnectionException
83+
grpc.StatusCode.UNAVAILABLE: ConnectionException,
84+
grpc.StatusCode.CANCELLED: CancelledException
7785
}

0 commit comments

Comments
 (0)