-
-
Notifications
You must be signed in to change notification settings - Fork 5
inc: Add logs to Arroyo to track behavior during paused and empty message states #439
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,7 @@ | |
|
|
||
| METRICS_FREQUENCY_SEC = 1.0 # In seconds | ||
| BACKPRESSURE_THRESHOLD = 5.0 # In seconds | ||
| LOGGING_FREQUENCY_SEC = 180.0 # In seconds | ||
|
|
||
| F = TypeVar("F", bound=Callable[[Any], Any]) | ||
|
|
||
|
|
@@ -169,6 +170,10 @@ def __init__( | |
| DlqPolicyWrapper(dlq_policy) if dlq_policy is not None else None | ||
| ) | ||
|
|
||
| self.__last_run_log_ts = time.time() # This is for throttling the logging of each run loop per-consumer | ||
| self.__last_pause_ts = None | ||
| self.__last_empty_msg_ts = None | ||
|
|
||
| def _close_strategy() -> None: | ||
| start_close = time.time() | ||
|
|
||
|
|
@@ -395,6 +400,10 @@ def _handle_invalid_message(self, exc: InvalidMessage) -> None: | |
| def _run_once(self) -> None: | ||
| self.__metrics_buffer.incr_counter("arroyo.consumer.run.count", 1) | ||
|
|
||
| if time.time() - self.__last_run_log_ts >= LOGGING_FREQUENCY_SEC: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please only calculate we also already have a counter metric for this, what is the purpose of the log?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Per-consumer breakdown |
||
| logger.info("Arroyo consumer _run_once loop started") | ||
| self.__last_run_log_ts = time.time() | ||
|
|
||
| message_carried_over = self.__message is not None | ||
|
|
||
| if not message_carried_over: | ||
|
|
@@ -408,6 +417,26 @@ def _run_once(self) -> None: | |
| self.__metrics_buffer.incr_timing( | ||
| "arroyo.consumer.poll.time", time.time() - start_poll | ||
| ) | ||
|
|
||
| if self.__message is None: | ||
| if not self.__is_paused: | ||
| if self.__last_empty_msg_ts is None: | ||
| self.__last_empty_msg_ts = time.time() | ||
|
|
||
| # Records a log if the consumer has been active but receiving no message from poll() for longer than a threshold duration | ||
| elif time.time() - self.__last_empty_msg_ts >= LOGGING_FREQUENCY_SEC: | ||
| logger.info(f"Consumer is not paused but did not receive a message from underlying consumer for {LOGGING_FREQUENCY_SEC} seconds") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems severe enough to |
||
| self.__last_empty_msg_ts = time.time() | ||
|
|
||
| else: | ||
| if self.__last_pause_ts is None: | ||
| self.__last_pause_ts = time.time() | ||
|
|
||
| # Records a log if the consumer has been paused for longer than a threshold duration | ||
| elif time.time() - self.__last_pause_ts >= LOGGING_FREQUENCY_SEC: | ||
| logger.info(f"Consumer has been paused for {LOGGING_FREQUENCY_SEC} seconds") | ||
| self.__last_pause_ts = time.time() | ||
|
|
||
| except RecoverableError: | ||
| return | ||
|
|
||
|
|
@@ -423,6 +452,9 @@ def _run_once(self) -> None: | |
| "arroyo.consumer.processing.time", time.time() - start_poll | ||
| ) | ||
| if self.__message is not None: | ||
|
|
||
| # Reset the timer | ||
| self.__last_empty_msg_ts = None | ||
| try: | ||
| start_submit = time.time() | ||
| message = ( | ||
|
|
@@ -467,6 +499,9 @@ def _run_once(self) -> None: | |
| paused_partitions, | ||
| ) | ||
| self.__is_paused = False | ||
|
|
||
| # Reset if we unpause | ||
| self.__last_pause_ts = None | ||
| # unpause paused partitions... just in case a subset is paused | ||
| self.__metrics_buffer.incr_counter( | ||
| "arroyo.consumer.resume", 1 | ||
|
|
@@ -489,6 +524,8 @@ def _run_once(self) -> None: | |
| self.__metrics_buffer.incr_counter("arroyo.consumer.resume", 1) | ||
| self.__consumer.resume([*self.__consumer.tell().keys()]) | ||
| self.__is_paused = False | ||
| # Reset the timer since we unpaused | ||
| self.__last_pause_ts = None | ||
|
|
||
| # Clear backpressure timestamp if it is set | ||
| self._clear_backpressure() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can simplify your logic by setting all of these to
0, then you don't have to check forNone