From f81781b3d32a3eac929062d0d4c4abbe875d71e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=B5?= Date: Mon, 12 Aug 2024 23:10:19 +0000 Subject: [PATCH] miner.py: implement checks and error handling for failing axons The miner template seemingly assumes that starting an axon never fails. In e2e tests the axon failed to start, due to mixing nest_asyncio in bittensor and regular asyncio in uvicorn. It would be better to terminate the miner process when the axon never starts at all. This patch addresses this by: - wrapping run() in a try/except (this is a must in any Python threading application) - signalling exceptions from worker to main thread in a thread-safe manner - terminating the miner if starting the axon fails - monitoring and reporting on whether the axon still runs Whether to keep the miner running if axon issues arise later is another question; the code as-is indicates this is intended behavior: "# In case of unforeseen errors, the miner will log the error and continue operations." so this is not changed. This patch depends on another patch to bittensor that adds .is_running() and .exception to class axon. --- neurons/miner.py | 4 ++-- template/base/miner.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/neurons/miner.py b/neurons/miner.py index 5f7b9500..0f119585 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -164,6 +164,6 @@ async def priority(self, synapse: template.protocol.Dummy) -> float: # This is the main function, which runs the miner. if __name__ == "__main__": with Miner() as miner: - while True: + while miner.is_running: bt.logging.info(f"Miner running... 
{time.time()}") - time.sleep(5) + time.sleep(1) diff --git a/template/base/miner.py b/template/base/miner.py index 1788e24b..e58d6f8a 100644 --- a/template/base/miner.py +++ b/template/base/miner.py @@ -69,8 +69,30 @@ def __init__(self, config=None): self.is_running: bool = False self.thread: Union[threading.Thread, None] = None self.lock = asyncio.Lock() + self.thread_lock = threading.Lock() + self.exception = None + + def get_exception(self): + with self.thread_lock: + return self.exception + + def set_exception(self, ex): + with self.thread_lock: + self.exception = ex + self.is_running = False def run(self): + """ + Entrypoint of worker thread. Provides try/except in order to prevent uncaught exceptions. + """ + try: + self.run_unsafe() + except Exception as e: + bt.logging.error("Exception caught in worker thread") + bt.logging.error(traceback.format_exc()) + self.set_exception(e) + + def run_unsafe(self): """ Initiates and manages the main loop for the miner on the Bittensor network. The main loop handles graceful shutdown on keyboard interrupts and logs unforeseen errors. @@ -105,6 +127,14 @@ def run(self): # Start starts the miner's axon, making it active on the network. self.axon.start() + t0 = time.time() + while time.time() - t0 < 1 and not self.axon.is_running(): + time.sleep(0.1) + if not self.axon.is_running(): + e = self.axon.exception + if e: + raise e + raise Exception("Failed to start axon for unknown reason") bt.logging.info(f"Miner starting at block: {self.block}") @@ -115,6 +145,16 @@ def run(self): self.block - self.metagraph.last_update[self.uid] < self.config.neuron.epoch_length ): + if not self.axon.is_running(): + # we may be faster than the exception is being set + ts = time.time() + while time.time() - ts < 3 and not self.axon.exception: + time.sleep(0.1) + e = self.axon.exception + if e: + raise e + else: + raise Exception("axon died without exception") # Wait before checking again. time.sleep(1)